Merge bitcoin/bitcoin#27866: blockstorage: Return on fatal flush errors

d8041d4e04 blockstorage: Return on fatal undo file flush error (TheCharlatan)
f0207e0030 blockstorage: Return on fatal block file flush error (TheCharlatan)
5671c15f45 blockstorage: Mark FindBlockPos as nodiscard (TheCharlatan)

Pull request description:

  The goal of this PR is to establish that fatal blockstorage flush errors should be treated as errors at their call site.

  Prior to this patch `FlushBlockFile` may have failed without returning in `Chainstate::FlushStateToDisk`, leading to a potential write from `WriteBlockIndexDB` that may refer to a block that is not fully flushed to disk yet. By returning if either `FlushUndoFile` or `FlushBlockFile` fail, we ensure that no further write operations take place that may lead to an inconsistent database when crashing. Add `[[nodiscard]]` annotations to them such that they are not ignored in future.

  Functions that call either `FlushUndoFile` or `FlushBlockFile`, need to handle these extra abort cases properly. Since `Chainstate::FlushStateToDisk` already produces an abort error in case of `WriteBlockIndexDB` failing, no extra logic for functions calling `Chainstate::FlushStateToDisk` is required.

  Besides `Chainstate::FlushStateToDisk`, `FlushBlockFile` is also called by `FindBlockPos`, while `FlushUndoFile` is only called by `FlushBlockFile` and `WriteUndoDataForBlock`. For both these cases, the flush error is not further bubbled up. Instead, the error is logged and a comment is provided why bubbling up an error would be less desirable in these cases.

  ---

  This pull request is part of a larger effort towards improving the shutdown / abort / fatal error handling in validation code. It is a first step towards implementing proper fatal error return type enforcement similar as proposed by theuni in this pull request [comment](https://github.com/bitcoin/bitcoin/pull/27711#issuecomment-1563561502). For ease of review of these critical changes, a first step would be checking that `AbortNode` leads to early and error-conveying returns at its call site. Further work for enforcing returns when `AbortNode` is called is done in https://github.com/bitcoin/bitcoin/pull/27862.

ACKs for top commit:
  stickies-v:
    re-ACK d8041d4
  ryanofsky:
    Code review ACK d8041d4e04

Tree-SHA512: 47ade9b873b15e567c8f60ca538d5a0daf32163e1031be3212a3a45eb492b866664b225f2787c9e40f3e0c089140157d8fd1039abc00c7bdfeec1b52ecd7e219
This commit is contained in:
Ryan Ofsky 2023-09-29 12:50:15 -04:00
commit f562856d02
No known key found for this signature in database
GPG key ID: 46800E30FC748A66
3 changed files with 47 additions and 10 deletions

View file

@ -651,16 +651,19 @@ bool BlockManager::UndoReadFromDisk(CBlockUndo& blockundo, const CBlockIndex& in
return true;
}
void BlockManager::FlushUndoFile(int block_file, bool finalize)
bool BlockManager::FlushUndoFile(int block_file, bool finalize)
{
FlatFilePos undo_pos_old(block_file, m_blockfile_info[block_file].nUndoSize);
if (!UndoFileSeq().Flush(undo_pos_old, finalize)) {
m_opts.notifications.flushError("Flushing undo file to disk failed. This is likely the result of an I/O error.");
return false;
}
return true;
}
void BlockManager::FlushBlockFile(bool fFinalize, bool finalize_undo)
bool BlockManager::FlushBlockFile(bool fFinalize, bool finalize_undo)
{
bool success = true;
LOCK(cs_LastBlockFile);
if (m_blockfile_info.size() < 1) {
@ -668,17 +671,23 @@ void BlockManager::FlushBlockFile(bool fFinalize, bool finalize_undo)
// chainstate init, when we call ChainstateManager::MaybeRebalanceCaches() (which
// then calls FlushStateToDisk()), resulting in a call to this function before we
// have populated `m_blockfile_info` via LoadBlockIndexDB().
return;
return true;
}
assert(static_cast<int>(m_blockfile_info.size()) > m_last_blockfile);
FlatFilePos block_pos_old(m_last_blockfile, m_blockfile_info[m_last_blockfile].nSize);
if (!BlockFileSeq().Flush(block_pos_old, fFinalize)) {
m_opts.notifications.flushError("Flushing block file to disk failed. This is likely the result of an I/O error.");
success = false;
}
// we do not always flush the undo file, as the chain tip may be lagging behind the incoming blocks,
// e.g. during IBD or a sync after a node going offline
if (!fFinalize || finalize_undo) FlushUndoFile(m_last_blockfile, finalize_undo);
if (!fFinalize || finalize_undo) {
if (!FlushUndoFile(m_last_blockfile, finalize_undo)) {
success = false;
}
}
return success;
}
uint64_t BlockManager::CalculateCurrentUsage()
@ -771,7 +780,19 @@ bool BlockManager::FindBlockPos(FlatFilePos& pos, unsigned int nAddSize, unsigne
if (!fKnown) {
LogPrint(BCLog::BLOCKSTORAGE, "Leaving block file %i: %s\n", m_last_blockfile, m_blockfile_info[m_last_blockfile].ToString());
}
FlushBlockFile(!fKnown, finalize_undo);
// Do not propagate the return code. The flush concerns a previous block
// and undo file that has already been written to. If a flush fails
// here, and we crash, there is no expected additional block data
// inconsistency arising from the flush failure here. However, the undo
// data may be inconsistent after a crash if the flush is called during
// a reindex. A flush error might also leave some of the data files
// untrimmed.
if (!FlushBlockFile(!fKnown, finalize_undo)) {
LogPrintLevel(BCLog::BLOCKSTORAGE, BCLog::Level::Warning,
"Failed to flush previous block file %05i (finalize=%i, finalize_undo=%i) before opening new block file %05i\n",
m_last_blockfile, !fKnown, finalize_undo, nFile);
}
m_last_blockfile = nFile;
m_undo_height_in_last_blockfile = 0; // No undo data yet in the new file, so reset our undo-height tracking.
}
@ -862,7 +883,14 @@ bool BlockManager::WriteUndoDataForBlock(const CBlockUndo& blockundo, BlockValid
// with the block writes (usually when a synced up node is getting newly mined blocks) -- this case is caught in
// the FindBlockPos function
if (_pos.nFile < m_last_blockfile && static_cast<uint32_t>(block.nHeight) == m_blockfile_info[_pos.nFile].nHeightLast) {
FlushUndoFile(_pos.nFile, true);
// Do not propagate the return code, a failed flush here should not
// be an indication for a failed write. If it were propagated here,
// the caller would assume the undo data not to be written, when in
// fact it is. Note though, that a failed flush might leave the data
// file untrimmed.
if (!FlushUndoFile(_pos.nFile, true)) {
LogPrintLevel(BCLog::BLOCKSTORAGE, BCLog::Level::Warning, "Failed to flush undo file %05i\n", _pos.nFile);
}
} else if (_pos.nFile == m_last_blockfile && static_cast<uint32_t>(block.nHeight) > m_undo_height_in_last_blockfile) {
m_undo_height_in_last_blockfile = block.nHeight;
}

View file

@ -119,9 +119,14 @@ private:
*/
bool LoadBlockIndex()
EXCLUSIVE_LOCKS_REQUIRED(cs_main);
void FlushBlockFile(bool fFinalize = false, bool finalize_undo = false);
void FlushUndoFile(int block_file, bool finalize = false);
bool FindBlockPos(FlatFilePos& pos, unsigned int nAddSize, unsigned int nHeight, uint64_t nTime, bool fKnown);
/** Return false if block file or undo file flushing fails. */
[[nodiscard]] bool FlushBlockFile(bool fFinalize = false, bool finalize_undo = false);
/** Return false if undo file flushing fails. */
[[nodiscard]] bool FlushUndoFile(int block_file, bool finalize = false);
[[nodiscard]] bool FindBlockPos(FlatFilePos& pos, unsigned int nAddSize, unsigned int nHeight, uint64_t nTime, bool fKnown);
bool FindUndoPos(BlockValidationState& state, int nFile, FlatFilePos& pos, unsigned int nAddSize);
FlatFileSeq BlockFileSeq() const;

View file

@ -2594,7 +2594,11 @@ bool Chainstate::FlushStateToDisk(
LOG_TIME_MILLIS_WITH_CATEGORY("write block and undo data to disk", BCLog::BENCH);
// First make sure all block and undo data is flushed to disk.
m_blockman.FlushBlockFile();
// TODO: Handle return error, or add detailed comment why it is
// safe to not return an error upon failure.
if (!m_blockman.FlushBlockFile()) {
LogPrintLevel(BCLog::VALIDATION, BCLog::Level::Warning, "%s: Failed to flush block file.\n", __func__);
}
}
// Then update all block file information (which may refer to block and undo files).