validation: add ChainMan logic for completing UTXO snapshot validation

Trigger completion when a background validation chainstate reaches the
same height as a UTXO snapshot, and handle cleaning up the chainstate
on subsequent startup.
This commit is contained in:
James O'Beirne 2022-04-28 10:23:33 -04:00
parent f2a4f3376f
commit d96c59cc5c
3 changed files with 421 additions and 3 deletions

View file

@ -85,6 +85,9 @@ static ChainstateLoadResult CompleteChainstateInitialization(
return options.reindex || options.reindex_chainstate || chainstate->CoinsTip().GetBestBlock().IsNull();
};
assert(chainman.m_total_coinstip_cache > 0);
assert(chainman.m_total_coinsdb_cache > 0);
// Conservative value which is arbitrarily chosen, as it will ultimately be changed
// by a call to `chainman.MaybeRebalanceCaches()`. We just need to make sure
// that the sum of the two caches (40%) does not exceed the allowable amount
@ -183,6 +186,47 @@ ChainstateLoadResult LoadChainstate(ChainstateManager& chainman, const CacheSize
return {init_status, init_error};
}
// If a snapshot chainstate was fully validated by a background chainstate during
// the last run, detect it here and clean up the now-unneeded background
// chainstate.
//
// Why is this cleanup done here (on subsequent restart) and not just when the
// snapshot is actually validated? Because this entails unusual
// filesystem operations to move leveldb data directories around, and that seems
// too risky to do in the middle of normal runtime.
auto snapshot_completion = chainman.MaybeCompleteSnapshotValidation();
if (snapshot_completion == SnapshotCompletionResult::SKIPPED) {
// do nothing; expected case
} else if (snapshot_completion == SnapshotCompletionResult::SUCCESS) {
LogPrintf("[snapshot] cleaning up unneeded background chainstate, then reinitializing\n");
if (!chainman.ValidatedSnapshotCleanup()) {
AbortNode("Background chainstate cleanup failed unexpectedly.");
}
// Because ValidatedSnapshotCleanup() has torn down chainstates with
// ChainstateManager::ResetChainstates(), reinitialize them here without
// duplicating the blockindex work above.
assert(chainman.GetAll().empty());
assert(!chainman.IsSnapshotActive());
assert(!chainman.IsSnapshotValidated());
chainman.InitializeChainstate(options.mempool);
// A reload of the block index is required to recompute setBlockIndexCandidates
// for the fully validated chainstate.
chainman.ActiveChainstate().UnloadBlockIndex();
auto [init_status, init_error] = CompleteChainstateInitialization(chainman, cache_sizes, options);
if (init_status != ChainstateLoadStatus::SUCCESS) {
return {init_status, init_error};
}
} else {
return {ChainstateLoadStatus::FAILURE, _(
"UTXO snapshot failed to validate. "
"Restart to resume normal initial block download, or try loading a different snapshot.")};
}
return {ChainstateLoadStatus::SUCCESS, {}};
}

View file

@ -2845,6 +2845,14 @@ bool Chainstate::ConnectTip(BlockValidationState& state, CBlockIndex* pindexNew,
Ticks<SecondsDouble>(time_total),
Ticks<MillisecondsDouble>(time_total) / num_blocks_total);
// If we are the background validation chainstate, check to see if we are done
// validating the snapshot (i.e. our tip has reached the snapshot's base block).
if (this != &m_chainman.ActiveChainstate()) {
// This call may set `m_disabled`, which is referenced immediately afterwards in
// ActivateBestChain, so that we stop connecting blocks past the snapshot base.
m_chainman.MaybeCompleteSnapshotValidation();
}
connectTrace.BlockConnected(pindexNew, std::move(pthisBlock));
return true;
}
@ -3067,6 +3075,14 @@ bool Chainstate::ActivateBestChain(BlockValidationState& state, std::shared_ptr<
// we use m_chainstate_mutex to enforce mutual exclusion so that only one caller may execute this function at a time
LOCK(m_chainstate_mutex);
// Belt-and-suspenders check that we aren't attempting to advance the background
// chainstate past the snapshot base block.
if (WITH_LOCK(::cs_main, return m_disabled)) {
LogPrintf("m_disabled is set - this chainstate should not be in operation. " /* Continued */
"Please report this as a bug. %s\n", PACKAGE_BUGREPORT);
return false;
}
CBlockIndex *pindexMostWork = nullptr;
CBlockIndex *pindexNewTip = nullptr;
int nStopAtHeight = gArgs.GetIntArg("-stopatheight", DEFAULT_STOPATHEIGHT);
@ -3117,6 +3133,15 @@ bool Chainstate::ActivateBestChain(BlockValidationState& state, std::shared_ptr<
assert(trace.pblock && trace.pindex);
GetMainSignals().BlockConnected(trace.pblock, trace.pindex);
}
// This will have been toggled in
// ActivateBestChainStep -> ConnectTip -> MaybeCompleteSnapshotValidation,
// if at all, so we should catch it here.
//
// Break this do-while to ensure we don't advance past the base snapshot.
if (m_disabled) {
break;
}
} while (!m_chain.Tip() || (starting_tip && CBlockIndexWorkComparator()(m_chain.Tip(), starting_tip)));
if (!blocks_connected) return true;
@ -3137,6 +3162,11 @@ bool Chainstate::ActivateBestChain(BlockValidationState& state, std::shared_ptr<
if (nStopAtHeight && pindexNewTip && pindexNewTip->nHeight >= nStopAtHeight) StartShutdown();
if (WITH_LOCK(::cs_main, return m_disabled)) {
// Background chainstate has reached the snapshot base block, so exit.
break;
}
// We check shutdown only after giving ActivateBestChainStep a chance to run once so that we
// never shutdown before connecting the genesis block during LoadChainTip(). Previously this
// caused an assert() failure during shutdown in such cases as the UTXO DB flushing checks
@ -5046,6 +5076,19 @@ static void FlushSnapshotToDisk(CCoinsViewCache& coins_cache, bool snapshot_load
coins_cache.Flush();
}
//! Exception thrown by SnapshotUTXOHashBreakpoint() to abandon an in-progress
//! ComputeUTXOStats() run once a shutdown has been requested.
struct StopHashingException : public std::exception
{
    // `noexcept` replaces the dynamic exception specification `throw()`, which
    // is deprecated in C++17 and removed in C++20. It also matches the
    // signature of std::exception::what().
    const char* what() const noexcept override
    {
        return "ComputeUTXOStats interrupted by shutdown.";
    }
};
//! Interruption hook passed to ComputeUTXOStats(): does nothing unless a
//! shutdown has been requested, in which case it throws StopHashingException
//! so that the caller can unwind out of the hashing run.
static void SnapshotUTXOHashBreakpoint()
{
    if (!ShutdownRequested()) {
        return;
    }
    throw StopHashingException();
}
bool ChainstateManager::PopulateAndValidateSnapshot(
Chainstate& snapshot_chainstate,
AutoFile& coins_file,
@ -5169,13 +5212,18 @@ bool ChainstateManager::PopulateAndValidateSnapshot(
assert(coins_cache.GetBestBlock() == base_blockhash);
auto breakpoint_fnc = [] { /* TODO insert breakpoint here? */ };
// As above, okay to immediately release cs_main here since no other context knows
// about the snapshot_chainstate.
CCoinsViewDB* snapshot_coinsdb = WITH_LOCK(::cs_main, return &snapshot_chainstate.CoinsDB());
const std::optional<CCoinsStats> maybe_stats = ComputeUTXOStats(CoinStatsHashType::HASH_SERIALIZED, snapshot_coinsdb, m_blockman, breakpoint_fnc);
std::optional<CCoinsStats> maybe_stats;
try {
maybe_stats = ComputeUTXOStats(
CoinStatsHashType::HASH_SERIALIZED, snapshot_coinsdb, m_blockman, SnapshotUTXOHashBreakpoint);
} catch (StopHashingException const&) {
return false;
}
if (!maybe_stats.has_value()) {
LogPrintf("[snapshot] failed to generate coins stats\n");
return false;
@ -5243,6 +5291,149 @@ bool ChainstateManager::PopulateAndValidateSnapshot(
return true;
}
// Check whether the background (IBD) chainstate has reached the base block of
// the active UTXO snapshot and, if so, validate the snapshot by hashing the
// background chainstate's UTXO set and comparing it against the assumeutxo
// chainparams entry for that height.
//
// @param shutdown_fnc  Invoked with a user-facing error when the snapshot is
//                      found to be invalid (or cannot be validated).
//
// @returns
//   - SKIPPED if there is no usable snapshot/background chainstate pair, or
//     the background chainstate's tip is still below the snapshot base height.
//   - BASE_BLOCKHASH_MISMATCH, MISSING_CHAINPARAMS, STATS_FAILED or
//     HASH_MISMATCH on the respective failure. In these cases the snapshot
//     chainstate is disabled, the IBD chainstate becomes active, the snapshot
//     coins db is renamed on disk and `shutdown_fnc` is called. The one
//     exception is STATS_FAILED caused by a shutdown request interrupting the
//     hashing: that returns immediately without touching any state.
//   - SUCCESS once the hash matches; the background chainstate is then marked
//     `m_disabled` and caches are rebalanced.
//
// Currently, this function holds cs_main for its duration, which could be for
// multiple minutes due to the ComputeUTXOStats call. This hold is necessary
// because we need to avoid advancing the background validation chainstate
// farther than the snapshot base block - and this function is also invoked
// from within ConnectTip, i.e. from within ActivateBestChain, so cs_main is
// held anyway.
//
// Eventually (TODO), we could somehow separate this function's runtime from
// maintenance of the active chain, but that will either require
//
//  (i) setting `m_disabled` immediately and ensuring all chainstate accesses go
//      through IsUsable() checks, or
//
//  (ii) giving each chainstate its own lock instead of using cs_main for everything.
SnapshotCompletionResult ChainstateManager::MaybeCompleteSnapshotValidation(
    std::function<void(bilingual_str)> shutdown_fnc)
{
    AssertLockHeld(cs_main);
    if (m_ibd_chainstate.get() == &this->ActiveChainstate() ||
        !this->IsUsable(m_snapshot_chainstate.get()) ||
        !this->IsUsable(m_ibd_chainstate.get()) ||
        !m_ibd_chainstate->m_chain.Tip()) {
        // Nothing to do - this function only applies to the background
        // validation chainstate.
        return SnapshotCompletionResult::SKIPPED;
    }
    const int snapshot_tip_height = this->ActiveHeight();
    const int snapshot_base_height = *Assert(this->GetSnapshotBaseHeight());
    const CBlockIndex& index_new = *Assert(m_ibd_chainstate->m_chain.Tip());

    if (index_new.nHeight < snapshot_base_height) {
        // Background IBD not complete yet.
        return SnapshotCompletionResult::SKIPPED;
    }

    // Assert() aborts on an empty optional, so no separate assert is needed.
    uint256 snapshot_blockhash = *Assert(SnapshotBlockhash());

    // Common failure path: switch back to the IBD chainstate, disable the
    // snapshot chainstate, move its coins db aside on disk, and request
    // shutdown via the caller-supplied callback.
    auto handle_invalid_snapshot = [&]() EXCLUSIVE_LOCKS_REQUIRED(::cs_main) {
        bilingual_str user_error = strprintf(_(
            "%s failed to validate the -assumeutxo snapshot state. "
            "This indicates a hardware problem, or a bug in the software, or a "
            "bad software modification that allowed an invalid snapshot to be "
            "loaded. As a result of this, the node will shut down and stop using any "
            "state that was built on the snapshot, resetting the chain height "
            "from %d to %d. On the next "
            "restart, the node will resume syncing from %d "
            "without using any snapshot data. "
            "Please report this incident to %s, including how you obtained the snapshot. "
            "The invalid snapshot chainstate has been left on disk in case it is "
            "helpful in diagnosing the issue that caused this error."),
            PACKAGE_NAME, snapshot_tip_height, snapshot_base_height, snapshot_base_height, PACKAGE_BUGREPORT
        );

        LogPrintf("[snapshot] !!! %s\n", user_error.original);
        LogPrintf("[snapshot] deleting snapshot, reverting to validated chain, and stopping node\n");

        m_active_chainstate = m_ibd_chainstate.get();
        m_snapshot_chainstate->m_disabled = true;
        assert(!this->IsUsable(m_snapshot_chainstate.get()));
        assert(this->IsUsable(m_ibd_chainstate.get()));

        m_snapshot_chainstate->InvalidateCoinsDBOnDisk();

        shutdown_fnc(user_error);
    };

    if (index_new.GetBlockHash() != snapshot_blockhash) {
        // The trailing newline terminates the log entry; without it the next
        // log line would be appended to this one.
        LogPrintf("[snapshot] supposed base block %s does not match the " /* Continued */
                  "snapshot base block %s (height %d). Snapshot is not valid.\n",
                  index_new.ToString(), snapshot_blockhash.ToString(), snapshot_base_height);
        handle_invalid_snapshot();
        return SnapshotCompletionResult::BASE_BLOCKHASH_MISMATCH;
    }

    assert(index_new.nHeight == snapshot_base_height);

    int curr_height = m_ibd_chainstate->m_chain.Height();

    assert(snapshot_base_height == curr_height);
    assert(this->IsUsable(m_snapshot_chainstate.get()));
    assert(this->GetAll().size() == 2);

    CCoinsViewDB& ibd_coins_db = m_ibd_chainstate->CoinsDB();
    // Flush before computing stats, which read from the coins db.
    m_ibd_chainstate->ForceFlushStateToDisk();

    auto maybe_au_data = ExpectedAssumeutxo(curr_height, ::Params());
    if (!maybe_au_data) {
        LogPrintf("[snapshot] assumeutxo data not found for height " /* Continued */
                  "(%d) - refusing to validate snapshot\n", curr_height);
        handle_invalid_snapshot();
        return SnapshotCompletionResult::MISSING_CHAINPARAMS;
    }

    const AssumeutxoData& au_data = *maybe_au_data;
    std::optional<CCoinsStats> maybe_ibd_stats;
    LogPrintf("[snapshot] computing UTXO stats for background chainstate to validate " /* Continued */
              "snapshot - this could take a few minutes\n");
    // XXX note that this function is slow and will hold cs_main for potentially minutes.
    try {
        maybe_ibd_stats = ComputeUTXOStats(
            CoinStatsHashType::HASH_SERIALIZED,
            &ibd_coins_db,
            m_blockman,
            SnapshotUTXOHashBreakpoint);
    } catch (StopHashingException const&) {
        // Hashing was interrupted by a shutdown request. Nothing has been
        // learned about the snapshot, so leave all state untouched.
        return SnapshotCompletionResult::STATS_FAILED;
    }

    if (!maybe_ibd_stats) {
        LogPrintf("[snapshot] failed to generate stats for validation coins db\n");
        // While this isn't a problem with the snapshot per se, this condition
        // prevents us from validating the snapshot, so we should shut down and let the
        // user handle the issue manually.
        handle_invalid_snapshot();
        return SnapshotCompletionResult::STATS_FAILED;
    }
    const auto& ibd_stats = *maybe_ibd_stats;

    // Compare the background validation chainstate's UTXO set hash against the hard-coded
    // assumeutxo hash we expect.
    //
    // TODO: For belt-and-suspenders, we could cache the UTXO set
    // hash for the snapshot when it's loaded in its chainstate's leveldb. We could then
    // reference that here for an additional check.
    if (AssumeutxoHash{ibd_stats.hashSerialized} != au_data.hash_serialized) {
        LogPrintf("[snapshot] hash mismatch: actual=%s, expected=%s\n",
                  ibd_stats.hashSerialized.ToString(),
                  au_data.hash_serialized.ToString());
        handle_invalid_snapshot();
        return SnapshotCompletionResult::HASH_MISMATCH;
    }

    LogPrintf("[snapshot] snapshot beginning at %s has been fully validated\n",
              snapshot_blockhash.ToString());

    // The background chainstate has served its purpose; stop advancing it.
    m_ibd_chainstate->m_disabled = true;
    this->MaybeRebalanceCaches();

    return SnapshotCompletionResult::SUCCESS;
}
Chainstate& ChainstateManager::ActiveChainstate() const
{
LOCK(::cs_main);
@ -5367,6 +5558,44 @@ bool IsBIP30Unspendable(const CBlockIndex& block_index)
(block_index.nHeight==91812 && block_index.GetBlockHash() == uint256S("0x00000000000af0aed4792b1acee3d966af36cf5def14935db8de83d6f9306f2f"));
}
void Chainstate::InvalidateCoinsDBOnDisk()
{
AssertLockHeld(::cs_main);
// Should never be called on a non-snapshot chainstate.
assert(m_from_snapshot_blockhash);
auto storage_path_maybe = this->CoinsDB().StoragePath();
// Should never be called with a non-existent storage path.
assert(storage_path_maybe);
fs::path snapshot_datadir = *storage_path_maybe;
// Coins views no longer usable.
m_coins_views.reset();
auto invalid_path = snapshot_datadir + "_INVALID";
std::string dbpath = fs::PathToString(snapshot_datadir);
std::string target = fs::PathToString(invalid_path);
LogPrintf("[snapshot] renaming snapshot datadir %s to %s\n", dbpath, target);
// The invalid snapshot datadir is simply moved and not deleted because we may
// want to do forensics later during issue investigation. The user is instructed
// accordingly in MaybeCompleteSnapshotValidation().
try {
fs::rename(snapshot_datadir, invalid_path);
} catch (const fs::filesystem_error& e) {
auto src_str = fs::PathToString(snapshot_datadir);
auto dest_str = fs::PathToString(invalid_path);
LogPrintf("%s: error renaming file '%s' -> '%s': %s\n",
__func__, src_str, dest_str, e.what());
AbortNode(strprintf(
"Rename of '%s' -> '%s' failed. "
"You should resolve this by manually moving or deleting the invalid "
"snapshot directory %s, otherwise you will encounter the same error again "
"on the next startup.",
src_str, dest_str, src_str));
}
}
const CBlockIndex* ChainstateManager::GetSnapshotBaseBlock() const
{
const auto blockhash_op = this->SnapshotBlockhash();
@ -5379,3 +5608,90 @@ std::optional<int> ChainstateManager::GetSnapshotBaseHeight() const
const CBlockIndex* base = this->GetSnapshotBaseBlock();
return base ? std::make_optional(base->nHeight) : std::nullopt;
}
bool ChainstateManager::ValidatedSnapshotCleanup()
{
AssertLockHeld(::cs_main);
auto get_storage_path = [](auto& chainstate) EXCLUSIVE_LOCKS_REQUIRED(::cs_main) -> std::optional<fs::path> {
if (!(chainstate && chainstate->HasCoinsViews())) {
return {};
}
return chainstate->CoinsDB().StoragePath();
};
std::optional<fs::path> ibd_chainstate_path_maybe = get_storage_path(m_ibd_chainstate);
std::optional<fs::path> snapshot_chainstate_path_maybe = get_storage_path(m_snapshot_chainstate);
if (!this->IsSnapshotValidated()) {
// No need to clean up.
return false;
}
// If either path doesn't exist, that means at least one of the chainstates
// is in-memory, in which case we can't do on-disk cleanup. You'd better be
// in a unittest!
if (!ibd_chainstate_path_maybe || !snapshot_chainstate_path_maybe) {
LogPrintf("[snapshot] snapshot chainstate cleanup cannot happen with " /* Continued */
"in-memory chainstates. You are testing, right?\n");
return false;
}
const auto& snapshot_chainstate_path = *snapshot_chainstate_path_maybe;
const auto& ibd_chainstate_path = *ibd_chainstate_path_maybe;
// Since we're going to be moving around the underlying leveldb filesystem content
// for each chainstate, make sure that the chainstates (and their constituent
// CoinsViews members) have been destructed first.
//
// The caller of this method will be responsible for reinitializing chainstates
// if they want to continue operation.
this->ResetChainstates();
// No chainstates should be considered usable.
assert(this->GetAll().size() == 0);
LogPrintf("[snapshot] deleting background chainstate directory (now unnecessary) (%s)\n",
fs::PathToString(ibd_chainstate_path));
fs::path tmp_old{ibd_chainstate_path + "_todelete"};
auto rename_failed_abort = [](
fs::path p_old,
fs::path p_new,
const fs::filesystem_error& err) {
LogPrintf("%s: error renaming file (%s): %s\n",
__func__, fs::PathToString(p_old), err.what());
AbortNode(strprintf(
"Rename of '%s' -> '%s' failed. "
"Cannot clean up the background chainstate leveldb directory.",
fs::PathToString(p_old), fs::PathToString(p_new)));
};
try {
fs::rename(ibd_chainstate_path, tmp_old);
} catch (const fs::filesystem_error& e) {
rename_failed_abort(ibd_chainstate_path, tmp_old, e);
throw;
}
LogPrintf("[snapshot] moving snapshot chainstate (%s) to " /* Continued */
"default chainstate directory (%s)\n",
fs::PathToString(snapshot_chainstate_path), fs::PathToString(ibd_chainstate_path));
try {
fs::rename(snapshot_chainstate_path, ibd_chainstate_path);
} catch (const fs::filesystem_error& e) {
rename_failed_abort(snapshot_chainstate_path, ibd_chainstate_path, e);
throw;
}
if (!DeleteCoinsDBFromDisk(tmp_old, /*is_snapshot=*/false)) {
// No need to AbortNode because once the unneeded bg chainstate data is
// moved, it will not interfere with subsequent initialization.
LogPrintf("Deletion of %s failed. Please remove it manually, as the " /* Continued */
"directory is now unnecessary.\n",
fs::PathToString(tmp_old));
} else {
LogPrintf("[snapshot] deleted background chainstate directory (%s)\n",
fs::PathToString(ibd_chainstate_path));
}
return true;
}

View file

@ -24,6 +24,7 @@
#include <policy/packages.h>
#include <policy/policy.h>
#include <script/script_error.h>
#include <shutdown.h>
#include <sync.h>
#include <txdb.h>
#include <txmempool.h> // For CTxMemPool::cs
@ -663,6 +664,12 @@ public:
* May not be called with cs_main held. May not be called in a
* validationinterface callback.
*
* Note that if this is called while a snapshot chainstate is active, and if
* it is called on a background chainstate whose tip has reached the base block
* of the snapshot, its execution will take *MINUTES* while it hashes the
* background UTXO set to verify the assumeutxo value the snapshot was activated
* with. `cs_main` will be held during this time.
*
* @returns true unless a system error occurred
*/
bool ActivateBestChain(
@ -784,9 +791,37 @@ private:
std::chrono::microseconds m_last_write{0};
std::chrono::microseconds m_last_flush{0};
/**
* In case of an invalid snapshot, rename the coins leveldb directory so
* that it can be examined for issue diagnosis.
*/
void InvalidateCoinsDBOnDisk() EXCLUSIVE_LOCKS_REQUIRED(::cs_main);
friend ChainstateManager;
};
// Outcome of ChainstateManager::MaybeCompleteSnapshotValidation().
enum class SnapshotCompletionResult {
// The background chainstate's UTXO set hash matched the expected assumeutxo
// hash; the snapshot is fully validated.
SUCCESS,
// Nothing was done: either there is no usable snapshot/background chainstate
// pair, or the background chainstate has not yet reached the snapshot base
// height.
SKIPPED,
// Expected assumeutxo configuration data is not found for the height of the
// base block.
MISSING_CHAINPARAMS,
// Failed to generate UTXO statistics (to check UTXO set hash) for the background
// chainstate. Also returned when the computation is interrupted by a
// shutdown request.
STATS_FAILED,
// The UTXO set hash of the background validation chainstate does not match
// the one expected by assumeutxo chainparams.
HASH_MISMATCH,
// The blockhash of the current tip of the background validation chainstate does
// not match the one expected by the snapshot chainstate.
BASE_BLOCKHASH_MISMATCH,
};
/**
* Provides an interface for creating and interacting with one or two
* chainstates: an IBD chainstate generated by downloading blocks, and
@ -984,6 +1019,18 @@ public:
[[nodiscard]] bool ActivateSnapshot(
AutoFile& coins_file, const node::SnapshotMetadata& metadata, bool in_memory);
//! Once the background validation chainstate has reached the height which
//! is the base of the UTXO snapshot in use, compare its coins to ensure
//! they match those expected by the snapshot.
//!
//! If the coins match (expected), then mark the validation chainstate for
//! deletion and continue using the snapshot chainstate as active.
//! Otherwise, revert to using the ibd chainstate and shutdown.
SnapshotCompletionResult MaybeCompleteSnapshotValidation(
std::function<void(bilingual_str)> shutdown_fnc =
[](bilingual_str msg) { AbortNode(msg.original, msg); })
EXCLUSIVE_LOCKS_REQUIRED(::cs_main);
//! The most-work chain.
Chainstate& ActiveChainstate() const;
CChain& ActiveChain() const EXCLUSIVE_LOCKS_REQUIRED(GetMutex()) { return ActiveChainstate().m_chain; }
@ -1091,6 +1138,17 @@ public:
Chainstate& ActivateExistingSnapshot(CTxMemPool* mempool, uint256 base_blockhash)
EXCLUSIVE_LOCKS_REQUIRED(::cs_main);
//! If the snapshot chainstate has been fully validated, move its
//! chainstate directory over to the main `chainstate` location, completing
//! validation of the snapshot.
//!
//! If the cleanup succeeds, the caller will need to ensure chainstates are
//! reinitialized, since ResetChainstates() will be called before leveldb
//! directories are moved or deleted.
//!
//! @sa node/chainstate:LoadChainstate()
bool ValidatedSnapshotCleanup() EXCLUSIVE_LOCKS_REQUIRED(::cs_main);
~ChainstateManager();
};