From c6b5db1d591f0984cd0e6918970a9e4fc32595d3 Mon Sep 17 00:00:00 2001 From: furszy Date: Tue, 3 Sep 2024 11:00:55 -0300 Subject: [PATCH] assumeUTXO: fix peers disconnection during sync Because AssumeUTXO nodes prioritize tip synchronization, they relay their local address through the network before completing the background chain sync. This, combined with the advertising of full-node service (NODE_NETWORK), can result in an honest peer in IBD connecting to the AssumeUTXO node (while syncing) and requesting an historical block the node does not have. This behavior leads to an abrupt disconnection due to perceived unresponsiveness (lack of response) from the AssumeUTXO node. This lack of response occurs because nodes ignore getdata requests when they do not have the block data available (further discussion can be found in PR 30385). Fix this by refraining from signaling full-node service support while the background chain is being synced. During this period, the node will only signal 'NODE_NETWORK_LIMITED' support. Then, full-node ('NODE_NETWORK') support will be re-enabled once the background chain sync is completed. Github-Pull: bitcoin/bitcoin#30807 Rebased-From: 6d5812e5c852c233bd7ead2ceef051f8567619ed --- src/init.cpp | 16 +++++++++--- src/net.cpp | 5 ++-- src/net.h | 10 ++++++-- src/rpc/blockchain.cpp | 7 ++++++ src/validation.cpp | 4 +-- src/validation.h | 2 +- test/functional/feature_assumeutxo.py | 36 +++++++++++++++++++++++++++ 7 files changed, 70 insertions(+), 10 deletions(-) diff --git a/src/init.cpp b/src/init.cpp index faaf3353d07..2572f9d78c4 100644 --- a/src/init.cpp +++ b/src/init.cpp @@ -1558,7 +1558,12 @@ bool AppInitMain(NodeContext& node, interfaces::BlockAndHeaderTipInfo* tip_info) // This is defined and set here instead of inline in validation.h to avoid a hard // dependency between validation and index/base, since the latter is not in // libbitcoinkernel. - chainman.restart_indexes = [&node]() { + chainman.snapshot_download_completed = [&node]() { + if (!node.chainman->m_blockman.IsPruneMode()) { + LogPrintf("[snapshot] re-enabling NODE_NETWORK services\n"); + node.connman->AddLocalServices(NODE_NETWORK); + } + LogPrintf("[snapshot] restarting indexes\n"); // Drain the validation interface queue to ensure that the old indexes @@ -1695,8 +1700,13 @@ bool AppInitMain(NodeContext& node, interfaces::BlockAndHeaderTipInfo* tip_info) } } } else { - LogPrintf("Setting NODE_NETWORK on non-prune mode\n"); - nLocalServices = ServiceFlags(nLocalServices | NODE_NETWORK); + // Prior to setting NODE_NETWORK, check if we can provide historical blocks. + if (!WITH_LOCK(chainman.GetMutex(), return chainman.BackgroundSyncInProgress())) { + LogPrintf("Setting NODE_NETWORK on non-prune mode\n"); + nLocalServices = ServiceFlags(nLocalServices | NODE_NETWORK); + } else { + LogPrintf("Running node in NODE_NETWORK_LIMITED mode until snapshot background sync completes\n"); + } } // ********************************************************* Step 11: import blocks diff --git a/src/net.cpp b/src/net.cpp index e1206745a42..05b98d1029b 100644 --- a/src/net.cpp +++ b/src/net.cpp @@ -1789,7 +1789,8 @@ void CConnman::CreateNodeFromAcceptedSocket(std::unique_ptr&& sock, const bool inbound_onion = std::find(m_onion_binds.begin(), m_onion_binds.end(), addr_bind) != m_onion_binds.end(); // The V2Transport transparently falls back to V1 behavior when an incoming V1 connection is // detected, so use it whenever we signal NODE_P2P_V2. - const bool use_v2transport(nLocalServices & NODE_P2P_V2); + ServiceFlags local_services = GetLocalServices(); + const bool use_v2transport(local_services & NODE_P2P_V2); CNode* pnode = new CNode(id, std::move(sock), @@ -1807,7 +1808,7 @@ void CConnman::CreateNodeFromAcceptedSocket(std::unique_ptr&& sock, .use_v2transport = use_v2transport, }); pnode->AddRef(); - m_msgproc->InitializeNode(*pnode, nLocalServices); + m_msgproc->InitializeNode(*pnode, local_services); { LOCK(m_nodes_mutex); m_nodes.push_back(pnode); diff --git a/src/net.h b/src/net.h index beec58c3898..b04b73e696f 100644 --- a/src/net.h +++ b/src/net.h @@ -1221,6 +1221,11 @@ public: //! that peer during `net_processing.cpp:PushNodeVersion()`. ServiceFlags GetLocalServices() const; + //! Updates the local services that this node advertises to other peers + //! during connection handshake. + void AddLocalServices(ServiceFlags services) { nLocalServices = ServiceFlags(nLocalServices | services); }; + void RemoveLocalServices(ServiceFlags services) { nLocalServices = ServiceFlags(nLocalServices & ~services); } + uint64_t GetMaxOutboundTarget() const EXCLUSIVE_LOCKS_REQUIRED(!m_total_bytes_sent_mutex); std::chrono::seconds GetMaxOutboundTimeframe() const; @@ -1460,11 +1465,12 @@ private: * This data is replicated in each Peer instance we create. * * This data is not marked const, but after being set it should not - * change. + * change. Unless AssumeUTXO is started, in which case, the peer + * will be limited until the background chain sync finishes. * * \sa Peer::our_services */ - ServiceFlags nLocalServices; + std::atomic nLocalServices; std::unique_ptr semOutbound; std::unique_ptr semAddnode; diff --git a/src/rpc/blockchain.cpp b/src/rpc/blockchain.cpp index b449444aff4..8b8576ab88d 100644 --- a/src/rpc/blockchain.cpp +++ b/src/rpc/blockchain.cpp @@ -2866,6 +2866,13 @@ static RPCHelpMan loadtxoutset() throw JSONRPCError(RPC_INTERNAL_ERROR, strprintf("Unable to load UTXO snapshot: %s. (%s)", util::ErrorString(activation_result).original, path.utf8string())); } + // Because we can't provide historical blocks during tip or background sync. + // Update local services to reflect we are a limited peer until we are fully sync. + node.connman->RemoveLocalServices(NODE_NETWORK); + // Setting the limited state is usually redundant because the node can always + // provide the last 288 blocks, but it doesn't hurt to set it. + node.connman->AddLocalServices(NODE_NETWORK_LIMITED); + CBlockIndex& snapshot_index{*CHECK_NONFATAL(*activation_result)}; UniValue result(UniValue::VOBJ); diff --git a/src/validation.cpp b/src/validation.cpp index 8f75b2e30a0..a6b7f3d3614 100644 --- a/src/validation.cpp +++ b/src/validation.cpp @@ -3574,8 +3574,8 @@ bool Chainstate::ActivateBestChain(BlockValidationState& state, std::shared_ptr< // // This cannot be done while holding cs_main (within // MaybeCompleteSnapshotValidation) or a cs_main deadlock will occur. - if (m_chainman.restart_indexes) { - m_chainman.restart_indexes(); + if (m_chainman.snapshot_download_completed) { + m_chainman.snapshot_download_completed(); } break; } diff --git a/src/validation.h b/src/validation.h index f905d6e6240..bc9bacf7ceb 100644 --- a/src/validation.h +++ b/src/validation.h @@ -976,7 +976,7 @@ public: //! Function to restart active indexes; set dynamically to avoid a circular //! dependency on `base/index.cpp`. - std::function restart_indexes = std::function(); + std::function snapshot_download_completed = std::function(); const CChainParams& GetParams() const { return m_options.chainparams; } const Consensus::Params& GetConsensus() const { return m_options.chainparams.GetConsensus(); } diff --git a/test/functional/feature_assumeutxo.py b/test/functional/feature_assumeutxo.py index a212704311d..05899107957 100755 --- a/test/functional/feature_assumeutxo.py +++ b/test/functional/feature_assumeutxo.py @@ -247,6 +247,11 @@ class AssumeutxoTest(BitcoinTestFramework): node1.submitheader(main_block1) node1.submitheader(main_block2) + def assert_only_network_limited_service(self, node): + node_services = node.getnetworkinfo()['localservicesnames'] + assert 'NETWORK' not in node_services + assert 'NETWORK_LIMITED' in node_services + def run_test(self): """ Bring up two (disconnected) nodes, mine some new blocks on the first, @@ -343,6 +348,9 @@ class AssumeutxoTest(BitcoinTestFramework): self.test_snapshot_block_invalidated(dump_output['path']) self.test_snapshot_not_on_most_work_chain(dump_output['path']) + # Prune-node sanity check + assert 'NETWORK' not in n1.getnetworkinfo()['localservicesnames'] + self.log.info(f"Loading snapshot into second node from {dump_output['path']}") # This node's tip is on an ancestor block of the snapshot, which should # be the normal case @@ -350,6 +358,10 @@ class AssumeutxoTest(BitcoinTestFramework): assert_equal(loaded['coins_loaded'], SNAPSHOT_BASE_HEIGHT) assert_equal(loaded['base_height'], SNAPSHOT_BASE_HEIGHT) + self.log.info("Confirm that local services remain unchanged") + # Since n1 is a pruned node, the 'NETWORK' service flag must always be unset. + self.assert_only_network_limited_service(n1) + self.log.info("Check that UTXO-querying RPCs operate on snapshot chainstate") snapshot_hash = loaded['tip_hash'] snapshot_num_coins = loaded['coins_loaded'] @@ -453,6 +465,9 @@ class AssumeutxoTest(BitcoinTestFramework): self.restart_node(1, extra_args=[ f"-stopatheight={PAUSE_HEIGHT}", *self.extra_args[1]]) + # Upon restart during snapshot tip sync, the node must remain in 'limited' mode. + self.assert_only_network_limited_service(n1) + # Finally connect the nodes and let them sync. # # Set `wait_for_connect=False` to avoid a race between performing connection @@ -469,6 +484,9 @@ class AssumeutxoTest(BitcoinTestFramework): self.log.info("Restarted node before snapshot validation completed, reloading...") self.restart_node(1, extra_args=self.extra_args[1]) + # Upon restart, the node must remain in 'limited' mode + self.assert_only_network_limited_service(n1) + # Send snapshot block to n1 out of order. This makes the test less # realistic because normally the snapshot block is one of the last # blocks downloaded, but its useful to test because it triggers more @@ -487,6 +505,10 @@ class AssumeutxoTest(BitcoinTestFramework): self.log.info("Ensuring background validation completes") self.wait_until(lambda: len(n1.getchainstates()['chainstates']) == 1) + # Since n1 is a pruned node, it will not signal NODE_NETWORK after + # completing the background sync. + self.assert_only_network_limited_service(n1) + # Ensure indexes have synced. completed_idx_state = { 'basic block filter index': COMPLETE_IDX, @@ -517,12 +539,18 @@ class AssumeutxoTest(BitcoinTestFramework): self.log.info("-- Testing all indexes + reindex") assert_equal(n2.getblockcount(), START_HEIGHT) + assert 'NETWORK' in n2.getnetworkinfo()['localservicesnames'] # sanity check self.log.info(f"Loading snapshot into third node from {dump_output['path']}") loaded = n2.loadtxoutset(dump_output['path']) assert_equal(loaded['coins_loaded'], SNAPSHOT_BASE_HEIGHT) assert_equal(loaded['base_height'], SNAPSHOT_BASE_HEIGHT) + # Even though n2 is a full node, it will unset the 'NETWORK' service flag during snapshot loading. + # This indicates other peers that the node will temporarily not provide historical blocks. + self.log.info("Check node2 updated the local services during snapshot load") + self.assert_only_network_limited_service(n2) + for reindex_arg in ['-reindex=1', '-reindex-chainstate=1']: self.log.info(f"Check that restarting with {reindex_arg} will delete the snapshot chainstate") self.restart_node(2, extra_args=[reindex_arg, *self.extra_args[2]]) @@ -546,6 +574,11 @@ class AssumeutxoTest(BitcoinTestFramework): msg = "Unable to load UTXO snapshot: Can't activate a snapshot-based chainstate more than once" assert_raises_rpc_error(-32603, msg, n2.loadtxoutset, dump_output['path']) + # Upon restart, the node must stay in 'limited' mode until the background + # chain sync completes. + self.restart_node(2, extra_args=self.extra_args[2]) + self.assert_only_network_limited_service(n2) + self.connect_nodes(0, 2) self.wait_until(lambda: n2.getchainstates()['chainstates'][-1]['blocks'] == FINAL_HEIGHT) self.sync_blocks(nodes=(n0, n2)) @@ -553,6 +586,9 @@ class AssumeutxoTest(BitcoinTestFramework): self.log.info("Ensuring background validation completes") self.wait_until(lambda: len(n2.getchainstates()['chainstates']) == 1) + # Once background chain sync completes, the full node must start offering historical blocks again. + assert {'NETWORK', 'NETWORK_LIMITED'}.issubset(n2.getnetworkinfo()['localservicesnames']) + completed_idx_state = { 'basic block filter index': COMPLETE_IDX, 'coinstatsindex': COMPLETE_IDX,