From b6b4235c143eda6f0fa4ed86f2dcd47709b3b6ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc?= Date: Thu, 27 Mar 2025 18:40:08 +0100 Subject: [PATCH] Allocate `P2WSH`/`P2TR`/`P2PK` scripts on stack The current `prevector` size of 28 bytes (chosen to fill the `sizeof(CScript)` aligned size) was introduced in 2015 (https://github.com/bitcoin/bitcoin/pull/6914) before SegWit and Taproot. However, the increasingly common `P2WSH` and `P2TR` scripts are both 34 bytes, and are forced to use heap (re)allocation rather than efficient inline storage. The core trade-off of this change is to eliminate heap allocations for common 34-36 byte scripts at the cost of increasing the base memory footprint of all `CScript` objects by 8 bytes (while still respecting peak memory usage defined by `-dbcache`). Increasing the `prevector` size allows these scripts to be stored on the stack, avoiding heap allocations, reducing potential memory fragmentation, and improving performance during cache flushes. Massif analysis confirms a lower stable memory usage after flushing, suggesting the elimination of heap allocations outweighs the larger base size for common workloads. Due to memory alignment, increasing the `prevector` size to 36 bytes doesn't change the overall `sizeof(CScript)` compared to an increase to 34 bytes, allowing us to include `P2PK` scripts as well at no additional memory cost. Performance benchmarks for AssumeUTXO load and flush show: - Small dbcache (450MB): ~1% performance penalty due to more frequent flushes - Large dbcache (4500MB+): ~6-7% performance improvement due to fewer heap allocations Full IBD and reindex-chainstate with larger `dbcache` values also show an overall ~3% speedup. 
Co-authored-by: Ava Chow Co-authored-by: Andrew Toth --- src/bench/checkqueue.cpp | 2 +- src/bench/prevector.cpp | 36 ++++++++++++++--------------- src/script/script.h | 2 +- src/test/script_tests.cpp | 34 +++++++++++++-------------- src/test/validation_flush_tests.cpp | 6 ++--- 5 files changed, 39 insertions(+), 41 deletions(-) diff --git a/src/bench/checkqueue.cpp b/src/bench/checkqueue.cpp index 8134154eb11..72eaa6588ad 100644 --- a/src/bench/checkqueue.cpp +++ b/src/bench/checkqueue.cpp @@ -16,7 +16,7 @@ static const size_t BATCHES = 101; static const size_t BATCH_SIZE = 30; -static const int PREVECTOR_SIZE = 28; +static const int PREVECTOR_SIZE = 36; static const unsigned int QUEUE_BATCH_SIZE = 128; // This Benchmark tests the CheckQueue with a slightly realistic workload, diff --git a/src/bench/prevector.cpp b/src/bench/prevector.cpp index 60c43e86e1b..97e897519d8 100644 --- a/src/bench/prevector.cpp +++ b/src/bench/prevector.cpp @@ -27,22 +27,22 @@ template static void PrevectorDestructor(benchmark::Bench& bench) { bench.batch(2).run([&] { - prevector<28, T> t0; - prevector<28, T> t1; - t0.resize(28); - t1.resize(29); + prevector<36, T> t0; + prevector<36, T> t1; + t0.resize(36); + t1.resize(37); }); } template static void PrevectorClear(benchmark::Bench& bench) { - prevector<28, T> t0; - prevector<28, T> t1; + prevector<36, T> t0; + prevector<36, T> t1; bench.batch(2).run([&] { - t0.resize(28); + t0.resize(36); t0.clear(); - t1.resize(29); + t1.resize(37); t1.clear(); }); } @@ -50,12 +50,12 @@ static void PrevectorClear(benchmark::Bench& bench) template static void PrevectorResize(benchmark::Bench& bench) { - prevector<28, T> t0; - prevector<28, T> t1; + prevector<36, T> t0; + prevector<36, T> t1; bench.batch(4).run([&] { - t0.resize(28); + t0.resize(36); t0.resize(0); - t1.resize(29); + t1.resize(37); t1.resize(0); }); } @@ -64,8 +64,8 @@ template static void PrevectorDeserialize(benchmark::Bench& bench) { DataStream s0{}; - prevector<28, T> t0; - 
t0.resize(28); + prevector<36, T> t0; + t0.resize(36); for (auto x = 0; x < 900; ++x) { s0 << t0; } @@ -74,7 +74,7 @@ static void PrevectorDeserialize(benchmark::Bench& bench) s0 << t0; } bench.batch(1000).run([&] { - prevector<28, T> t1; + prevector<36, T> t1; for (auto x = 0; x < 1000; ++x) { s0 >> t1; } @@ -86,7 +86,7 @@ template static void PrevectorFillVectorDirect(benchmark::Bench& bench) { bench.run([&] { - std::vector> vec; + std::vector> vec; vec.reserve(260); for (size_t i = 0; i < 260; ++i) { vec.emplace_back(); @@ -99,11 +99,11 @@ template static void PrevectorFillVectorIndirect(benchmark::Bench& bench) { bench.run([&] { - std::vector> vec; + std::vector> vec; vec.reserve(260); for (size_t i = 0; i < 260; ++i) { // force allocation - vec.emplace_back(29, T{}); + vec.emplace_back(37, T{}); } }); } diff --git a/src/script/script.h b/src/script/script.h index f4579849803..9c6d25eb157 100644 --- a/src/script/script.h +++ b/src/script/script.h @@ -406,7 +406,7 @@ private: * Tests in October 2015 showed use of this reduced dbcache memory usage by 23% * and made an initial sync 13% faster. 
*/ -typedef prevector<28, unsigned char> CScriptBase; +typedef prevector<36, unsigned char> CScriptBase; bool GetScriptOp(CScriptBase::const_iterator& pc, CScriptBase::const_iterator end, opcodetype& opcodeRet, std::vector* pvchRet); diff --git a/src/test/script_tests.cpp b/src/test/script_tests.cpp index 700a0fd3427..f240fe8949b 100644 --- a/src/test/script_tests.cpp +++ b/src/test/script_tests.cpp @@ -1131,10 +1131,10 @@ BOOST_AUTO_TEST_CASE(script_CHECKMULTISIG23) BOOST_AUTO_TEST_CASE(script_size_and_capacity_test) { - BOOST_CHECK_EQUAL(sizeof(prevector<28, unsigned char>), 32); - BOOST_CHECK_EQUAL(sizeof(CScriptBase), 32); - BOOST_CHECK_EQUAL(sizeof(CScript), 32); - BOOST_CHECK_EQUAL(sizeof(CTxOut), 40); + BOOST_CHECK_EQUAL(sizeof(prevector<34, uint8_t>), sizeof(prevector<36, uint8_t>)); + BOOST_CHECK_EQUAL(sizeof(CScriptBase), 40); + BOOST_CHECK_EQUAL(sizeof(CScript), 40); + BOOST_CHECK_EQUAL(sizeof(CTxOut), 48); CKey dummyKey; dummyKey.MakeNewKey(true); @@ -1146,7 +1146,7 @@ BOOST_AUTO_TEST_CASE(script_size_and_capacity_test) const auto scriptSmallOpReturn{CScript() << OP_RETURN << std::vector(10, 0xaa)}; BOOST_CHECK_EQUAL(Solver(scriptSmallOpReturn, dummyVSolutions), TxoutType::NULL_DATA); BOOST_CHECK_EQUAL(scriptSmallOpReturn.size(), 12); - BOOST_CHECK_EQUAL(scriptSmallOpReturn.capacity(), 28); + BOOST_CHECK_EQUAL(scriptSmallOpReturn.capacity(), 36); BOOST_CHECK_EQUAL(scriptSmallOpReturn.allocated_memory(), 0); } @@ -1155,7 +1155,7 @@ BOOST_AUTO_TEST_CASE(script_size_and_capacity_test) const auto scriptP2WPKH{GetScriptForDestination(WitnessV0KeyHash{PKHash{CKeyID{CPubKey{dummyKey.GetPubKey()}.GetID()}}})}; BOOST_CHECK_EQUAL(Solver(scriptP2WPKH, dummyVSolutions), TxoutType::WITNESS_V0_KEYHASH); BOOST_CHECK_EQUAL(scriptP2WPKH.size(), 22); - BOOST_CHECK_EQUAL(scriptP2WPKH.capacity(), 28); + BOOST_CHECK_EQUAL(scriptP2WPKH.capacity(), 36); BOOST_CHECK_EQUAL(scriptP2WPKH.allocated_memory(), 0); } @@ -1164,7 +1164,7 @@ 
BOOST_AUTO_TEST_CASE(script_size_and_capacity_test) const auto scriptP2SH{GetScriptForDestination(ScriptHash{CScript{} << OP_TRUE})}; BOOST_CHECK(scriptP2SH.IsPayToScriptHash()); BOOST_CHECK_EQUAL(scriptP2SH.size(), 23); - BOOST_CHECK_EQUAL(scriptP2SH.capacity(), 28); + BOOST_CHECK_EQUAL(scriptP2SH.capacity(), 36); BOOST_CHECK_EQUAL(scriptP2SH.allocated_memory(), 0); } @@ -1173,35 +1173,35 @@ BOOST_AUTO_TEST_CASE(script_size_and_capacity_test) const auto scriptP2PKH{GetScriptForDestination(PKHash{CKeyID{CPubKey{dummyKey.GetPubKey()}.GetID()}})}; BOOST_CHECK_EQUAL(Solver(scriptP2PKH, dummyVSolutions), TxoutType::PUBKEYHASH); BOOST_CHECK_EQUAL(scriptP2PKH.size(), 25); - BOOST_CHECK_EQUAL(scriptP2PKH.capacity(), 28); + BOOST_CHECK_EQUAL(scriptP2PKH.capacity(), 36); BOOST_CHECK_EQUAL(scriptP2PKH.allocated_memory(), 0); } - // P2WSH is heap allocated + // P2WSH is stack allocated { const auto scriptP2WSH{GetScriptForDestination(WitnessV0ScriptHash{CScript{} << OP_TRUE})}; BOOST_CHECK(scriptP2WSH.IsPayToWitnessScriptHash()); BOOST_CHECK_EQUAL(scriptP2WSH.size(), 34); - BOOST_CHECK_EQUAL(scriptP2WSH.capacity(), 34); - BOOST_CHECK_EQUAL(scriptP2WSH.allocated_memory(), 34); + BOOST_CHECK_EQUAL(scriptP2WSH.capacity(), 36); + BOOST_CHECK_EQUAL(scriptP2WSH.allocated_memory(), 0); } - // P2TR is heap allocated + // P2TR is stack allocated { const auto scriptTaproot{GetScriptForDestination(WitnessV1Taproot{XOnlyPubKey{CPubKey{dummyKey.GetPubKey()}}})}; BOOST_CHECK_EQUAL(Solver(scriptTaproot, dummyVSolutions), TxoutType::WITNESS_V1_TAPROOT); BOOST_CHECK_EQUAL(scriptTaproot.size(), 34); - BOOST_CHECK_EQUAL(scriptTaproot.capacity(), 34); - BOOST_CHECK_EQUAL(scriptTaproot.allocated_memory(), 34); + BOOST_CHECK_EQUAL(scriptTaproot.capacity(), 36); + BOOST_CHECK_EQUAL(scriptTaproot.allocated_memory(), 0); } - // P2PK is heap allocated + // P2PK is stack allocated { const auto scriptPubKey{GetScriptForRawPubKey(CPubKey{dummyKey.GetPubKey()})}; BOOST_CHECK_EQUAL(Solver(scriptPubKey, 
dummyVSolutions), TxoutType::PUBKEY); BOOST_CHECK_EQUAL(scriptPubKey.size(), 35); - BOOST_CHECK_EQUAL(scriptPubKey.capacity(), 35); - BOOST_CHECK_EQUAL(scriptPubKey.allocated_memory(), 35); + BOOST_CHECK_EQUAL(scriptPubKey.capacity(), 36); + BOOST_CHECK_EQUAL(scriptPubKey.allocated_memory(), 0); } // MULTISIG is always heap allocated diff --git a/src/test/validation_flush_tests.cpp b/src/test/validation_flush_tests.cpp index c325f7deb2b..b109c5be078 100644 --- a/src/test/validation_flush_tests.cpp +++ b/src/test/validation_flush_tests.cpp @@ -26,10 +26,8 @@ BOOST_AUTO_TEST_CASE(getcoinscachesizestate) LOCK(::cs_main); auto& view = chainstate.CoinsTip(); - // The number of bytes consumed by coin's heap data, i.e. CScript - // (prevector<28, unsigned char>) when assigned 56 bytes of data per above. - // - // See also: Coin::DynamicMemoryUsage(). + // The number of bytes consumed by coin's heap data, i.e. CScript (prevector<36, unsigned char>) + // when assigned 56 bytes of data per above. See also: Coin::DynamicMemoryUsage(). constexpr unsigned int COIN_SIZE = is_64_bit ? 80 : 64; auto print_view_mem_usage = [](CCoinsViewCache& view) {