From 3d203c2acfc55b84a7aa29272454b6b196051e43 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C5=91rinc?= <pap.lorinc@gmail.com>
Date: Thu, 24 Oct 2024 08:37:57 +0200
Subject: [PATCH 01/24] test: Compare util::Xor with randomized inputs against
 simple impl

Since production code only uses keys of length 8, we're not testing with other values anymore
---
 src/test/streams_tests.cpp | 93 ++++++++++++++++++++++++++++++++------
 1 file changed, 79 insertions(+), 14 deletions(-)
diff --git a/src/test/streams_tests.cpp b/src/test/streams_tests.cpp
index c7b5cd353e0..c29f34fab58 100644
--- a/src/test/streams_tests.cpp
+++ b/src/test/streams_tests.cpp
@@ -13,19 +13,81 @@
 #include <boost/test/unit_test.hpp>
 
 using namespace std::string_literals;
+using namespace util::hex_literals;
 
 BOOST_FIXTURE_TEST_SUITE(streams_tests, BasicTestingSetup)
 
+// Test that obfuscation can be properly reversed even with random chunk sizes.
+BOOST_AUTO_TEST_CASE(xor_roundtrip_random_chunks)
+{
+    auto apply_random_xor_chunks{[&](std::span<std::byte> write, const std::span<std::byte> key) {
+        for (size_t offset{0}; offset < write.size();) {
+            const size_t chunk_size{1 + m_rng.randrange(write.size() - offset)};
+            util::Xor(write.subspan(offset, chunk_size), key, offset);
+            offset += chunk_size;
+        }
+    }};
+
+    for (size_t test{0}; test < 100; ++test) {
+        const size_t write_size{1 + m_rng.randrange(100U)};
+        const std::vector original{m_rng.randbytes<std::byte>(write_size)};
+        std::vector roundtrip{original};
+
+        std::vector key_bytes{m_rng.randbytes<std::byte>(sizeof(uint64_t))};
+        uint64_t xor_key;
+        std::memcpy(&xor_key, key_bytes.data(), sizeof(xor_key));
+        apply_random_xor_chunks(roundtrip, key_bytes);
+
+        // Verify intermediate state is different from original (unless key is zero)
+        const bool all_zero = (xor_key == 0) || (HexStr(key_bytes).find_first_not_of('0') >= write_size * 2);
+        BOOST_CHECK_EQUAL(original != roundtrip, !all_zero);
+
+        apply_random_xor_chunks(roundtrip, key_bytes);
+        BOOST_CHECK(original == roundtrip);
+    }
+}
+
+// Compares optimized obfuscation against a trivial byte-by-byte reference implementation
+// with random offsets to ensure proper handling of key wrapping.
+BOOST_AUTO_TEST_CASE(xor_bytes_reference)
+{
+    auto expected_xor{[](std::span<std::byte> write, const std::span<const std::byte> key, size_t key_offset) {
+        for (auto& b : write) {
+            b ^= key[key_offset++ % key.size()];
+        }
+    }};
+
+    for (size_t test{0}; test < 100; ++test) {
+        const size_t write_size{1 + m_rng.randrange(100U)};
+        const size_t key_offset{m_rng.randrange(3 * 8U)}; // Should wrap around
+
+        std::vector key_bytes{m_rng.randbytes<std::byte>(sizeof(uint64_t))};
+        uint64_t xor_key;
+        std::memcpy(&xor_key, key_bytes.data(), sizeof(xor_key));
+
+        std::vector expected{m_rng.randbytes<std::byte>(write_size)};
+        std::vector actual{expected};
+
+        expected_xor(expected, key_bytes, key_offset);
+        util::Xor(actual, key_bytes, key_offset);
+
+        BOOST_CHECK_EQUAL_COLLECTIONS(expected.begin(), expected.end(), actual.begin(), actual.end());
+    }
+}
+
 BOOST_AUTO_TEST_CASE(xor_file)
 {
     fs::path xor_path{m_args.GetDataDirBase() / "test_xor.bin"};
     auto raw_file{[&](const auto& mode) { return fsbridge::fopen(xor_path, mode); }};
     const std::vector<uint8_t> test1{1, 2, 3};
     const std::vector<uint8_t> test2{4, 5};
-    const std::vector<std::byte> xor_pat{std::byte{0xff}, std::byte{0x00}};
+    auto key_bytes{"ff00ff00ff00ff00"_hex_v};
+    uint64_t xor_key;
+    std::memcpy(&xor_key, key_bytes.data(), sizeof(xor_key));
+
     {
         // Check errors for missing file
-        AutoFile xor_file{raw_file("rb"), xor_pat};
+        AutoFile xor_file{raw_file("rb"), key_bytes};
         BOOST_CHECK_EXCEPTION(xor_file << std::byte{}, std::ios_base::failure, HasReason{"AutoFile::write: file handle is nullpt"});
         BOOST_CHECK_EXCEPTION(xor_file >> std::byte{}, std::ios_base::failure, HasReason{"AutoFile::read: file handle is nullpt"});
         BOOST_CHECK_EXCEPTION(xor_file.ignore(1), std::ios_base::failure, HasReason{"AutoFile::ignore: file handle is nullpt"});
@@ -37,7 +99,7 @@ BOOST_AUTO_TEST_CASE(xor_file)
 #else
         const char* mode = "wbx";
 #endif
-        AutoFile xor_file{raw_file(mode), xor_pat};
+        AutoFile xor_file{raw_file(mode), key_bytes};
         xor_file << test1 << test2;
     }
     {
@@ -50,7 +112,7 @@ BOOST_AUTO_TEST_CASE(xor_file)
         BOOST_CHECK_EXCEPTION(non_xor_file.ignore(1), std::ios_base::failure, HasReason{"AutoFile::ignore: end of file"});
     }
     {
-        AutoFile xor_file{raw_file("rb"), xor_pat};
+        AutoFile xor_file{raw_file("rb"), key_bytes};
         std::vector<std::byte> read1, read2;
         xor_file >> read1 >> read2;
         BOOST_CHECK_EQUAL(HexStr(read1), HexStr(test1));
@@ -59,7 +121,7 @@ BOOST_AUTO_TEST_CASE(xor_file)
         BOOST_CHECK_EXCEPTION(xor_file >> std::byte{}, std::ios_base::failure, HasReason{"AutoFile::read: end of file"});
     }
     {
-        AutoFile xor_file{raw_file("rb"), xor_pat};
+        AutoFile xor_file{raw_file("rb"), key_bytes};
         std::vector<std::byte> read2;
         // Check that ignore works
         xor_file.ignore(4);
@@ -75,7 +137,7 @@ BOOST_AUTO_TEST_CASE(streams_vector_writer)
 {
     unsigned char a(1);
     unsigned char b(2);
-    unsigned char bytes[] = { 3, 4, 5, 6 };
+    unsigned char bytes[] = {3, 4, 5, 6};
     std::vector<unsigned char> vch;
 
     // Each test runs twice. Serializing a second time at the same starting
@@ -227,29 +289,32 @@ BOOST_AUTO_TEST_CASE(streams_serializedata_xor)
     // Degenerate case
     {
         DataStream ds{in};
-        ds.Xor({0x00, 0x00});
+        ds.Xor("0000000000000000"_hex_v_u8);
         BOOST_CHECK_EQUAL(""s, ds.str());
     }
 
     in.push_back(std::byte{0x0f});
     in.push_back(std::byte{0xf0});
 
-    // Single character key
     {
+        const auto key_bytes{"ffffffffffffffff"_hex_v};
+        uint64_t xor_key;
+        std::memcpy(&xor_key, key_bytes.data(), sizeof(xor_key));
+
         DataStream ds{in};
-        ds.Xor({0xff});
+        ds.Xor("ffffffffffffffff"_hex_v_u8);
         BOOST_CHECK_EQUAL("\xf0\x0f"s, ds.str());
     }
 
-    // Multi character key
-
     in.clear();
     in.push_back(std::byte{0xf0});
     in.push_back(std::byte{0x0f});
 
     {
+        const auto key_bytes{"ff0fff0fff0fff0f"_hex_v};
+
         DataStream ds{in};
-        ds.Xor({0xff, 0x0f});
+        ds.Xor("ff0fff0fff0fff0f"_hex_v_u8);
         BOOST_CHECK_EQUAL("\x0f\x00"s, ds.str());
     }
 }
@@ -272,7 +337,7 @@ BOOST_AUTO_TEST_CASE(streams_buffered_file)
         BOOST_CHECK(false);
     } catch (const std::exception& e) {
         BOOST_CHECK(strstr(e.what(),
-                        "Rewind limit must be less than buffer size") != nullptr);
+                           "Rewind limit must be less than buffer size") != nullptr);
     }
 
     // The buffer is 25 bytes, allow rewinding 10 bytes.
@@ -361,7 +426,7 @@ BOOST_AUTO_TEST_CASE(streams_buffered_file)
         BOOST_CHECK(false);
     } catch (const std::exception& e) {
         BOOST_CHECK(strstr(e.what(),
-                        "BufferedFile::Fill: end of file") != nullptr);
+                           "BufferedFile::Fill: end of file") != nullptr);
     }
     // Attempting to read beyond the end sets the EOF indicator.
     BOOST_CHECK(bf.eof());

From 5d5f3d06dd845b4b4f3a7839dd86d8d8fcc4b098 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C5=91rinc?= <pap.lorinc@gmail.com>
Date: Fri, 6 Dec 2024 16:18:03 +0100
Subject: [PATCH 02/24] bench: Make XorObfuscationBench more representative

Since another PR solves the tiny byte xors during serialization, we're only concentrating on big continuous chunks now.

>  cmake -B build -DBUILD_BENCH=ON -DCMAKE_BUILD_TYPE=Release \
&& cmake --build build -j$(nproc) \
&& build/bin/bench_bitcoin -filter='XorObfuscationBench' -min-time=10000

C++ compiler .......................... AppleClang 17.0.0.17000013

|              ns/MiB |               MiB/s |    err% |     total | benchmark
|--------------------:|--------------------:|--------:|----------:|:----------
|          731,927.62 |            1,366.26 |    0.2% |     10.67 | `XorObfuscationBench`

C++ compiler .......................... GNU 13.3.0

|              ns/MiB |               MiB/s |    err% |         ins/MiB |         cyc/MiB |    IPC |        bra/MiB |   miss% |     total | benchmark
|--------------------:|--------------------:|--------:|----------------:|----------------:|-------:|---------------:|--------:|----------:|:----------
|          941,015.26 |            1,062.68 |    0.0% |    9,437,186.97 |    3,378,911.52 |  2.793 |   1,048,577.15 |    0.0% |     10.99 | `XorObfuscationBench`
---
 src/bench/xor.cpp | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/src/bench/xor.cpp b/src/bench/xor.cpp
index fc9dc5d1721..c76103e8851 100644
--- a/src/bench/xor.cpp
+++ b/src/bench/xor.cpp
@@ -6,19 +6,28 @@
 #include <random.h>
 #include <span.h>
 #include <streams.h>
+#include <util/byte_units.h>
 
+#include <cmath>
 #include <cstddef>
+#include <map>
 #include <vector>
 
-static void Xor(benchmark::Bench& bench)
+static void XorObfuscationBench(benchmark::Bench& bench)
 {
-    FastRandomContext frc{/*fDeterministic=*/true};
-    auto data{frc.randbytes<std::byte>(1024)};
-    auto key{frc.randbytes<std::byte>(31)};
+    FastRandomContext rng{/*fDeterministic=*/true};
+    constexpr size_t bytes{10_MiB};
+    auto test_data{rng.randbytes<std::byte>(bytes)};
 
-    bench.batch(data.size()).unit("byte").run([&] {
-        util::Xor(data, key);
+    std::vector key_bytes{rng.randbytes<std::byte>(8)};
+    uint64_t key;
+    std::memcpy(&key, key_bytes.data(), 8);
+
+    size_t offset{0};
+    bench.batch(bytes / 1_MiB).unit("MiB").run([&] {
+        util::Xor(test_data, key_bytes, offset++);
+        ankerl::nanobench::doNotOptimizeAway(test_data);
     });
 }
 
-BENCHMARK(Xor, benchmark::PriorityLevel::HIGH);
+BENCHMARK(XorObfuscationBench, benchmark::PriorityLevel::HIGH);

From 8e6e0acd36eb60685b2340e00861cb13fcef1111 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C5=91rinc?= <pap.lorinc@gmail.com>
Date: Sat, 5 Apr 2025 18:25:20 +0200
Subject: [PATCH 03/24] refactor: prepare dbwrapper for obfuscation key change

Since `CDBWrapper::Read` will still work with vectors, we won't be able to use the obfuscation key field to read into it directly.
This commit cleans up this part of the code, obviating that writing `obfuscate_key` is needed since following methods will actually use it implicitly, simplifying the `if (!key_exists` condition to extract the negation into the name of the boolean and inline the single-use `CreateObfuscateKey` which will just complicate the transition.
---
 src/dbwrapper.cpp | 49 +++++++++++++++++------------------------------
 src/dbwrapper.h   |  2 --
 2 files changed, 18 insertions(+), 33 deletions(-)

diff --git a/src/dbwrapper.cpp b/src/dbwrapper.cpp
index 1c35e11863b..38b75b3b340 100644
--- a/src/dbwrapper.cpp
+++ b/src/dbwrapper.cpp
@@ -213,7 +213,10 @@ struct LevelDBContext {
 };
 
 CDBWrapper::CDBWrapper(const DBParams& params)
-    : m_db_context{std::make_unique<LevelDBContext>()}, m_name{fs::PathToString(params.path.stem())}, m_path{params.path}, m_is_memory{params.memory_only}
+    : m_db_context{std::make_unique<LevelDBContext>()},
+      m_name{fs::PathToString(params.path.stem())},
+      m_path{params.path},
+      m_is_memory{params.memory_only}
 {
     DBContext().penv = nullptr;
     DBContext().readoptions.verify_checksums = true;
@@ -248,24 +251,23 @@ CDBWrapper::CDBWrapper(const DBParams& params)
         LogPrintf("Finished database compaction of %s\n", fs::PathToString(params.path));
     }
 
-    // The base-case obfuscation key, which is a noop.
-    obfuscate_key = std::vector<unsigned char>(OBFUSCATE_KEY_NUM_BYTES, '\000');
+    {
+        obfuscate_key = std::vector<unsigned char>(OBFUSCATE_KEY_NUM_BYTES, '\000'); // Needed for unobfuscated Read
+        const bool key_missing{!Read(OBFUSCATE_KEY_KEY, obfuscate_key)};
+        if (key_missing && params.obfuscate && IsEmpty()) {
+            // Initialize non-degenerate obfuscation if it won't upset existing, non-obfuscated data.
+            std::vector<uint8_t> new_key(OBFUSCATE_KEY_NUM_BYTES);
+            GetRandBytes(new_key);
 
-    bool key_exists = Read(OBFUSCATE_KEY_KEY, obfuscate_key);
+            // Write `new_key` so we don't obfuscate the key with itself
+            Write(OBFUSCATE_KEY_KEY, new_key);
+            obfuscate_key = new_key;
 
-    if (!key_exists && params.obfuscate && IsEmpty()) {
-        // Initialize non-degenerate obfuscation if it won't upset
-        // existing, non-obfuscated data.
-        std::vector<unsigned char> new_key = CreateObfuscateKey();
+            LogPrintf("Wrote new obfuscate key for %s: %s\n", fs::PathToString(params.path), HexStr(obfuscate_key));
+        }
 
-        // Write `new_key` so we don't obfuscate the key with itself
-        Write(OBFUSCATE_KEY_KEY, new_key);
-        obfuscate_key = new_key;
-
-        LogPrintf("Wrote new obfuscate key for %s: %s\n", fs::PathToString(params.path), HexStr(obfuscate_key));
+        LogPrintf("Using obfuscation key for %s: %s\n", fs::PathToString(params.path), HexStr(obfuscate_key));
     }
-
-    LogPrintf("Using obfuscation key for %s: %s\n", fs::PathToString(params.path), HexStr(obfuscate_key));
 }
 
 CDBWrapper::~CDBWrapper()
@@ -318,17 +320,6 @@ const std::string CDBWrapper::OBFUSCATE_KEY_KEY("\000obfuscate_key", 14);
 
 const unsigned int CDBWrapper::OBFUSCATE_KEY_NUM_BYTES = 8;
 
-/**
- * Returns a string (consisting of 8 random bytes) suitable for use as an
- * obfuscating XOR key.
- */
-std::vector<unsigned char> CDBWrapper::CreateObfuscateKey() const
-{
-    std::vector<uint8_t> ret(OBFUSCATE_KEY_NUM_BYTES);
-    GetRandBytes(ret);
-    return ret;
-}
-
 std::optional<std::string> CDBWrapper::ReadImpl(std::span<const std::byte> key) const
 {
     leveldb::Slice slKey(CharCast(key.data()), key.size());
@@ -411,10 +402,6 @@ void CDBIterator::SeekToFirst() { m_impl_iter->iter->SeekToFirst(); }
 void CDBIterator::Next() { m_impl_iter->iter->Next(); }
 
 namespace dbwrapper_private {
-
-const std::vector<unsigned char>& GetObfuscateKey(const CDBWrapper &w)
-{
-    return w.obfuscate_key;
-}
+const std::vector<unsigned char>& GetObfuscateKey(const CDBWrapper& w) { return w.obfuscate_key; }
 
 } // namespace dbwrapper_private
diff --git a/src/dbwrapper.h b/src/dbwrapper.h
index 789b5be8fc7..33e009833cb 100644
--- a/src/dbwrapper.h
+++ b/src/dbwrapper.h
@@ -196,8 +196,6 @@ private:
     //! the length of the obfuscate key in number of bytes
     static const unsigned int OBFUSCATE_KEY_NUM_BYTES;
 
-    std::vector<unsigned char> CreateObfuscateKey() const;
-
     //! path to filesystem storage
     const fs::path m_path;
 

From e50732d25fcdeb73747f27a5341116f454badc9b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C5=91rinc?= <pap.lorinc@gmail.com>
Date: Sat, 5 Apr 2025 19:01:09 +0200
Subject: [PATCH 04/24] refactor: prepare mempool_persist for obfuscation key
 change

---
 src/node/mempool_persist.cpp | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/node/mempool_persist.cpp b/src/node/mempool_persist.cpp
index ff7de8c64a2..320a8d60967 100644
--- a/src/node/mempool_persist.cpp
+++ b/src/node/mempool_persist.cpp
@@ -58,15 +58,18 @@ bool LoadMempool(CTxMemPool& pool, const fs::path& load_path, Chainstate& active
     try {
         uint64_t version;
         file >> version;
-        std::vector<std::byte> xor_key;
+
         if (version == MEMPOOL_DUMP_VERSION_NO_XOR_KEY) {
-            // Leave XOR-key empty
+            std::vector<std::byte> xor_key;
+            file.SetXor(xor_key);
         } else if (version == MEMPOOL_DUMP_VERSION) {
+            std::vector<std::byte> xor_key;
             file >> xor_key;
+            file.SetXor(xor_key);
         } else {
             return false;
         }
-        file.SetXor(xor_key);
+
         uint64_t total_txns_to_load;
         file >> total_txns_to_load;
         uint64_t txns_tried = 0;
@@ -177,12 +180,15 @@ bool DumpMempool(const CTxMemPool& pool, const fs::path& dump_path, FopenFn mock
         const uint64_t version{pool.m_opts.persist_v1_dat ? MEMPOOL_DUMP_VERSION_NO_XOR_KEY : MEMPOOL_DUMP_VERSION};
         file << version;
 
-        std::vector<std::byte> xor_key(8);
         if (!pool.m_opts.persist_v1_dat) {
+            std::vector<std::byte> xor_key(8);
             FastRandomContext{}.fillrand(xor_key);
             file << xor_key;
+            file.SetXor(xor_key);
+        } else {
+            std::vector<std::byte> xor_key(8);
+            file.SetXor(xor_key);
         }
-        file.SetXor(xor_key);
 
         uint64_t mempool_transactions_to_write(vinfo.size());
         file << mempool_transactions_to_write;

From c5e866b190d6d77302c7bf5f89f8ecace53fc604 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C5=91rinc?= <pap.lorinc@gmail.com>
Date: Sat, 21 Dec 2024 16:15:04 +0100
Subject: [PATCH 05/24] optimization: Migrate fixed-size obfuscation end-to-end
 from `std::vector<std::byte>` to `uint64_t`

Since `util::Xor` accepts `uint64_t` values, we're eliminating any repeated vector-to-uint64_t conversions going back to the loading/saving of these values (we're still serializing them as vectors, but converting as soon as possible to `uint64_t`). This is the reason the tests still generate vector values and convert to `uint64_t` later instead of generating it directly.

We're also short-circuit `Xor` calls with 0 key values early to avoid unnecessary calculations (e.g. `MakeWritableByteSpan`) - even assuming that XOR is never called for 0.

>  cmake -B build -DBUILD_BENCH=ON -DCMAKE_BUILD_TYPE=Release \
&& cmake --build build -j$(nproc) \
&& build/bin/bench_bitcoin -filter='XorObfuscationBench' -min-time=10000

C++ compiler .......................... AppleClang 17.0.0.17000013

|              ns/MiB |               MiB/s |    err% |     total | benchmark
|--------------------:|--------------------:|--------:|----------:|:----------
|           14,730.40 |           67,886.80 |    0.1% |     11.01 | `XorObfuscationBench`

C++ compiler .......................... GNU 13.3.0

|              ns/MiB |               MiB/s |    err% |         ins/MiB |         cyc/MiB |    IPC |        bra/MiB |   miss% |     total | benchmark
|--------------------:|--------------------:|--------:|----------------:|----------------:|-------:|---------------:|--------:|----------:|:----------
|           51,187.17 |           19,536.15 |    0.0% |      327,683.95 |      183,747.58 |  1.783 |      65,536.55 |    0.0% |     11.00 | `XorObfuscationBench`

----

A few other benchmarks that seem to have improved as well (tested with Clang only):
Before:

|               ns/op |                op/s |    err% |     total | benchmark
|--------------------:|--------------------:|--------:|----------:|:----------
|        2,202,618.49 |              454.01 |    0.2% |     11.01 | `ReadBlockBench`
|          734,444.92 |            1,361.57 |    0.3% |     10.66 | `ReadRawBlockBench`

After:

|               ns/op |                op/s |    err% |     total | benchmark
|--------------------:|--------------------:|--------:|----------:|:----------
|        1,912,308.06 |              522.93 |    0.4% |     10.98 | `ReadBlockBench`
|           49,092.93 |           20,369.53 |    0.2% |     10.99 | `ReadRawBlockBench`
---
 src/bench/xor.cpp               |  7 ++-
 src/dbwrapper.cpp               | 24 ++++-----
 src/dbwrapper.h                 | 14 ++---
 src/node/blockstorage.cpp       | 10 ++--
 src/node/blockstorage.h         |  2 +-
 src/node/mempool_persist.cpp    | 19 +++----
 src/obfuscation.h               | 85 ++++++++++++++++++++++++++++++
 src/streams.cpp                 | 19 ++++---
 src/streams.h                   | 40 +++-----------
 src/test/dbwrapper_tests.cpp    | 18 ++-----
 src/test/fuzz/autofile.cpp      |  2 +-
 src/test/fuzz/buffered_file.cpp |  2 +-
 src/test/streams_tests.cpp      | 92 ++++++++++++++++++++++++---------
 13 files changed, 209 insertions(+), 125 deletions(-)
 create mode 100644 src/obfuscation.h

diff --git a/src/bench/xor.cpp b/src/bench/xor.cpp
index c76103e8851..b04f08bba5e 100644
--- a/src/bench/xor.cpp
+++ b/src/bench/xor.cpp
@@ -19,13 +19,12 @@ static void XorObfuscationBench(benchmark::Bench& bench)
     constexpr size_t bytes{10_MiB};
     auto test_data{rng.randbytes<std::byte>(bytes)};
 
-    std::vector key_bytes{rng.randbytes<std::byte>(8)};
-    uint64_t key;
-    std::memcpy(&key, key_bytes.data(), 8);
+    const Obfuscation obfuscation{rng.rand64()};
+    assert(obfuscation);
 
     size_t offset{0};
     bench.batch(bytes / 1_MiB).unit("MiB").run([&] {
-        util::Xor(test_data, key_bytes, offset++);
+        obfuscation(test_data, offset++);
         ankerl::nanobench::doNotOptimizeAway(test_data);
     });
 }
diff --git a/src/dbwrapper.cpp b/src/dbwrapper.cpp
index 38b75b3b340..00317138d5d 100644
--- a/src/dbwrapper.cpp
+++ b/src/dbwrapper.cpp
@@ -173,7 +173,7 @@ void CDBBatch::Clear()
 void CDBBatch::WriteImpl(std::span<const std::byte> key, DataStream& ssValue)
 {
     leveldb::Slice slKey(CharCast(key.data()), key.size());
-    ssValue.Xor(dbwrapper_private::GetObfuscateKey(parent));
+    dbwrapper_private::GetObfuscation(parent)(ssValue);
     leveldb::Slice slValue(CharCast(ssValue.data()), ssValue.size());
     m_impl_batch->batch.Put(slKey, slValue);
 }
@@ -215,6 +215,7 @@ struct LevelDBContext {
 CDBWrapper::CDBWrapper(const DBParams& params)
     : m_db_context{std::make_unique<LevelDBContext>()},
       m_name{fs::PathToString(params.path.stem())},
+      m_obfuscation{0},
       m_path{params.path},
       m_is_memory{params.memory_only}
 {
@@ -252,21 +253,22 @@ CDBWrapper::CDBWrapper(const DBParams& params)
     }
 
     {
-        obfuscate_key = std::vector<unsigned char>(OBFUSCATE_KEY_NUM_BYTES, '\000'); // Needed for unobfuscated Read
-        const bool key_missing{!Read(OBFUSCATE_KEY_KEY, obfuscate_key)};
+        m_obfuscation = 0; // Needed for unobfuscated Read
+        std::vector<unsigned char> obfuscate_key_vector(Obfuscation::SIZE_BYTES, '\000');
+        const bool key_missing{!Read(OBFUSCATE_KEY_KEY, obfuscate_key_vector)};
         if (key_missing && params.obfuscate && IsEmpty()) {
             // Initialize non-degenerate obfuscation if it won't upset existing, non-obfuscated data.
-            std::vector<uint8_t> new_key(OBFUSCATE_KEY_NUM_BYTES);
+            std::vector<uint8_t> new_key(Obfuscation::SIZE_BYTES);
             GetRandBytes(new_key);
 
             // Write `new_key` so we don't obfuscate the key with itself
             Write(OBFUSCATE_KEY_KEY, new_key);
-            obfuscate_key = new_key;
+            obfuscate_key_vector = new_key;
 
-            LogPrintf("Wrote new obfuscate key for %s: %s\n", fs::PathToString(params.path), HexStr(obfuscate_key));
+            LogPrintf("Wrote new obfuscate key for %s: %s\n", fs::PathToString(params.path), HexStr(obfuscate_key_vector));
         }
-
-        LogPrintf("Using obfuscation key for %s: %s\n", fs::PathToString(params.path), HexStr(obfuscate_key));
+        LogPrintf("Using obfuscation key for %s: %s\n", fs::PathToString(params.path), HexStr(obfuscate_key_vector));
+        m_obfuscation = obfuscate_key_vector;
     }
 }
 
@@ -317,9 +319,6 @@ size_t CDBWrapper::DynamicMemoryUsage() const
 // We must use a string constructor which specifies length so that we copy
 // past the null-terminator.
 const std::string CDBWrapper::OBFUSCATE_KEY_KEY("\000obfuscate_key", 14);
-
-const unsigned int CDBWrapper::OBFUSCATE_KEY_NUM_BYTES = 8;
-
 std::optional<std::string> CDBWrapper::ReadImpl(std::span<const std::byte> key) const
 {
     leveldb::Slice slKey(CharCast(key.data()), key.size());
@@ -402,6 +401,5 @@ void CDBIterator::SeekToFirst() { m_impl_iter->iter->SeekToFirst(); }
 void CDBIterator::Next() { m_impl_iter->iter->Next(); }
 
 namespace dbwrapper_private {
-const std::vector<unsigned char>& GetObfuscateKey(const CDBWrapper& w) { return w.obfuscate_key; }
-
+Obfuscation GetObfuscation(const CDBWrapper& w) { return w.m_obfuscation; }
 } // namespace dbwrapper_private
diff --git a/src/dbwrapper.h b/src/dbwrapper.h
index 33e009833cb..5f23f1dc433 100644
--- a/src/dbwrapper.h
+++ b/src/dbwrapper.h
@@ -63,8 +63,7 @@ namespace dbwrapper_private {
  * Database obfuscation should be considered an implementation detail of the
  * specific database.
  */
-const std::vector<unsigned char>& GetObfuscateKey(const CDBWrapper &w);
-
+Obfuscation GetObfuscation(const CDBWrapper&);
 }; // namespace dbwrapper_private
 
 bool DestroyDB(const std::string& path_str);
@@ -166,7 +165,7 @@ public:
     template<typename V> bool GetValue(V& value) {
         try {
             DataStream ssValue{GetValueImpl()};
-            ssValue.Xor(dbwrapper_private::GetObfuscateKey(parent));
+            dbwrapper_private::GetObfuscation(parent)(ssValue);
             ssValue >> value;
         } catch (const std::exception&) {
             return false;
@@ -179,7 +178,7 @@ struct LevelDBContext;
 
 class CDBWrapper
 {
-    friend const std::vector<unsigned char>& dbwrapper_private::GetObfuscateKey(const CDBWrapper &w);
+    friend Obfuscation dbwrapper_private::GetObfuscation(const CDBWrapper&);
 private:
     //! holds all leveldb-specific fields of this class
     std::unique_ptr<LevelDBContext> m_db_context;
@@ -188,14 +187,11 @@ private:
     std::string m_name;
 
     //! a key used for optional XOR-obfuscation of the database
-    std::vector<unsigned char> obfuscate_key;
+    Obfuscation m_obfuscation;
 
     //! the key under which the obfuscation key is stored
     static const std::string OBFUSCATE_KEY_KEY;
 
-    //! the length of the obfuscate key in number of bytes
-    static const unsigned int OBFUSCATE_KEY_NUM_BYTES;
-
     //! path to filesystem storage
     const fs::path m_path;
 
@@ -226,7 +222,7 @@ public:
         }
         try {
             DataStream ssValue{MakeByteSpan(*strValue)};
-            ssValue.Xor(obfuscate_key);
+            m_obfuscation(ssValue);
             ssValue >> value;
         } catch (const std::exception&) {
             return false;
diff --git a/src/node/blockstorage.cpp b/src/node/blockstorage.cpp
index 3d73459bf6c..d2c1f332054 100644
--- a/src/node/blockstorage.cpp
+++ b/src/node/blockstorage.cpp
@@ -777,13 +777,13 @@ void BlockManager::UnlinkPrunedFiles(const std::set<int>& setFilesToPrune) const
 
 AutoFile BlockManager::OpenBlockFile(const FlatFilePos& pos, bool fReadOnly) const
 {
-    return AutoFile{m_block_file_seq.Open(pos, fReadOnly), m_xor_key};
+    return AutoFile{m_block_file_seq.Open(pos, fReadOnly), m_obfuscation};
 }
 
 /** Open an undo file (rev?????.dat) */
 AutoFile BlockManager::OpenUndoFile(const FlatFilePos& pos, bool fReadOnly) const
 {
-    return AutoFile{m_undo_file_seq.Open(pos, fReadOnly), m_xor_key};
+    return AutoFile{m_undo_file_seq.Open(pos, fReadOnly), m_obfuscation};
 }
 
 fs::path BlockManager::GetBlockPosFilename(const FlatFilePos& pos) const
@@ -1103,7 +1103,7 @@ FlatFilePos BlockManager::WriteBlock(const CBlock& block, int nHeight)
     return pos;
 }
 
-static auto InitBlocksdirXorKey(const BlockManager::Options& opts)
+static Obfuscation InitBlocksdirXorKey(const BlockManager::Options& opts)
 {
     // Bytes are serialized without length indicator, so this is also the exact
     // size of the XOR-key file.
@@ -1152,12 +1152,12 @@ static auto InitBlocksdirXorKey(const BlockManager::Options& opts)
         };
     }
     LogInfo("Using obfuscation key for blocksdir *.dat files (%s): '%s'\n", fs::PathToString(opts.blocks_dir), HexStr(xor_key));
-    return std::vector<std::byte>{xor_key.begin(), xor_key.end()};
+    return Obfuscation{xor_key};
 }
 
 BlockManager::BlockManager(const util::SignalInterrupt& interrupt, Options opts)
     : m_prune_mode{opts.prune_target > 0},
-      m_xor_key{InitBlocksdirXorKey(opts)},
+      m_obfuscation{InitBlocksdirXorKey(opts)},
       m_opts{std::move(opts)},
       m_block_file_seq{FlatFileSeq{m_opts.blocks_dir, "blk", m_opts.fast_prune ? 0x4000 /* 16kB */ : BLOCKFILE_CHUNK_SIZE}},
       m_undo_file_seq{FlatFileSeq{m_opts.blocks_dir, "rev", UNDOFILE_CHUNK_SIZE}},
diff --git a/src/node/blockstorage.h b/src/node/blockstorage.h
index 324f8e68605..d5ae34d72ed 100644
--- a/src/node/blockstorage.h
+++ b/src/node/blockstorage.h
@@ -235,7 +235,7 @@ private:
 
     const bool m_prune_mode;
 
-    const std::vector<std::byte> m_xor_key;
+    const Obfuscation m_obfuscation;
 
     /** Dirty block index entries. */
     std::set<CBlockIndex*> m_dirty_blockindex;
diff --git a/src/node/mempool_persist.cpp b/src/node/mempool_persist.cpp
index 320a8d60967..5ef5d3c0913 100644
--- a/src/node/mempool_persist.cpp
+++ b/src/node/mempool_persist.cpp
@@ -60,12 +60,11 @@ bool LoadMempool(CTxMemPool& pool, const fs::path& load_path, Chainstate& active
         file >> version;
 
         if (version == MEMPOOL_DUMP_VERSION_NO_XOR_KEY) {
-            std::vector<std::byte> xor_key;
-            file.SetXor(xor_key);
+            file.SetObfuscation(0);
         } else if (version == MEMPOOL_DUMP_VERSION) {
-            std::vector<std::byte> xor_key;
-            file >> xor_key;
-            file.SetXor(xor_key);
+            Obfuscation obfuscation{0};
+            file >> obfuscation;
+            file.SetObfuscation(obfuscation);
         } else {
             return false;
         }
@@ -181,13 +180,11 @@ bool DumpMempool(const CTxMemPool& pool, const fs::path& dump_path, FopenFn mock
         file << version;
 
         if (!pool.m_opts.persist_v1_dat) {
-            std::vector<std::byte> xor_key(8);
-            FastRandomContext{}.fillrand(xor_key);
-            file << xor_key;
-            file.SetXor(xor_key);
+            const Obfuscation obfuscation{FastRandomContext{}.rand64()};
+            file << obfuscation;
+            file.SetObfuscation(obfuscation);
         } else {
-            std::vector<std::byte> xor_key(8);
-            file.SetXor(xor_key);
+            file.SetObfuscation(0);
         }
 
         uint64_t mempool_transactions_to_write(vinfo.size());
diff --git a/src/obfuscation.h b/src/obfuscation.h
new file mode 100644
index 00000000000..70bc844c6d5
--- /dev/null
+++ b/src/obfuscation.h
@@ -0,0 +1,85 @@
+// Copyright (c) 2009-present The Bitcoin Core developers
+// Distributed under the MIT software license, see the accompanying
+// file COPYING or http://www.opensource.org/licenses/mit-license.php.
+
+#ifndef BITCOIN_OBFUSCATION_H
+#define BITCOIN_OBFUSCATION_H
+
+#include <array>
+#include <cassert>
+#include <cstdint>
+#include <random>
+#include <span.h>
+#include <util/check.h>
+#include <cstring>
+#include <climits>
+#include <serialize.h>
+
+class Obfuscation
+{
+public:
+    static constexpr size_t SIZE_BYTES{sizeof(uint64_t)};
+
+private:
+    std::array<uint64_t, SIZE_BYTES> rotations; // Cached key rotations
+    void SetRotations(const uint64_t key)
+    {
+        for (size_t i{0}; i < SIZE_BYTES; ++i) {
+            size_t key_rotation_bits{CHAR_BIT * i};
+            if constexpr (std::endian::native == std::endian::big) key_rotation_bits *= -1;
+            rotations[i] = std::rotr(key, key_rotation_bits);
+        }
+    }
+
+    static uint64_t ToUint64(const std::span<const std::byte, SIZE_BYTES> key_span)
+    {
+        uint64_t key{};
+        std::memcpy(&key, key_span.data(), SIZE_BYTES);
+        return key;
+    }
+
+    static void Xor(std::span<std::byte> write, const uint64_t key, const size_t size)
+    {
+        assert(size <= write.size());
+        uint64_t raw{};
+        std::memcpy(&raw, write.data(), size);
+        raw ^= key;
+        std::memcpy(write.data(), &raw, size);
+    }
+
+public:
+    Obfuscation(const uint64_t key) { SetRotations(key); }
+    Obfuscation(const std::span<const std::byte, SIZE_BYTES> key_span) : Obfuscation(ToUint64(key_span)) {}
+    Obfuscation(const std::vector<uint8_t>& key_vec) : Obfuscation(MakeByteSpan(key_vec).first<SIZE_BYTES>()) {}
+    Obfuscation(const std::vector<std::byte>& key_vec) : Obfuscation(std::span(key_vec).first<SIZE_BYTES>()) {}
+
+    uint64_t Key() const { return rotations[0]; }
+    operator bool() const { return Key() != 0; }
+    void operator()(std::span<std::byte> write, const size_t key_offset_bytes = 0) const
+    {
+        if (!*this) return;
+        const uint64_t rot_key{rotations[key_offset_bytes % SIZE_BYTES]}; // Continue obfuscation from where we left off
+        for (; write.size() >= SIZE_BYTES; write = write.subspan(SIZE_BYTES)) { // Process multiple bytes at a time
+            Xor(write, rot_key, SIZE_BYTES);
+        }
+        Xor(write, rot_key, write.size());
+    }
+
+    template <typename Stream>
+    void Serialize(Stream& s) const
+    {
+        std::vector<std::byte> bytes(SIZE_BYTES);
+        std::memcpy(bytes.data(), &rotations[0], SIZE_BYTES);
+        s << bytes;
+    }
+
+    template <typename Stream>
+    void Unserialize(Stream& s)
+    {
+        std::vector<std::byte> bytes(SIZE_BYTES);
+        s >> bytes;
+        SetRotations(ToUint64(MakeByteSpan(bytes).first<SIZE_BYTES>()));
+    }
+};
+
+#endif // BITCOIN_OBFUSCATION_H
diff --git a/src/streams.cpp b/src/streams.cpp
index 19c2b474452..2bd142aba8c 100644
--- a/src/streams.cpp
+++ b/src/streams.cpp
@@ -9,8 +9,7 @@
 
 #include <array>
 
-AutoFile::AutoFile(std::FILE* file, std::vector<std::byte> data_xor)
-    : m_file{file}, m_xor{std::move(data_xor)}
+AutoFile::AutoFile(std::FILE* file, const Obfuscation& obfuscation) : m_file{file}, m_obfuscation{obfuscation}
 {
     if (!IsNull()) {
         auto pos{std::ftell(m_file)};
@@ -21,12 +20,12 @@ AutoFile::AutoFile(std::FILE* file, std::vector<std::byte> data_xor)
 std::size_t AutoFile::detail_fread(std::span<std::byte> dst)
 {
     if (!m_file) throw std::ios_base::failure("AutoFile::read: file handle is nullptr");
-    size_t ret = std::fread(dst.data(), 1, dst.size(), m_file);
-    if (!m_xor.empty()) {
-        if (!m_position.has_value()) throw std::ios_base::failure("AutoFile::read: position unknown");
-        util::Xor(dst.subspan(0, ret), m_xor, *m_position);
+    const size_t ret = std::fread(dst.data(), 1, dst.size(), m_file);
+    if (m_obfuscation) {
+        if (!m_position) throw std::ios_base::failure("AutoFile::read: position unknown");
+        m_obfuscation(dst, *m_position);
     }
-    if (m_position.has_value()) *m_position += ret;
+    if (m_position) *m_position += ret;
     return ret;
 }
 
@@ -81,7 +80,7 @@ void AutoFile::ignore(size_t nSize)
 void AutoFile::write(std::span<const std::byte> src)
 {
     if (!m_file) throw std::ios_base::failure("AutoFile::write: file handle is nullptr");
-    if (m_xor.empty()) {
+    if (!m_obfuscation) {
         if (std::fwrite(src.data(), 1, src.size(), m_file) != src.size()) {
             throw std::ios_base::failure("AutoFile::write: write failed");
         }
@@ -100,9 +99,9 @@ void AutoFile::write(std::span<const std::byte> src)
 void AutoFile::write_buffer(std::span<std::byte> src)
 {
     if (!m_file) throw std::ios_base::failure("AutoFile::write_buffer: file handle is nullptr");
-    if (m_xor.size()) {
+    if (m_obfuscation) {
         if (!m_position) throw std::ios_base::failure("AutoFile::write_buffer: obfuscation position unknown");
-        util::Xor(src, m_xor, *m_position); // obfuscate in-place
+        m_obfuscation(src, *m_position); // obfuscate in-place
     }
     if (std::fwrite(src.data(), 1, src.size(), m_file) != src.size()) {
         throw std::ios_base::failure("AutoFile::write_buffer: write failed");
diff --git a/src/streams.h b/src/streams.h
index 1ebcff3671f..0a1e7002c5f 100644
--- a/src/streams.h
+++ b/src/streams.h
@@ -6,6 +6,7 @@
 #ifndef BITCOIN_STREAMS_H
 #define BITCOIN_STREAMS_H
 
+#include <obfuscation.h>
 #include <serialize.h>
 #include <span.h>
 #include <support/allocators/zeroafterfree.h>
@@ -21,30 +22,8 @@
 #include <stdint.h>
 #include <string.h>
 #include <string>
-#include <utility>
 #include <vector>
 
-namespace util {
-inline void Xor(std::span<std::byte> write, std::span<const std::byte> key, size_t key_offset = 0)
-{
-    if (key.size() == 0) {
-        return;
-    }
-    key_offset %= key.size();
-
-    for (size_t i = 0, j = key_offset; i != write.size(); i++) {
-        write[i] ^= key[j++];
-
-        // This potentially acts on very many bytes of data, so it's
-        // important that we calculate `j`, i.e. the `key` index in this
-        // way instead of doing a %, which would effectively be a division
-        // for each byte Xor'd -- much slower than need be.
-        if (j == key.size())
-            j = 0;
-    }
-}
-} // namespace util
-
 /* Minimal stream for overwriting and/or appending to an existing byte vector
  *
  * The referenced vector will grow as necessary
@@ -261,21 +240,16 @@ public:
         return (*this);
     }
 
-    template<typename T>
+    template <typename T>
     DataStream& operator>>(T&& obj)
     {
         ::Unserialize(*this, obj);
         return (*this);
     }
 
-    /**
-     * XOR the contents of this stream with a certain key.
-     *
-     * @param[in] key    The key used to XOR the data in this stream.
-     */
-    void Xor(const std::vector<unsigned char>& key)
+    void Obfuscate(const Obfuscation& obfuscation)
     {
-        util::Xor(MakeWritableByteSpan(*this), MakeByteSpan(key));
+        if (obfuscation) obfuscation(MakeWritableByteSpan(*this));
     }
 
     /** Compute total memory usage of this object (own memory + any dynamic memory). */
@@ -392,11 +366,11 @@ class AutoFile
 {
 protected:
     std::FILE* m_file;
-    std::vector<std::byte> m_xor;
+    Obfuscation m_obfuscation;
     std::optional<int64_t> m_position;
 
 public:
-    explicit AutoFile(std::FILE* file, std::vector<std::byte> data_xor={});
+    explicit AutoFile(std::FILE* file, const Obfuscation& obfuscation = 0);
 
     ~AutoFile() { fclose(); }
 
@@ -428,7 +402,7 @@ public:
     bool IsNull() const { return m_file == nullptr; }
 
     /** Continue with a different XOR key */
-    void SetXor(std::vector<std::byte> data_xor) { m_xor = data_xor; }
+    void SetObfuscation(const Obfuscation& obfuscation) { m_obfuscation = obfuscation; }
 
     /** Implementation detail, only used internally. */
     std::size_t detail_fread(std::span<std::byte> dst);
diff --git a/src/test/dbwrapper_tests.cpp b/src/test/dbwrapper_tests.cpp
index 3a86036327c..7d9135baf04 100644
--- a/src/test/dbwrapper_tests.cpp
+++ b/src/test/dbwrapper_tests.cpp
@@ -14,16 +14,6 @@
 
 using util::ToString;
 
-// Test if a string consists entirely of null characters
-static bool is_null_key(const std::vector<unsigned char>& key) {
-    bool isnull = true;
-
-    for (unsigned int i = 0; i < key.size(); i++)
-        isnull &= (key[i] == '\x00');
-
-    return isnull;
-}
-
 BOOST_FIXTURE_TEST_SUITE(dbwrapper_tests, BasicTestingSetup)
 
 BOOST_AUTO_TEST_CASE(dbwrapper)
@@ -37,7 +27,7 @@ BOOST_AUTO_TEST_CASE(dbwrapper)
         uint256 res;
 
         // Ensure that we're doing real obfuscation when obfuscate=true
-        BOOST_CHECK(obfuscate != is_null_key(dbwrapper_private::GetObfuscateKey(dbw)));
+        BOOST_CHECK(obfuscate == dbwrapper_private::GetObfuscation(dbw));
 
         BOOST_CHECK(dbw.Write(key, in));
         BOOST_CHECK(dbw.Read(key, res));
@@ -57,7 +47,7 @@ BOOST_AUTO_TEST_CASE(dbwrapper_basic_data)
         bool res_bool;
 
         // Ensure that we're doing real obfuscation when obfuscate=true
-        BOOST_CHECK(obfuscate != is_null_key(dbwrapper_private::GetObfuscateKey(dbw)));
+        BOOST_CHECK(obfuscate == dbwrapper_private::GetObfuscation(dbw));
 
         //Simulate block raw data - "b + block hash"
         std::string key_block = "b" + m_rng.rand256().ToString();
@@ -232,7 +222,7 @@ BOOST_AUTO_TEST_CASE(existing_data_no_obfuscate)
     BOOST_CHECK_EQUAL(res2.ToString(), in.ToString());
 
     BOOST_CHECK(!odbw.IsEmpty()); // There should be existing data
-    BOOST_CHECK(is_null_key(dbwrapper_private::GetObfuscateKey(odbw))); // The key should be an empty string
+    BOOST_CHECK(!dbwrapper_private::GetObfuscation(odbw));
 
     uint256 in2 = m_rng.rand256();
     uint256 res3;
@@ -269,7 +259,7 @@ BOOST_AUTO_TEST_CASE(existing_data_reindex)
     // Check that the key/val we wrote with unobfuscated wrapper doesn't exist
     uint256 res2;
     BOOST_CHECK(!odbw.Read(key, res2));
-    BOOST_CHECK(!is_null_key(dbwrapper_private::GetObfuscateKey(odbw)));
+    BOOST_CHECK(dbwrapper_private::GetObfuscation(odbw));
 
     uint256 in2 = m_rng.rand256();
     uint256 res3;
diff --git a/src/test/fuzz/autofile.cpp b/src/test/fuzz/autofile.cpp
index 81761c7bf96..c76899cda82 100644
--- a/src/test/fuzz/autofile.cpp
+++ b/src/test/fuzz/autofile.cpp
@@ -20,7 +20,7 @@ FUZZ_TARGET(autofile)
     FuzzedFileProvider fuzzed_file_provider{fuzzed_data_provider};
     AutoFile auto_file{
         fuzzed_file_provider.open(),
-        ConsumeRandomLengthByteVector<std::byte>(fuzzed_data_provider),
+        fuzzed_data_provider.ConsumeIntegral<uint64_t>()
     };
     LIMITED_WHILE(fuzzed_data_provider.ConsumeBool(), 100)
     {
diff --git a/src/test/fuzz/buffered_file.cpp b/src/test/fuzz/buffered_file.cpp
index a6a042a25cb..d8c09f88ed8 100644
--- a/src/test/fuzz/buffered_file.cpp
+++ b/src/test/fuzz/buffered_file.cpp
@@ -22,7 +22,7 @@ FUZZ_TARGET(buffered_file)
     std::optional<BufferedFile> opt_buffered_file;
     AutoFile fuzzed_file{
         fuzzed_file_provider.open(),
-        ConsumeRandomLengthByteVector<std::byte>(fuzzed_data_provider),
+        fuzzed_data_provider.ConsumeIntegral<uint64_t>()
     };
     try {
         auto n_buf_size = fuzzed_data_provider.ConsumeIntegralInRange<uint64_t>(0, 4096);
diff --git a/src/test/streams_tests.cpp b/src/test/streams_tests.cpp
index c29f34fab58..ab64825d981 100644
--- a/src/test/streams_tests.cpp
+++ b/src/test/streams_tests.cpp
@@ -20,10 +20,10 @@ BOOST_FIXTURE_TEST_SUITE(streams_tests, BasicTestingSetup)
 // Test that obfuscation can be properly reversed even with random chunk sizes.
 BOOST_AUTO_TEST_CASE(xor_roundtrip_random_chunks)
 {
-    auto apply_random_xor_chunks{[&](std::span<std::byte> write, const std::span<std::byte> key) {
+    auto apply_random_xor_chunks{[&](std::span<std::byte> write, const Obfuscation& obfuscation) {
         for (size_t offset{0}; offset < write.size();) {
             const size_t chunk_size{1 + m_rng.randrange(write.size() - offset)};
-            util::Xor(write.subspan(offset, chunk_size), key, offset);
+            obfuscation(write.subspan(offset, chunk_size), offset);
             offset += chunk_size;
         }
     }};
@@ -33,16 +33,15 @@ BOOST_AUTO_TEST_CASE(xor_roundtrip_random_chunks)
         const std::vector original{m_rng.randbytes<std::byte>(write_size)};
         std::vector roundtrip{original};
 
-        std::vector key_bytes{m_rng.randbytes<std::byte>(sizeof(uint64_t))};
-        uint64_t xor_key;
-        std::memcpy(&xor_key, key_bytes.data(), sizeof(xor_key));
-        apply_random_xor_chunks(roundtrip, key_bytes);
+        const auto key_bytes{m_rng.randbytes<std::byte>(Obfuscation::SIZE_BYTES)};
+        const Obfuscation obfuscation{key_bytes};
+        apply_random_xor_chunks(roundtrip, obfuscation);
 
         // Verify intermediate state is different from original (unless key is zero)
-        const bool all_zero = (xor_key == 0) || (HexStr(key_bytes).find_first_not_of('0') >= write_size * 2);
+        const bool all_zero = !obfuscation || (HexStr(key_bytes).find_first_not_of('0') >= write_size * 2);
         BOOST_CHECK_EQUAL(original != roundtrip, !all_zero);
 
-        apply_random_xor_chunks(roundtrip, key_bytes);
+        apply_random_xor_chunks(roundtrip, obfuscation);
         BOOST_CHECK(original == roundtrip);
     }
 }
@@ -61,20 +60,69 @@ BOOST_AUTO_TEST_CASE(xor_bytes_reference)
         const size_t write_size{1 + m_rng.randrange(100U)};
         const size_t key_offset{m_rng.randrange(3 * 8U)}; // Should wrap around
 
-        std::vector key_bytes{m_rng.randbytes<std::byte>(sizeof(uint64_t))};
-        uint64_t xor_key;
-        std::memcpy(&xor_key, key_bytes.data(), sizeof(xor_key));
-
+        const auto key_bytes{m_rng.randbytes<std::byte>(Obfuscation::SIZE_BYTES)};
+        const Obfuscation obfuscation{key_bytes};
         std::vector expected{m_rng.randbytes<std::byte>(write_size)};
         std::vector actual{expected};
 
         expected_xor(expected, key_bytes, key_offset);
-        util::Xor(actual, key_bytes, key_offset);
+        obfuscation(actual, key_offset);
 
         BOOST_CHECK_EQUAL_COLLECTIONS(expected.begin(), expected.end(), actual.begin(), actual.end());
     }
 }
 
+
+BOOST_AUTO_TEST_CASE(obfuscation_constructors)
+{
+    constexpr uint64_t test_key = 0x0123456789ABCDEF;
+
+    // Direct uint64_t constructor
+    const Obfuscation obf1{test_key};
+    BOOST_CHECK_EQUAL(obf1.Key(), test_key);
+
+    // std::span constructor
+    std::array<std::byte, Obfuscation::SIZE_BYTES> key_bytes{};
+    std::memcpy(key_bytes.data(), &test_key, Obfuscation::SIZE_BYTES);
+    const Obfuscation obf2{std::span{key_bytes}};
+    BOOST_CHECK_EQUAL(obf2.Key(), test_key);
+
+    // std::vector<uint8_t> constructor
+    std::vector<uint8_t> uint8_key(Obfuscation::SIZE_BYTES);
+    std::memcpy(uint8_key.data(), &test_key, uint8_key.size());
+    const Obfuscation obf4{uint8_key};
+    BOOST_CHECK_EQUAL(obf4.Key(), test_key);
+
+    // std::vector<std::byte> constructor
+    std::vector<std::byte> byte_vector_key(Obfuscation::SIZE_BYTES);
+    std::memcpy(byte_vector_key.data(), &test_key, byte_vector_key.size());
+    const Obfuscation obf5{byte_vector_key};
+    BOOST_CHECK_EQUAL(obf5.Key(), test_key);
+}
+
+BOOST_AUTO_TEST_CASE(obfuscation_serialize)
+{
+    const Obfuscation original{0xDEADBEEF};
+
+    // Serialize
+    DataStream ds;
+    ds << original;
+
+    BOOST_CHECK_EQUAL(ds.size(), 1 + Obfuscation::SIZE_BYTES); // serialized as a vector
+
+    // Deserialize
+    Obfuscation recovered{0};
+    ds >> recovered;
+
+    BOOST_CHECK_EQUAL(recovered.Key(), original.Key());
+}
+
+BOOST_AUTO_TEST_CASE(obfuscation_empty)
+{
+    const Obfuscation null_obf{0};
+    BOOST_CHECK(!null_obf);
+}
+
 BOOST_AUTO_TEST_CASE(xor_file)
 {
     fs::path xor_path{m_args.GetDataDirBase() / "test_xor.bin"};
@@ -99,7 +147,7 @@ BOOST_AUTO_TEST_CASE(xor_file)
 #else
         const char* mode = "wbx";
 #endif
-        AutoFile xor_file{raw_file(mode), key_bytes};
+        AutoFile xor_file{raw_file(mode), xor_key};
         xor_file << test1 << test2;
     }
     {
@@ -112,7 +160,7 @@ BOOST_AUTO_TEST_CASE(xor_file)
         BOOST_CHECK_EXCEPTION(non_xor_file.ignore(1), std::ios_base::failure, HasReason{"AutoFile::ignore: end of file"});
     }
     {
-        AutoFile xor_file{raw_file("rb"), key_bytes};
+        AutoFile xor_file{raw_file("rb"), xor_key};
         std::vector<std::byte> read1, read2;
         xor_file >> read1 >> read2;
         BOOST_CHECK_EQUAL(HexStr(read1), HexStr(test1));
@@ -121,7 +169,7 @@ BOOST_AUTO_TEST_CASE(xor_file)
         BOOST_CHECK_EXCEPTION(xor_file >> std::byte{}, std::ios_base::failure, HasReason{"AutoFile::read: end of file"});
     }
     {
-        AutoFile xor_file{raw_file("rb"), key_bytes};
+        AutoFile xor_file{raw_file("rb"), xor_key};
         std::vector<std::byte> read2;
         // Check that ignore works
         xor_file.ignore(4);
@@ -289,7 +337,7 @@ BOOST_AUTO_TEST_CASE(streams_serializedata_xor)
     // Degenerate case
     {
         DataStream ds{in};
-        ds.Xor("0000000000000000"_hex_v_u8);
+        Obfuscation{0}(ds);
         BOOST_CHECK_EQUAL(""s, ds.str());
     }
 
@@ -297,12 +345,10 @@ BOOST_AUTO_TEST_CASE(streams_serializedata_xor)
     in.push_back(std::byte{0xf0});
 
     {
-        const auto key_bytes{"ffffffffffffffff"_hex_v};
-        uint64_t xor_key;
-        std::memcpy(&xor_key, key_bytes.data(), sizeof(xor_key));
+        const Obfuscation obfuscation{"ffffffffffffffff"_hex_v};
 
         DataStream ds{in};
-        ds.Xor("ffffffffffffffff"_hex_v_u8);
+        obfuscation(ds);
         BOOST_CHECK_EQUAL("\xf0\x0f"s, ds.str());
     }
 
@@ -311,10 +357,10 @@ BOOST_AUTO_TEST_CASE(streams_serializedata_xor)
     in.push_back(std::byte{0x0f});
 
     {
-        const auto key_bytes{"ff0fff0fff0fff0f"_hex_v};
+        const Obfuscation obfuscation{"ff0fff0fff0fff0f"_hex_v};
 
         DataStream ds{in};
-        ds.Xor("ff0fff0fff0fff0f"_hex_v_u8);
+        obfuscation(ds);
         BOOST_CHECK_EQUAL("\x0f\x00"s, ds.str());
     }
 }

From c497ca6e91fd692756384e1f11e2a856793b98f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C5=91rinc?= <pap.lorinc@gmail.com>
Date: Fri, 21 Jun 2024 18:00:14 +0200
Subject: [PATCH 06/24] bench: Add COutPoint and SaltedOutpointHasher
 benchmarks

This commit introduces new benchmarks to measure the performance of various operations using
SaltedOutpointHasher, including hash computation, set operations, and set creation.

These benchmarks are intended to provide insights about coin caching performance (e.g. during IBD).

> cmake -B build -DBUILD_BENCH=ON -DCMAKE_BUILD_TYPE=Release && cmake --build build -j$(nproc) && build/src/bench/bench_bitcoin -filter='SaltedOutpointHasherBench.*' -min-time=10000

> C++ compiler .......................... AppleClang 16.0.0.16000026

|               ns/op |                op/s |    err% |     total | benchmark
|--------------------:|--------------------:|--------:|----------:|:----------
|               58.60 |       17,065,922.04 |    0.3% |     11.02 | `SaltedOutpointHasherBench_create_set`
|               11.97 |       83,576,684.83 |    0.1% |     11.01 | `SaltedOutpointHasherBench_hash`
|               14.50 |       68,985,850.12 |    0.3% |     10.96 | `SaltedOutpointHasherBench_match`
|               13.90 |       71,942,033.47 |    0.4% |     11.03 | `SaltedOutpointHasherBench_mismatch`

> C++ compiler .......................... GNU 13.3.0

|               ns/op |                op/s |    err% |          ins/op |          cyc/op |    IPC |         bra/op |   miss% |     total | benchmark
|--------------------:|--------------------:|--------:|----------------:|----------------:|-------:|---------------:|--------:|----------:|:----------
|              136.76 |        7,312,133.16 |    0.0% |        1,086.67 |          491.12 |  2.213 |         119.54 |    1.1% |     11.01 | `SaltedOutpointHasherBench_create_set`
|               23.82 |       41,978,882.62 |    0.0% |          252.01 |           85.57 |  2.945 |           4.00 |    0.0% |     11.00 | `SaltedOutpointHasherBench_hash`
|               60.42 |       16,549,695.42 |    0.1% |          460.51 |          217.04 |  2.122 |          21.00 |    1.4% |     10.99 | `SaltedOutpointHasherBench_match`
|               78.66 |       12,713,595.35 |    0.1% |          555.59 |          282.52 |  1.967 |          20.19 |    2.2% |     10.74 | `SaltedOutpointHasherBench_mismatch`
---
 src/bench/crypto_hash.cpp | 101 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 99 insertions(+), 2 deletions(-)

diff --git a/src/bench/crypto_hash.cpp b/src/bench/crypto_hash.cpp
index 2f1ff564388..54aa4a44ef5 100644
--- a/src/bench/crypto_hash.cpp
+++ b/src/bench/crypto_hash.cpp
@@ -2,7 +2,6 @@
 // Distributed under the MIT software license, see the accompanying
 // file COPYING or http://www.opensource.org/licenses/mit-license.php.
 
-
 #include <bench/bench.h>
 #include <crypto/muhash.h>
 #include <crypto/ripemd160.h>
@@ -12,9 +11,11 @@
 #include <crypto/sha512.h>
 #include <crypto/siphash.h>
 #include <random.h>
-#include <span.h>
 #include <tinyformat.h>
 #include <uint256.h>
+#include <primitives/transaction.h>
+#include <util/hasher.h>
+#include <unordered_set>
 
 #include <cstdint>
 #include <vector>
@@ -205,6 +206,98 @@ static void SipHash_32b(benchmark::Bench& bench)
     });
 }
 
+static void SaltedOutpointHasherBench_hash(benchmark::Bench& bench)
+{
+    FastRandomContext rng{/*fDeterministic=*/true};
+    constexpr size_t size{1000};
+
+    std::vector<COutPoint> outpoints(size);
+    for (auto& outpoint : outpoints) {
+        outpoint = {Txid::FromUint256(rng.rand256()), rng.rand32()};
+    }
+
+    const SaltedOutpointHasher hasher;
+    bench.batch(size).run([&] {
+        size_t result{0};
+        for (const auto& outpoint : outpoints) {
+            result ^= hasher(outpoint);
+        }
+        ankerl::nanobench::doNotOptimizeAway(result);
+    });
+}
+
+static void SaltedOutpointHasherBench_match(benchmark::Bench& bench)
+{
+    FastRandomContext rng{/*fDeterministic=*/true};
+    constexpr size_t size{1000};
+
+    std::unordered_set<COutPoint, SaltedOutpointHasher> values;
+    std::vector<COutPoint> value_vector;
+    values.reserve(size);
+    value_vector.reserve(size);
+
+    for (size_t i{0}; i < size; ++i) {
+        COutPoint outpoint{Txid::FromUint256(rng.rand256()), rng.rand32()};
+        values.emplace(outpoint);
+        value_vector.push_back(outpoint);
+        assert(values.contains(outpoint));
+    }
+
+    bench.batch(size).run([&] {
+        bool result{true};
+        for (const auto& outpoint : value_vector) {
+            result ^= values.contains(outpoint);
+        }
+        ankerl::nanobench::doNotOptimizeAway(result);
+    });
+}
+
+static void SaltedOutpointHasherBench_mismatch(benchmark::Bench& bench)
+{
+    FastRandomContext rng{/*fDeterministic=*/true};
+    constexpr size_t size{1000};
+
+    std::unordered_set<COutPoint, SaltedOutpointHasher> values;
+    std::vector<COutPoint> missing_value_vector;
+    values.reserve(size);
+    missing_value_vector.reserve(size);
+
+    for (size_t i{0}; i < size; ++i) {
+        values.emplace(Txid::FromUint256(rng.rand256()), rng.rand32());
+        COutPoint missing_outpoint{Txid::FromUint256(rng.rand256()), rng.rand32()};
+        missing_value_vector.push_back(missing_outpoint);
+        assert(!values.contains(missing_outpoint));
+    }
+
+    bench.batch(size).run([&] {
+        bool result{false};
+        for (const auto& outpoint : missing_value_vector) {
+            result ^= values.contains(outpoint);
+        }
+        ankerl::nanobench::doNotOptimizeAway(result);
+    });
+}
+
+static void SaltedOutpointHasherBench_create_set(benchmark::Bench& bench)
+{
+    FastRandomContext rng{/*fDeterministic=*/true};
+    constexpr size_t size{1000};
+
+    std::vector<COutPoint> outpoints(size);
+    for (auto& outpoint : outpoints) {
+        outpoint = {Txid::FromUint256(rng.rand256()), rng.rand32()};
+    }
+
+    bench.batch(size).run([&] {
+        std::unordered_set<COutPoint, SaltedOutpointHasher> set;
+        set.reserve(size);
+        for (const auto& outpoint : outpoints) {
+            set.emplace(outpoint);
+        }
+        ankerl::nanobench::doNotOptimizeAway(set.size());
+    });
+}
+
 static void MuHash(benchmark::Bench& bench)
 {
     MuHash3072 acc;
@@ -276,6 +369,10 @@ BENCHMARK(SHA256_32b_SSE4, benchmark::PriorityLevel::HIGH);
 BENCHMARK(SHA256_32b_AVX2, benchmark::PriorityLevel::HIGH);
 BENCHMARK(SHA256_32b_SHANI, benchmark::PriorityLevel::HIGH);
 BENCHMARK(SipHash_32b, benchmark::PriorityLevel::HIGH);
+BENCHMARK(SaltedOutpointHasherBench_hash, benchmark::PriorityLevel::HIGH);
+BENCHMARK(SaltedOutpointHasherBench_match, benchmark::PriorityLevel::HIGH);
+BENCHMARK(SaltedOutpointHasherBench_mismatch, benchmark::PriorityLevel::HIGH);
+BENCHMARK(SaltedOutpointHasherBench_create_set, benchmark::PriorityLevel::HIGH);
 BENCHMARK(SHA256D64_1024_STANDARD, benchmark::PriorityLevel::HIGH);
 BENCHMARK(SHA256D64_1024_SSE4, benchmark::PriorityLevel::HIGH);
 BENCHMARK(SHA256D64_1024_AVX2, benchmark::PriorityLevel::HIGH);

From ae87260d299ab91a98afa7c28ddc69d2bf178a94 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C5=91rinc?= <pap.lorinc@gmail.com>
Date: Sat, 1 Feb 2025 19:01:36 +0100
Subject: [PATCH 07/24] test: Rename k1/k2 to k0/k1 for consistency

---
 src/test/hash_tests.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/test/hash_tests.cpp b/src/test/hash_tests.cpp
index 2fe44960f46..f08f37e077a 100644
--- a/src/test/hash_tests.cpp
+++ b/src/test/hash_tests.cpp
@@ -133,18 +133,18 @@ BOOST_AUTO_TEST_CASE(siphash)
     // Check consistency between CSipHasher and SipHashUint256[Extra].
     FastRandomContext ctx;
     for (int i = 0; i < 16; ++i) {
+        uint64_t k0 = ctx.rand64();
         uint64_t k1 = ctx.rand64();
-        uint64_t k2 = ctx.rand64();
         uint256 x = m_rng.rand256();
         uint32_t n = ctx.rand32();
         uint8_t nb[4];
         WriteLE32(nb, n);
-        CSipHasher sip256(k1, k2);
+        CSipHasher sip256(k0, k1);
         sip256.Write(x);
         CSipHasher sip288 = sip256;
         sip288.Write(nb);
-        BOOST_CHECK_EQUAL(SipHashUint256(k1, k2, x), sip256.Finalize());
-        BOOST_CHECK_EQUAL(SipHashUint256Extra(k1, k2, x, n), sip288.Finalize());
+        BOOST_CHECK_EQUAL(SipHashUint256(k0, k1, x), sip256.Finalize());
+        BOOST_CHECK_EQUAL(SipHashUint256Extra(k0, k1, x, n), sip288.Finalize()); // TODO modified in follow-up commit
     }
 }
 

From 155ba7c349ca748ba6d0cfb6224eeb99fed8262a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C5=91rinc?= <pap.lorinc@gmail.com>
Date: Sat, 1 Feb 2025 19:16:17 +0100
Subject: [PATCH 08/24] refactor: Extract C0-C3 Siphash constants

---
 src/crypto/siphash.cpp | 25 +++++++++++++------------
 src/crypto/siphash.h   |  5 +++++
 2 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/src/crypto/siphash.cpp b/src/crypto/siphash.cpp
index dda28a6ced4..ad3d2f768d5 100644
--- a/src/crypto/siphash.cpp
+++ b/src/crypto/siphash.cpp
@@ -17,10 +17,10 @@
 
 CSipHasher::CSipHasher(uint64_t k0, uint64_t k1)
 {
-    v[0] = 0x736f6d6570736575ULL ^ k0;
-    v[1] = 0x646f72616e646f6dULL ^ k1;
-    v[2] = 0x6c7967656e657261ULL ^ k0;
-    v[3] = 0x7465646279746573ULL ^ k1;
+    v[0] = C0 ^ k0;
+    v[1] = C1 ^ k1;
+    v[2] = C2 ^ k0;
+    v[3] = C3 ^ k1;
     count = 0;
     tmp = 0;
 }
@@ -97,10 +97,10 @@ uint64_t SipHashUint256(uint64_t k0, uint64_t k1, const uint256& val)
     /* Specialized implementation for efficiency */
     uint64_t d = val.GetUint64(0);
 
-    uint64_t v0 = 0x736f6d6570736575ULL ^ k0;
-    uint64_t v1 = 0x646f72616e646f6dULL ^ k1;
-    uint64_t v2 = 0x6c7967656e657261ULL ^ k0;
-    uint64_t v3 = 0x7465646279746573ULL ^ k1 ^ d;
+    uint64_t v0 = CSipHasher::C0 ^ k0;
+    uint64_t v1 = CSipHasher::C1 ^ k1;
+    uint64_t v2 = CSipHasher::C2 ^ k0;
+    uint64_t v3 = CSipHasher::C3 ^ k1 ^ d;
 
     SIPROUND;
     SIPROUND;
@@ -137,10 +137,11 @@ uint64_t SipHashUint256Extra(uint64_t k0, uint64_t k1, const uint256& val, uint3
     /* Specialized implementation for efficiency */
     uint64_t d = val.GetUint64(0);
 
-    uint64_t v0 = 0x736f6d6570736575ULL ^ k0;
-    uint64_t v1 = 0x646f72616e646f6dULL ^ k1;
-    uint64_t v2 = 0x6c7967656e657261ULL ^ k0;
-    uint64_t v3 = 0x7465646279746573ULL ^ k1 ^ d;
+    // TODO moved in next commit
+    uint64_t v0 = CSipHasher::C0 ^ k0;
+    uint64_t v1 = CSipHasher::C1 ^ k1;
+    uint64_t v2 = CSipHasher::C2 ^ k0;
+    uint64_t v3 = CSipHasher::C3 ^ k1 ^ d;
 
     SIPROUND;
     SIPROUND;
diff --git a/src/crypto/siphash.h b/src/crypto/siphash.h
index c388c94951b..61f4ab08ca1 100644
--- a/src/crypto/siphash.h
+++ b/src/crypto/siphash.h
@@ -19,6 +19,11 @@ private:
     uint8_t count; // Only the low 8 bits of the input size matter.
 
 public:
+    static constexpr uint64_t C0{0x736f6d6570736575ULL};
+    static constexpr uint64_t C1{0x646f72616e646f6dULL};
+    static constexpr uint64_t C2{0x6c7967656e657261ULL};
+    static constexpr uint64_t C3{0x7465646279746573ULL};
+
     /** Construct a SipHash calculator initialized with 128-bit key (k0, k1) */
     CSipHasher(uint64_t k0, uint64_t k1);
     /** Hash a 64-bit integer worth of data

From 73cfebb08bda9f2e48148f6cd2481a8c3b07a43c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C5=91rinc?= <pap.lorinc@gmail.com>
Date: Sat, 1 Feb 2025 19:21:34 +0100
Subject: [PATCH 09/24] optimization: refactor: Introduce Uint256ExtraSipHasher
 to cache SipHash constant state

Previously, only k0 and k1 were stored, causing the constant xor operations to be recomputed in every call to `SipHashUint256Extra`.
This commit adds a dedicated `Uint256ExtraSipHasher` class that caches the initial state (v0-v3) and to perform the `SipHash` computation on a `uint256` (with an extra parameter), hiding the constant computation details from higher-level code and improving efficiency.
This basically brings the precalculations in the `CSipHasher` constructor to the `uint256` specialized SipHash implementation.

> cmake -B build -DBUILD_BENCH=ON -DCMAKE_BUILD_TYPE=Release && cmake --build build -j$(nproc) && build/src/bench/bench_bitcoin -filter='SaltedOutpointHasherBench.*' -min-time=10000

> C++ compiler .......................... AppleClang 16.0.0.16000026

|               ns/op |                op/s |    err% |     total | benchmark
|--------------------:|--------------------:|--------:|----------:|:----------
|               57.27 |       17,462,299.19 |    0.1% |     11.02 | `SaltedOutpointHasherBench_create_set`
|               11.24 |       88,997,888.48 |    0.3% |     11.04 | `SaltedOutpointHasherBench_hash`
|               13.91 |       71,902,014.20 |    0.2% |     11.01 | `SaltedOutpointHasherBench_match`
|               13.29 |       75,230,390.31 |    0.1% |     11.00 | `SaltedOutpointHasherBench_mismatch`

compared to master:
create_set - 17,462,299.19/17,065,922.04 - 2.3% faster
hash       - 88,997,888.48/83,576,684.83 - 6.4% faster
match      - 71,902,014.20/68,985,850.12 - 4.2% faster
mismatch   - 75,230,390.31/71,942,033.47 - 4.5% faster

> C++ compiler .......................... GNU 13.3.0

|               ns/op |                op/s |    err% |          ins/op |          cyc/op |    IPC |         bra/op |   miss% |     total | benchmark
|--------------------:|--------------------:|--------:|----------------:|----------------:|-------:|---------------:|--------:|----------:|:----------
|              135.38 |        7,386,349.49 |    0.0% |        1,078.19 |          486.16 |  2.218 |         119.56 |    1.1% |     11.00 | `SaltedOutpointHasherBench_create_set`
|               23.67 |       42,254,558.08 |    0.0% |          247.01 |           85.01 |  2.906 |           4.00 |    0.0% |     11.00 | `SaltedOutpointHasherBench_hash`
|               58.95 |       16,962,220.14 |    0.1% |          446.55 |          211.74 |  2.109 |          20.86 |    1.4% |     11.01 | `SaltedOutpointHasherBench_match`
|               76.98 |       12,991,047.69 |    0.1% |          548.93 |          276.50 |  1.985 |          20.25 |    2.3% |     10.72 | `SaltedOutpointHasherBench_mismatch`

compared to master:
create_set -  7,386,349.49/7,312,133.16  - 1% faster
hash       - 42,254,558.08/41,978,882.62 - 0.6% faster
match      - 16,962,220.14/16,549,695.42 - 2.4% faster
mismatch   - 12,991,047.69/12,713,595.35 - 2% faster

Co-authored-by: sipa <pieter@wuille.net>
---
 src/crypto/siphash.cpp    | 13 ++++---------
 src/crypto/siphash.h      | 15 ++++++++++++++-
 src/test/fuzz/integer.cpp |  2 +-
 src/test/hash_tests.cpp   |  4 ++--
 src/util/hasher.cpp       |  6 +++---
 src/util/hasher.h         |  8 +++-----
 6 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/src/crypto/siphash.cpp b/src/crypto/siphash.cpp
index ad3d2f768d5..15287f3283b 100644
--- a/src/crypto/siphash.cpp
+++ b/src/crypto/siphash.cpp
@@ -132,17 +132,12 @@ uint64_t SipHashUint256(uint64_t k0, uint64_t k1, const uint256& val)
     return v0 ^ v1 ^ v2 ^ v3;
 }
 
-uint64_t SipHashUint256Extra(uint64_t k0, uint64_t k1, const uint256& val, uint32_t extra)
+/* Specialized implementation for efficiency */
+uint64_t Uint256ExtraSipHasher::operator()(const uint256& val, uint32_t extra) const noexcept
 {
-    /* Specialized implementation for efficiency */
+    uint64_t v0 = v[0], v1 = v[1], v2 = v[2], v3 = v[3];
     uint64_t d = val.GetUint64(0);
-
-    // TODO moved in next commit
-    uint64_t v0 = CSipHasher::C0 ^ k0;
-    uint64_t v1 = CSipHasher::C1 ^ k1;
-    uint64_t v2 = CSipHasher::C2 ^ k0;
-    uint64_t v3 = CSipHasher::C3 ^ k1 ^ d;
-
+    v3 ^= d;
     SIPROUND;
     SIPROUND;
     v0 ^= d;
diff --git a/src/crypto/siphash.h b/src/crypto/siphash.h
index 61f4ab08ca1..a8dd6da3ecd 100644
--- a/src/crypto/siphash.h
+++ b/src/crypto/siphash.h
@@ -48,6 +48,19 @@ public:
  *      .Finalize()
  */
 uint64_t SipHashUint256(uint64_t k0, uint64_t k1, const uint256& val);
-uint64_t SipHashUint256Extra(uint64_t k0, uint64_t k1, const uint256& val, uint32_t extra);
+
+class Uint256ExtraSipHasher {
+    uint64_t v[4];
+
+public:
+    Uint256ExtraSipHasher(const uint64_t k0, const uint64_t k1) noexcept {
+        v[0] = CSipHasher::C0 ^ k0;
+        v[1] = CSipHasher::C1 ^ k1;
+        v[2] = CSipHasher::C2 ^ k0;
+        v[3] = CSipHasher::C3 ^ k1;
+    }
+
+    uint64_t operator()(const uint256& val, uint32_t extra) const noexcept;
+};
 
 #endif // BITCOIN_CRYPTO_SIPHASH_H
diff --git a/src/test/fuzz/integer.cpp b/src/test/fuzz/integer.cpp
index a6729155d1f..66a95533ee1 100644
--- a/src/test/fuzz/integer.cpp
+++ b/src/test/fuzz/integer.cpp
@@ -119,7 +119,7 @@ FUZZ_TARGET(integer, .init = initialize_integer)
     (void)MillisToTimeval(i64);
     (void)SighashToStr(uch);
     (void)SipHashUint256(u64, u64, u256);
-    (void)SipHashUint256Extra(u64, u64, u256, u32);
+    (void)Uint256ExtraSipHasher(u64, u64)(u256, u32);
     (void)ToLower(ch);
     (void)ToUpper(ch);
     {
diff --git a/src/test/hash_tests.cpp b/src/test/hash_tests.cpp
index f08f37e077a..60f4bbc078b 100644
--- a/src/test/hash_tests.cpp
+++ b/src/test/hash_tests.cpp
@@ -130,7 +130,7 @@ BOOST_AUTO_TEST_CASE(siphash)
     ss << TX_WITH_WITNESS(tx);
     BOOST_CHECK_EQUAL(SipHashUint256(1, 2, ss.GetHash()), 0x79751e980c2a0a35ULL);
 
-    // Check consistency between CSipHasher and SipHashUint256[Extra].
+    // Check consistency between CSipHasher and SipHashUint256 and Uint256ExtraSipHasher.
     FastRandomContext ctx;
     for (int i = 0; i < 16; ++i) {
         uint64_t k0 = ctx.rand64();
@@ -144,7 +144,7 @@ BOOST_AUTO_TEST_CASE(siphash)
         CSipHasher sip288 = sip256;
         sip288.Write(nb);
         BOOST_CHECK_EQUAL(SipHashUint256(k0, k1, x), sip256.Finalize());
-        BOOST_CHECK_EQUAL(SipHashUint256Extra(k0, k1, x, n), sip288.Finalize()); // TODO modified in follow-up commit
+        BOOST_CHECK_EQUAL(Uint256ExtraSipHasher(k0, k1)(x, n), sip288.Finalize());
     }
 }
 
diff --git a/src/util/hasher.cpp b/src/util/hasher.cpp
index 117cfe8dcd1..6ee3134cb7c 100644
--- a/src/util/hasher.cpp
+++ b/src/util/hasher.cpp
@@ -11,9 +11,9 @@ SaltedTxidHasher::SaltedTxidHasher() :
     k0{FastRandomContext().rand64()},
     k1{FastRandomContext().rand64()} {}
 
-SaltedOutpointHasher::SaltedOutpointHasher(bool deterministic) :
-    k0{deterministic ? 0x8e819f2607a18de6 : FastRandomContext().rand64()},
-    k1{deterministic ? 0xf4020d2e3983b0eb : FastRandomContext().rand64()}
+SaltedOutpointHasher::SaltedOutpointHasher(bool deterministic) : hasher{
+    deterministic ? 0x8e819f2607a18de6 : FastRandomContext().rand64(),
+    deterministic ? 0xf4020d2e3983b0eb : FastRandomContext().rand64()}
 {}
 
 SaltedSipHasher::SaltedSipHasher() :
diff --git a/src/util/hasher.h b/src/util/hasher.h
index 3a75c91a3ef..eb5061ce2fb 100644
--- a/src/util/hasher.h
+++ b/src/util/hasher.h
@@ -30,12 +30,10 @@ public:
 
 class SaltedOutpointHasher
 {
-private:
-    /** Salt */
-    const uint64_t k0, k1;
+    const Uint256ExtraSipHasher hasher;
 
 public:
-    SaltedOutpointHasher(bool deterministic = false);
+    explicit SaltedOutpointHasher(bool deterministic = false);
 
     /**
      * Having the hash noexcept allows libstdc++'s unordered_map to recalculate
@@ -47,7 +45,7 @@ public:
      * @see https://gcc.gnu.org/onlinedocs/gcc-13.2.0/libstdc++/manual/manual/unordered_associative.html
      */
     size_t operator()(const COutPoint& id) const noexcept {
-        return SipHashUint256Extra(k0, k1, id.hash, id.n);
+        return hasher(id.hash, id.n);
     }
 };
 

From 3d7c8ae9fb4dacfa6126efdce6bc8c17122218f3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C5=91rinc?= <pap.lorinc@gmail.com>
Date: Tue, 19 Nov 2024 16:11:56 +0100
Subject: [PATCH 10/24] bench: measure block (size)serialization speed

The SizeComputer is a special serializer which returns what the exact final size will be of the serialized content.

> cmake -B build -DBUILD_BENCH=ON -DCMAKE_BUILD_TYPE=Release && cmake --build build -j$(nproc) && build/bin/bench_bitcoin -filter='SizeComputerBlock|SerializeBlock' --min-time=10000

> C compiler ............................ AppleClang 16.0.0.16000026

|            ns/block |             block/s |    err% |     total | benchmark
|--------------------:|--------------------:|--------:|----------:|:----------
|          195,610.62 |            5,112.20 |    0.3% |     11.00 | `SerializeBlock`
|           12,061.83 |           82,906.19 |    0.1% |     11.01 | `SizeComputerBlock`

> C++ compiler .......................... GNU 13.3.0

|            ns/block |             block/s |    err% |       ins/block |       cyc/block |    IPC |      bra/block |   miss% |     total | benchmark
|--------------------:|--------------------:|--------:|----------------:|----------------:|-------:|---------------:|--------:|----------:|:----------
|          867,857.55 |            1,152.26 |    0.0% |    8,015,883.90 |    3,116,099.08 |  2.572 |   1,517,035.87 |    0.5% |     10.81 | `SerializeBlock`
|           30,928.27 |           32,332.88 |    0.0% |      221,683.03 |      111,055.84 |  1.996 |      53,037.03 |    0.8% |     11.03 | `SizeComputerBlock`
---
 src/bench/checkblock.cpp | 33 +++++++++++++++++++++++++++++----
 src/streams.h            |  1 +
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/src/bench/checkblock.cpp b/src/bench/checkblock.cpp
index 9558d64f199..ba26457b31a 100644
--- a/src/bench/checkblock.cpp
+++ b/src/bench/checkblock.cpp
@@ -21,11 +21,34 @@
 #include <optional>
 #include <vector>
 
+static void SizeComputerBlock(benchmark::Bench& bench) {
+    CBlock block;
+    DataStream(benchmark::data::block413567) >> TX_WITH_WITNESS(block);
+
+    bench.unit("block").run([&] {
+        SizeComputer size_computer;
+        size_computer << TX_WITH_WITNESS(block);
+        assert(size_computer.size() == benchmark::data::block413567.size());
+    });
+}
+
+static void SerializeBlock(benchmark::Bench& bench) {
+    CBlock block;
+    DataStream(benchmark::data::block413567) >> TX_WITH_WITNESS(block);
+
+    // Create output stream and verify first serialization matches input
+    bench.unit("block").run([&] {
+        DataStream output_stream(benchmark::data::block413567.size());
+        output_stream << TX_WITH_WITNESS(block);
+        assert(output_stream.size() == benchmark::data::block413567.size());
+    });
+}
+
 // These are the two major time-sinks which happen after we have fully received
 // a block off the wire, but before we can relay the block on to peers using
 // compact block relay.
 
-static void DeserializeBlockTest(benchmark::Bench& bench)
+static void DeserializeBlock(benchmark::Bench& bench)
 {
     DataStream stream(benchmark::data::block413567);
     std::byte a{0};
@@ -39,7 +62,7 @@ static void DeserializeBlockTest(benchmark::Bench& bench)
     });
 }
 
-static void DeserializeAndCheckBlockTest(benchmark::Bench& bench)
+static void DeserializeAndCheckBlock(benchmark::Bench& bench)
 {
     DataStream stream(benchmark::data::block413567);
     std::byte a{0};
@@ -60,5 +83,7 @@ static void DeserializeAndCheckBlockTest(benchmark::Bench& bench)
     });
 }
 
-BENCHMARK(DeserializeBlockTest, benchmark::PriorityLevel::HIGH);
-BENCHMARK(DeserializeAndCheckBlockTest, benchmark::PriorityLevel::HIGH);
+BENCHMARK(SizeComputerBlock, benchmark::PriorityLevel::HIGH);
+BENCHMARK(SerializeBlock, benchmark::PriorityLevel::HIGH);
+BENCHMARK(DeserializeBlock, benchmark::PriorityLevel::HIGH);
+BENCHMARK(DeserializeAndCheckBlock, benchmark::PriorityLevel::HIGH);
diff --git a/src/streams.h b/src/streams.h
index 0a1e7002c5f..81f4dc8c78e 100644
--- a/src/streams.h
+++ b/src/streams.h
@@ -141,6 +141,7 @@ public:
     typedef vector_type::reverse_iterator reverse_iterator;
 
     explicit DataStream() = default;
+    explicit DataStream(size_type n) { reserve(n); }
     explicit DataStream(std::span<const uint8_t> sp) : DataStream{std::as_bytes(sp)} {}
     explicit DataStream(std::span<const value_type> sp) : vch(sp.data(), sp.data() + sp.size()) {}
 

From 9f15d4da35f1e4601bdc59810cfd58479c4c822f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C5=91rinc?= <pap.lorinc@gmail.com>
Date: Thu, 13 Feb 2025 22:49:51 +0100
Subject: [PATCH 11/24] cleanup: remove unused `ser_writedata16be` and
 `ser_readdata16be`

---
 src/serialize.h           | 11 -----------
 src/test/fuzz/integer.cpp |  4 ----
 2 files changed, 15 deletions(-)

diff --git a/src/serialize.h b/src/serialize.h
index 98851056bdb..6807a2be229 100644
--- a/src/serialize.h
+++ b/src/serialize.h
@@ -60,11 +60,6 @@ template<typename Stream> inline void ser_writedata16(Stream &s, uint16_t obj)
     obj = htole16_internal(obj);
     s.write(std::as_bytes(std::span{&obj, 1}));
 }
-template<typename Stream> inline void ser_writedata16be(Stream &s, uint16_t obj)
-{
-    obj = htobe16_internal(obj);
-    s.write(std::as_bytes(std::span{&obj, 1}));
-}
 template<typename Stream> inline void ser_writedata32(Stream &s, uint32_t obj)
 {
     obj = htole32_internal(obj);
@@ -92,12 +87,6 @@ template<typename Stream> inline uint16_t ser_readdata16(Stream &s)
     s.read(std::as_writable_bytes(std::span{&obj, 1}));
     return le16toh_internal(obj);
 }
-template<typename Stream> inline uint16_t ser_readdata16be(Stream &s)
-{
-    uint16_t obj;
-    s.read(std::as_writable_bytes(std::span{&obj, 1}));
-    return be16toh_internal(obj);
-}
 template<typename Stream> inline uint32_t ser_readdata32(Stream &s)
 {
     uint32_t obj;
diff --git a/src/test/fuzz/integer.cpp b/src/test/fuzz/integer.cpp
index 66a95533ee1..ddf49be86ae 100644
--- a/src/test/fuzz/integer.cpp
+++ b/src/test/fuzz/integer.cpp
@@ -236,10 +236,6 @@ FUZZ_TARGET(integer, .init = initialize_integer)
         const uint16_t deserialized_u16 = ser_readdata16(stream);
         assert(u16 == deserialized_u16 && stream.empty());
 
-        ser_writedata16be(stream, u16);
-        const uint16_t deserialized_u16be = ser_readdata16be(stream);
-        assert(u16 == deserialized_u16be && stream.empty());
-
         ser_writedata8(stream, u8);
         const uint8_t deserialized_u8 = ser_readdata8(stream);
         assert(u8 == deserialized_u8 && stream.empty());

From 5559eb68a9a5857422d06b3a9f15ea539b7581d7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C5=91rinc?= <pap.lorinc@gmail.com>
Date: Fri, 17 Jan 2025 14:29:33 +0100
Subject: [PATCH 12/24] refactor: reduce template bloat in primitive
 serialization

Merged multiple template methods into single constexpr-delimited implementation to reduce template bloat (i.e. related functionality is grouped into a single method, but can be optimized because of C++20 constexpr conditions).
This unifies related methods that were only bound before by similar signatures - and enables `SizeComputer` optimizations later
---
 src/serialize.h | 53 +++++++++++++++++++++++++++++--------------------
 1 file changed, 31 insertions(+), 22 deletions(-)

diff --git a/src/serialize.h b/src/serialize.h
index 6807a2be229..48ddd4d565a 100644
--- a/src/serialize.h
+++ b/src/serialize.h
@@ -241,38 +241,47 @@ const Out& AsBase(const In& x)
 template<class T>
 concept CharNotInt8 = std::same_as<T, char> && !std::same_as<T, int8_t>;
 
+template <typename T>
+concept ByteOrIntegral = std::is_same_v<T, std::byte> ||
+    (std::is_integral_v<T> && !std::is_same_v<T, char>);
+
 template <typename Stream, CharNotInt8 V> void Serialize(Stream&, V) = delete; // char serialization forbidden. Use uint8_t or int8_t
-template <typename Stream> void Serialize(Stream& s, std::byte a) { ser_writedata8(s, uint8_t(a)); }
-template<typename Stream> inline void Serialize(Stream& s, int8_t a  ) { ser_writedata8(s, a); }
-template<typename Stream> inline void Serialize(Stream& s, uint8_t a ) { ser_writedata8(s, a); }
-template<typename Stream> inline void Serialize(Stream& s, int16_t a ) { ser_writedata16(s, a); }
-template<typename Stream> inline void Serialize(Stream& s, uint16_t a) { ser_writedata16(s, a); }
-template<typename Stream> inline void Serialize(Stream& s, int32_t a ) { ser_writedata32(s, a); }
-template<typename Stream> inline void Serialize(Stream& s, uint32_t a) { ser_writedata32(s, a); }
-template<typename Stream> inline void Serialize(Stream& s, int64_t a ) { ser_writedata64(s, a); }
-template<typename Stream> inline void Serialize(Stream& s, uint64_t a) { ser_writedata64(s, a); }
+template <typename Stream, ByteOrIntegral T> void Serialize(Stream& s, T a)
+{
+    if constexpr (sizeof(T) == 1) {
+        ser_writedata8(s, static_cast<uint8_t>(a));   // (u)int8_t or std::byte or bool
+    } else if constexpr (sizeof(T) == 2) {
+        ser_writedata16(s, static_cast<uint16_t>(a)); // (u)int16_t
+    } else if constexpr (sizeof(T) == 4) {
+        ser_writedata32(s, static_cast<uint32_t>(a)); // (u)int32_t
+    } else {
+        static_assert(sizeof(T) == 8);
+        ser_writedata64(s, static_cast<uint64_t>(a)); // (u)int64_t
+    }
+}
 template <typename Stream, BasicByte B, int N> void Serialize(Stream& s, const B (&a)[N]) { s.write(MakeByteSpan(a)); }
 template <typename Stream, BasicByte B, std::size_t N> void Serialize(Stream& s, const std::array<B, N>& a) { s.write(MakeByteSpan(a)); }
 template <typename Stream, BasicByte B, std::size_t N> void Serialize(Stream& s, std::span<B, N> span) { s.write(std::as_bytes(span)); }
 template <typename Stream, BasicByte B> void Serialize(Stream& s, std::span<B> span) { s.write(std::as_bytes(span)); }
 
 template <typename Stream, CharNotInt8 V> void Unserialize(Stream&, V) = delete; // char serialization forbidden. Use uint8_t or int8_t
-template <typename Stream> void Unserialize(Stream& s, std::byte& a) { a = std::byte{ser_readdata8(s)}; }
-template<typename Stream> inline void Unserialize(Stream& s, int8_t& a  ) { a = ser_readdata8(s); }
-template<typename Stream> inline void Unserialize(Stream& s, uint8_t& a ) { a = ser_readdata8(s); }
-template<typename Stream> inline void Unserialize(Stream& s, int16_t& a ) { a = ser_readdata16(s); }
-template<typename Stream> inline void Unserialize(Stream& s, uint16_t& a) { a = ser_readdata16(s); }
-template<typename Stream> inline void Unserialize(Stream& s, int32_t& a ) { a = ser_readdata32(s); }
-template<typename Stream> inline void Unserialize(Stream& s, uint32_t& a) { a = ser_readdata32(s); }
-template<typename Stream> inline void Unserialize(Stream& s, int64_t& a ) { a = ser_readdata64(s); }
-template<typename Stream> inline void Unserialize(Stream& s, uint64_t& a) { a = ser_readdata64(s); }
+template <typename Stream, ByteOrIntegral T> void Unserialize(Stream& s, T& a)
+{
+    if constexpr (sizeof(T) == 1) {
+        a = static_cast<T>(ser_readdata8(s));  // (u)int8_t or std::byte or bool
+    } else if constexpr (sizeof(T) == 2) {
+        a = static_cast<T>(ser_readdata16(s)); // (u)int16_t
+    } else if constexpr (sizeof(T) == 4) {
+        a = static_cast<T>(ser_readdata32(s)); // (u)int32_t
+    } else {
+        static_assert(sizeof(T) == 8);
+        a = static_cast<T>(ser_readdata64(s)); // (u)int64_t
+    }
+}
 template <typename Stream, BasicByte B, int N> void Unserialize(Stream& s, B (&a)[N]) { s.read(MakeWritableByteSpan(a)); }
 template <typename Stream, BasicByte B, std::size_t N> void Unserialize(Stream& s, std::array<B, N>& a) { s.read(MakeWritableByteSpan(a)); }
 template <typename Stream, BasicByte B, std::size_t N> void Unserialize(Stream& s, std::span<B, N> span) { s.read(std::as_writable_bytes(span)); }
 template <typename Stream, BasicByte B> void Unserialize(Stream& s, std::span<B> span) { s.read(std::as_writable_bytes(span)); }
-
-template <typename Stream> inline void Serialize(Stream& s, bool a) { uint8_t f = a; ser_writedata8(s, f); }
-template <typename Stream> inline void Unserialize(Stream& s, bool& a) { uint8_t f = ser_readdata8(s); a = f; }
 // clang-format on
 
 
@@ -478,7 +487,7 @@ public:
  * serialization, and Unser(stream, object&) for deserialization. Serialization routines (inside
  * READWRITE, or directly with << and >> operators), can then use Using<Formatter>(object).
  *
- * This works by constructing a Wrapper<Formatter, T>-wrapped version of object, where T is
+ * This works by constructing a Wrapper<Formatter, T&>-wrapped version of object, where T is
  * const during serialization, and non-const during deserialization, which maintains const
  * correctness.
  */

From 8f71de5f8f983a42732887067215e5444e7f413e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C5=91rinc?= <pap.lorinc@gmail.com>
Date: Thu, 20 Mar 2025 10:28:52 +0100
Subject: [PATCH 13/24] refactor: add explicit static extent to spans

---
 src/serialize.h | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/src/serialize.h b/src/serialize.h
index 48ddd4d565a..c1c145fa4b5 100644
--- a/src/serialize.h
+++ b/src/serialize.h
@@ -53,56 +53,56 @@ constexpr deserialize_type deserialize {};
  */
 template<typename Stream> inline void ser_writedata8(Stream &s, uint8_t obj)
 {
-    s.write(std::as_bytes(std::span{&obj, 1}));
+    s.write(std::as_bytes(std::span<uint8_t, 1>{&obj, 1}));
 }
 template<typename Stream> inline void ser_writedata16(Stream &s, uint16_t obj)
 {
     obj = htole16_internal(obj);
-    s.write(std::as_bytes(std::span{&obj, 1}));
+    s.write(std::as_bytes(std::span<uint16_t, 1>{&obj, 1}));
 }
 template<typename Stream> inline void ser_writedata32(Stream &s, uint32_t obj)
 {
     obj = htole32_internal(obj);
-    s.write(std::as_bytes(std::span{&obj, 1}));
+    s.write(std::as_bytes(std::span<uint32_t, 1>{&obj, 1}));
 }
 template<typename Stream> inline void ser_writedata32be(Stream &s, uint32_t obj)
 {
     obj = htobe32_internal(obj);
-    s.write(std::as_bytes(std::span{&obj, 1}));
+    s.write(std::as_bytes(std::span<uint32_t, 1>{&obj, 1}));
 }
 template<typename Stream> inline void ser_writedata64(Stream &s, uint64_t obj)
 {
     obj = htole64_internal(obj);
-    s.write(std::as_bytes(std::span{&obj, 1}));
+    s.write(std::as_bytes(std::span<uint64_t, 1>{&obj, 1}));
 }
 template<typename Stream> inline uint8_t ser_readdata8(Stream &s)
 {
     uint8_t obj;
-    s.read(std::as_writable_bytes(std::span{&obj, 1}));
+    s.read(std::as_writable_bytes(std::span<uint8_t, 1>{&obj, 1}));
     return obj;
 }
 template<typename Stream> inline uint16_t ser_readdata16(Stream &s)
 {
     uint16_t obj;
-    s.read(std::as_writable_bytes(std::span{&obj, 1}));
+    s.read(std::as_writable_bytes(std::span<uint16_t, 1>{&obj, 1}));
     return le16toh_internal(obj);
 }
 template<typename Stream> inline uint32_t ser_readdata32(Stream &s)
 {
     uint32_t obj;
-    s.read(std::as_writable_bytes(std::span{&obj, 1}));
+    s.read(std::as_writable_bytes(std::span<uint32_t, 1>{&obj, 1}));
     return le32toh_internal(obj);
 }
 template<typename Stream> inline uint32_t ser_readdata32be(Stream &s)
 {
     uint32_t obj;
-    s.read(std::as_writable_bytes(std::span{&obj, 1}));
+    s.read(std::as_writable_bytes(std::span<uint32_t, 1>{&obj, 1}));
     return be32toh_internal(obj);
 }
 template<typename Stream> inline uint64_t ser_readdata64(Stream &s)
 {
     uint64_t obj;
-    s.read(std::as_writable_bytes(std::span{&obj, 1}));
+    s.read(std::as_writable_bytes(std::span<uint64_t, 1>{&obj, 1}));
     return le64toh_internal(obj);
 }
 
@@ -281,7 +281,6 @@ template <typename Stream, ByteOrIntegral T> void Unserialize(Stream& s, T& a)
 template <typename Stream, BasicByte B, int N> void Unserialize(Stream& s, B (&a)[N]) { s.read(MakeWritableByteSpan(a)); }
 template <typename Stream, BasicByte B, std::size_t N> void Unserialize(Stream& s, std::array<B, N>& a) { s.read(MakeWritableByteSpan(a)); }
 template <typename Stream, BasicByte B, std::size_t N> void Unserialize(Stream& s, std::span<B, N> span) { s.read(std::as_writable_bytes(span)); }
-template <typename Stream, BasicByte B> void Unserialize(Stream& s, std::span<B> span) { s.read(std::as_writable_bytes(span)); }
 // clang-format on
 
 
@@ -534,10 +533,10 @@ struct CustomUintFormatter
         if (v < 0 || v > MAX) throw std::ios_base::failure("CustomUintFormatter value out of range");
         if (BigEndian) {
             uint64_t raw = htobe64_internal(v);
-            s.write(std::as_bytes(std::span{&raw, 1}).last(Bytes));
+            s.write(std::as_bytes(std::span{&raw, 1}).template last<Bytes>());
         } else {
             uint64_t raw = htole64_internal(v);
-            s.write(std::as_bytes(std::span{&raw, 1}).first(Bytes));
+            s.write(std::as_bytes(std::span{&raw, 1}).template first<Bytes>());
         }
     }
 
@@ -547,10 +546,10 @@ struct CustomUintFormatter
         static_assert(std::numeric_limits<U>::max() >= MAX && std::numeric_limits<U>::min() <= 0, "Assigned type too small");
         uint64_t raw = 0;
         if (BigEndian) {
-            s.read(std::as_writable_bytes(std::span{&raw, 1}).last(Bytes));
+            s.read(std::as_writable_bytes(std::span{&raw, 1}).last<Bytes>());
             v = static_cast<I>(be64toh_internal(raw));
         } else {
-            s.read(std::as_writable_bytes(std::span{&raw, 1}).first(Bytes));
+            s.read(std::as_writable_bytes(std::span{&raw, 1}).first<Bytes>());
             v = static_cast<I>(le64toh_internal(raw));
         }
     }

From 2bf6c56cabf4ae0860fd98b46c4808bf242774d6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C5=91rinc?= <pap.lorinc@gmail.com>
Date: Fri, 14 Feb 2025 13:54:57 +0100
Subject: [PATCH 14/24] optimization: merge SizeComputer specializations + add
 new ones

Endianness doesn't affect the final size, we can skip it for `SizeComputer`.
We can `if constexpr` previous calls into existing method, short-circuiting existing logic when we only need their serialized sizes.

> cmake -B build -DBUILD_BENCH=ON -DCMAKE_BUILD_TYPE=Release && cmake --build build -j$(nproc) && build/src/bench/bench_bitcoin -filter='SizeComputerBlock|SerializeBlock' --min-time=10000

> C compiler ............................ AppleClang 16.0.0.16000026

|            ns/block |             block/s |    err% |     total | benchmark
|--------------------:|--------------------:|--------:|----------:|:----------
|          191,652.29 |            5,217.78 |    0.4% |     10.96 | `SerializeBlock`
|           10,323.55 |           96,865.92 |    0.2% |     11.01 | `SizeComputerBlock`

> C++ compiler .......................... GNU 13.3.0

|            ns/block |             block/s |    err% |       ins/block |       cyc/block |    IPC |      bra/block |   miss% |     total | benchmark
|--------------------:|--------------------:|--------:|----------------:|----------------:|-------:|---------------:|--------:|----------:|:----------
|          614,847.32 |            1,626.42 |    0.0% |    8,015,883.64 |    2,207,628.07 |  3.631 |   1,517,035.62 |    0.5% |     10.56 | `SerializeBlock`
|           26,020.31 |           38,431.52 |    0.0% |      159,390.03 |       93,438.33 |  1.706 |      42,131.03 |    0.9% |     11.00 | `SizeComputerBlock`
---
 src/serialize.h | 115 ++++++++++++++++++++++++++++++------------------
 1 file changed, 73 insertions(+), 42 deletions(-)

diff --git a/src/serialize.h b/src/serialize.h
index c1c145fa4b5..611263ac086 100644
--- a/src/serialize.h
+++ b/src/serialize.h
@@ -48,6 +48,16 @@ static const unsigned int MAX_VECTOR_ALLOCATE = 5000000;
 struct deserialize_type {};
 constexpr deserialize_type deserialize {};
 
+class SizeComputer;
+
+//! Check if type contains a stream by seeing if it has a GetStream() method.
+template<typename T>
+concept ContainsStream = requires(T t) { t.GetStream(); };
+
+template<typename T>
+concept ContainsSizeComputer = ContainsStream<T> &&
+    std::is_same_v<std::remove_reference_t<decltype(std::declval<T>().GetStream())>, SizeComputer>;
+
 /*
  * Lowest-level serialization and conversion.
  */
@@ -107,8 +117,6 @@ template<typename Stream> inline uint64_t ser_readdata64(Stream &s)
 }
 
 
-class SizeComputer;
-
 /**
  * Convert any argument to a reference to X, maintaining constness.
  *
@@ -248,7 +256,9 @@ concept ByteOrIntegral = std::is_same_v<T, std::byte> ||
 template <typename Stream, CharNotInt8 V> void Serialize(Stream&, V) = delete; // char serialization forbidden. Use uint8_t or int8_t
 template <typename Stream, ByteOrIntegral T> void Serialize(Stream& s, T a)
 {
-    if constexpr (sizeof(T) == 1) {
+    if constexpr (ContainsSizeComputer<Stream>) {
+        s.GetStream().seek(sizeof(T));
+    } else if constexpr (sizeof(T) == 1) {
         ser_writedata8(s, static_cast<uint8_t>(a));   // (u)int8_t or std::byte or bool
     } else if constexpr (sizeof(T) == 2) {
         ser_writedata16(s, static_cast<uint16_t>(a)); // (u)int16_t
@@ -259,10 +269,38 @@ template <typename Stream, ByteOrIntegral T> void Serialize(Stream& s, T a)
         ser_writedata64(s, static_cast<uint64_t>(a)); // (u)int64_t
     }
 }
-template <typename Stream, BasicByte B, int N> void Serialize(Stream& s, const B (&a)[N]) { s.write(MakeByteSpan(a)); }
-template <typename Stream, BasicByte B, std::size_t N> void Serialize(Stream& s, const std::array<B, N>& a) { s.write(MakeByteSpan(a)); }
-template <typename Stream, BasicByte B, std::size_t N> void Serialize(Stream& s, std::span<B, N> span) { s.write(std::as_bytes(span)); }
-template <typename Stream, BasicByte B> void Serialize(Stream& s, std::span<B> span) { s.write(std::as_bytes(span)); }
+template <typename Stream, BasicByte B, int N> void Serialize(Stream& s, const B (&a)[N])
+{
+    if constexpr (ContainsSizeComputer<Stream>) {
+        s.GetStream().seek(N);
+    } else {
+        s.write(MakeByteSpan(a));
+    }
+}
+template <typename Stream, BasicByte B, std::size_t N> void Serialize(Stream& s, const std::array<B, N>& a)
+{
+    if constexpr (ContainsSizeComputer<Stream>) {
+        s.GetStream().seek(N);
+    } else {
+        s.write(MakeByteSpan(a));
+    }
+}
+template <typename Stream, BasicByte B, std::size_t N> void Serialize(Stream& s, std::span<B, N> span)
+{
+    if constexpr (ContainsSizeComputer<Stream>) {
+        s.GetStream().seek(N);
+    } else {
+        s.write(std::as_bytes(span));
+    }
+}
+template <typename Stream, BasicByte B> void Serialize(Stream& s, std::span<B> span)
+{
+    if constexpr (ContainsSizeComputer<Stream>) {
+        s.GetStream().seek(span.size());
+    } else {
+        s.write(std::as_bytes(span));
+    }
+}
 
 template <typename Stream, CharNotInt8 V> void Unserialize(Stream&, V) = delete; // char serialization forbidden. Use uint8_t or int8_t
 template <typename Stream, ByteOrIntegral T> void Unserialize(Stream& s, T& a)
@@ -299,12 +337,14 @@ constexpr inline unsigned int GetSizeOfCompactSize(uint64_t nSize)
     else                         return sizeof(unsigned char) + sizeof(uint64_t);
 }
 
-inline void WriteCompactSize(SizeComputer& os, uint64_t nSize);
-
 template<typename Stream>
 void WriteCompactSize(Stream& os, uint64_t nSize)
 {
-    if (nSize < 253)
+    if constexpr (ContainsSizeComputer<Stream>)
+    {
+        os.GetStream().seek(GetSizeOfCompactSize(nSize));
+    }
+    else if (nSize < 253)
     {
         ser_writedata8(os, nSize);
     }
@@ -411,7 +451,7 @@ struct CheckVarIntMode {
 };
 
 template<VarIntMode Mode, typename I>
-inline unsigned int GetSizeOfVarInt(I n)
+constexpr unsigned int GetSizeOfVarInt(I n)
 {
     CheckVarIntMode<Mode, I>();
     int nRet = 0;
@@ -424,25 +464,26 @@ inline unsigned int GetSizeOfVarInt(I n)
     return nRet;
 }
 
-template<typename I>
-inline void WriteVarInt(SizeComputer& os, I n);
-
 template<typename Stream, VarIntMode Mode, typename I>
 void WriteVarInt(Stream& os, I n)
 {
-    CheckVarIntMode<Mode, I>();
-    unsigned char tmp[(sizeof(n)*8+6)/7];
-    int len=0;
-    while(true) {
-        tmp[len] = (n & 0x7F) | (len ? 0x80 : 0x00);
-        if (n <= 0x7F)
-            break;
-        n = (n >> 7) - 1;
-        len++;
+    if constexpr (ContainsSizeComputer<Stream>) {
+        os.GetStream().seek(GetSizeOfVarInt<Mode, I>(n));
+    } else {
+        CheckVarIntMode<Mode, I>();
+        unsigned char tmp[(sizeof(n)*8+6)/7];
+        int len=0;
+        while(true) {
+            tmp[len] = (n & 0x7F) | (len ? 0x80 : 0x00);
+            if (n <= 0x7F)
+                break;
+            n = (n >> 7) - 1;
+            len++;
+        }
+        do {
+            ser_writedata8(os, tmp[len]);
+        } while(len--);
     }
-    do {
-        ser_writedata8(os, tmp[len]);
-    } while(len--);
 }
 
 template<typename Stream, VarIntMode Mode, typename I>
@@ -531,7 +572,9 @@ struct CustomUintFormatter
     template <typename Stream, typename I> void Ser(Stream& s, I v)
     {
         if (v < 0 || v > MAX) throw std::ios_base::failure("CustomUintFormatter value out of range");
-        if (BigEndian) {
+        if constexpr (ContainsSizeComputer<Stream>) {
+            s.GetStream().seek(Bytes);
+        } else if (BigEndian) {
             uint64_t raw = htobe64_internal(v);
             s.write(std::as_bytes(std::span{&raw, 1}).template last<Bytes>());
         } else {
@@ -1062,6 +1105,9 @@ protected:
 public:
     SizeComputer() = default;
 
+    SizeComputer& GetStream() { return *this; }
+    const SizeComputer& GetStream() const { return *this; };
+
     void write(std::span<const std::byte> src)
     {
         this->nSize += src.size();
@@ -1085,27 +1131,12 @@ public:
     }
 };
 
-template<typename I>
-inline void WriteVarInt(SizeComputer &s, I n)
-{
-    s.seek(GetSizeOfVarInt<I>(n));
-}
-
-inline void WriteCompactSize(SizeComputer &s, uint64_t nSize)
-{
-    s.seek(GetSizeOfCompactSize(nSize));
-}
-
 template <typename T>
 size_t GetSerializeSize(const T& t)
 {
     return (SizeComputer() << t).size();
 }
 
-//! Check if type contains a stream by seeing if has a GetStream() method.
-template<typename T>
-concept ContainsStream = requires(T t) { t.GetStream(); };
-
 /** Wrapper that overrides the GetParams() function of a stream. */
 template <typename SubStream, typename Params>
 class ParamsStream

From 6269067bc35452b8af89b36607bf982d4c542f2a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C5=91rinc?= <pap.lorinc@gmail.com>
Date: Sun, 9 Mar 2025 21:33:36 +0100
Subject: [PATCH 15/24] optimization: add single byte writes

Single byte writes are used very often (used for every (u)int8_t or std::byte or bool and for every VarInt's first byte which is also needed for every (pre)Vector).
It makes sense to avoid the generalized serialization infrastructure that isn't needed:
* AutoFile write doesn't need to allocate 4k buffer for a single byte now;
* `VectorWriter` and `DataStream` avoids memcpy/insert calls.

> cmake -B build -DBUILD_BENCH=ON -DCMAKE_BUILD_TYPE=Release && cmake --build build -j$(nproc) && build/bin/bench_bitcoin -filter='SizeComputerBlock|SerializeBlock' --min-time=10000

> C compiler ............................ AppleClang 16.0.0.16000026

|            ns/block |             block/s |    err% |     total | benchmark
|--------------------:|--------------------:|--------:|----------:|:----------
|          174,569.19 |            5,728.39 |    0.6% |     10.89 | `SerializeBlock`
|           10,241.16 |           97,645.21 |    0.0% |     11.00 | `SizeComputerBlock`

> C++ compiler .......................... GNU 13.3.0

|            ns/block |             block/s |    err% |       ins/block |       cyc/block |    IPC |      bra/block |   miss% |     total | benchmark
|--------------------:|--------------------:|--------:|----------------:|----------------:|-------:|---------------:|--------:|----------:|:----------
|          615,000.56 |            1,626.01 |    0.0% |    8,015,883.64 |    2,208,340.88 |  3.630 |   1,517,035.62 |    0.5% |     10.56 | `SerializeBlock`
|           25,676.76 |           38,945.72 |    0.0% |      159,390.03 |       92,202.10 |  1.729 |      42,131.03 |    0.9% |     11.00 | `SizeComputerBlock`
---
 src/bench/checkblock.cpp     |  4 ++--
 src/bench/rpc_blockchain.cpp |  2 +-
 src/crypto/sha256.cpp        | 15 +++++++++++++++
 src/crypto/sha256.h          |  1 +
 src/hash.h                   | 24 +++++++++++++++++++++++-
 src/serialize.h              |  6 ++++++
 src/streams.cpp              | 25 +++++++++++++++++++++++++
 src/streams.h                | 17 +++++++++++++++++
 src/test/crypto_tests.cpp    |  2 +-
 src/test/fuzz/autofile.cpp   |  4 ++--
 10 files changed, 93 insertions(+), 7 deletions(-)

diff --git a/src/bench/checkblock.cpp b/src/bench/checkblock.cpp
index ba26457b31a..2d9eac12339 100644
--- a/src/bench/checkblock.cpp
+++ b/src/bench/checkblock.cpp
@@ -52,7 +52,7 @@ static void DeserializeBlock(benchmark::Bench& bench)
 {
     DataStream stream(benchmark::data::block413567);
     std::byte a{0};
-    stream.write({&a, 1}); // Prevent compaction
+    stream.write(std::span{&a, 1}); // Prevent compaction
 
     bench.unit("block").run([&] {
         CBlock block;
@@ -66,7 +66,7 @@ static void DeserializeAndCheckBlock(benchmark::Bench& bench)
 {
     DataStream stream(benchmark::data::block413567);
     std::byte a{0};
-    stream.write({&a, 1}); // Prevent compaction
+    stream.write(std::span{&a, 1}); // Prevent compaction
 
     ArgsManager bench_args;
     const auto chainParams = CreateChainParams(bench_args, ChainType::MAIN);
diff --git a/src/bench/rpc_blockchain.cpp b/src/bench/rpc_blockchain.cpp
index df951a14e49..a5d28b4a60b 100644
--- a/src/bench/rpc_blockchain.cpp
+++ b/src/bench/rpc_blockchain.cpp
@@ -33,7 +33,7 @@ struct TestBlockAndIndex {
     {
         DataStream stream{benchmark::data::block413567};
         std::byte a{0};
-        stream.write({&a, 1}); // Prevent compaction
+        stream.write(std::span{&a, 1}); // Prevent compaction
 
         stream >> TX_WITH_WITNESS(block);
 
diff --git a/src/crypto/sha256.cpp b/src/crypto/sha256.cpp
index 09c5d3123e8..6cf77849aac 100644
--- a/src/crypto/sha256.cpp
+++ b/src/crypto/sha256.cpp
@@ -723,6 +723,21 @@ CSHA256& CSHA256::Write(const unsigned char* data, size_t len)
     }
     return *this;
 }
+CSHA256& CSHA256::Write(unsigned char data)
+{
+    size_t bufsize = bytes % 64;
+
+    // Add the single byte to the buffer
+    buf[bufsize] = data;
+    bytes += 1;
+
+    if (bufsize == 63) {
+        // Process the buffer if full
+        Transform(s, buf, 1);
+    }
+
+    return *this;
+}
 
 void CSHA256::Finalize(unsigned char hash[OUTPUT_SIZE])
 {
diff --git a/src/crypto/sha256.h b/src/crypto/sha256.h
index b1348631d32..16aae54aa23 100644
--- a/src/crypto/sha256.h
+++ b/src/crypto/sha256.h
@@ -22,6 +22,7 @@ public:
 
     CSHA256();
     CSHA256& Write(const unsigned char* data, size_t len);
+    CSHA256& Write(unsigned char data);
     void Finalize(unsigned char hash[OUTPUT_SIZE]);
     CSHA256& Reset();
 };
diff --git a/src/hash.h b/src/hash.h
index 34486af64a1..da3b1ab1145 100644
--- a/src/hash.h
+++ b/src/hash.h
@@ -38,6 +38,10 @@ public:
         sha.Write(input.data(), input.size());
         return *this;
     }
+    CHash256& Write(std::span<const unsigned char, 1> input) {
+        sha.Write(input[0]);
+        return *this;
+    }
 
     CHash256& Reset() {
         sha.Reset();
@@ -63,6 +67,10 @@ public:
         sha.Write(input.data(), input.size());
         return *this;
     }
+    CHash160& Write(std::span<const unsigned char, 1> input) {
+        sha.Write(input[0]);
+        return *this;
+    }
 
     CHash160& Reset() {
         sha.Reset();
@@ -107,6 +115,10 @@ public:
     {
         ctx.Write(UCharCast(src.data()), src.size());
     }
+    void write(std::span<const std::byte, 1> src)
+    {
+        ctx.Write(*UCharCast(&src[0]));
+    }
 
     /** Compute the double-SHA256 hash of all data written to this object.
      *
@@ -160,13 +172,18 @@ public:
         m_source.read(dst);
         this->write(dst);
     }
+    void read(std::span<std::byte, 1> dst)
+    {
+        m_source.read(dst);
+        this->write(std::span<const std::byte, 1>{dst});
+    }
 
     void ignore(size_t num_bytes)
     {
         std::byte data[1024];
         while (num_bytes > 0) {
             size_t now = std::min<size_t>(num_bytes, 1024);
-            read({data, now});
+            read(std::span{data, now});
             num_bytes -= now;
         }
     }
@@ -194,6 +211,11 @@ public:
         m_source.write(src);
         HashWriter::write(src);
     }
+    void write(std::span<const std::byte, 1> src)
+    {
+        m_source.write(src);
+        HashWriter::write(src);
+    }
 
     template <typename T>
     HashedSourceWriter& operator<<(const T& obj)
diff --git a/src/serialize.h b/src/serialize.h
index 611263ac086..5ec8b24df80 100644
--- a/src/serialize.h
+++ b/src/serialize.h
@@ -1112,6 +1112,10 @@ public:
     {
         this->nSize += src.size();
     }
+    void write(std::span<const std::byte, 1>)
+    {
+        this->nSize += 1;
+    }
 
     /** Pretend _nSize bytes are written, without specifying them. */
     void seek(size_t _nSize)
@@ -1161,7 +1165,9 @@ public:
     template <typename U> ParamsStream& operator<<(const U& obj) { ::Serialize(*this, obj); return *this; }
     template <typename U> ParamsStream& operator>>(U&& obj) { ::Unserialize(*this, obj); return *this; }
     void write(std::span<const std::byte> src) { GetStream().write(src); }
+    void write(std::span<const std::byte, 1> src) { GetStream().write(src); }
     void read(std::span<std::byte> dst) { GetStream().read(dst); }
+    void read(std::span<std::byte, 1> dst) { GetStream().read(dst); }
     void ignore(size_t num) { GetStream().ignore(num); }
     bool eof() const { return GetStream().eof(); }
     size_t size() const { return GetStream().size(); }
diff --git a/src/streams.cpp b/src/streams.cpp
index 2bd142aba8c..60c0e1767db 100644
--- a/src/streams.cpp
+++ b/src/streams.cpp
@@ -62,6 +62,12 @@ void AutoFile::read(std::span<std::byte> dst)
         throw std::ios_base::failure(feof() ? "AutoFile::read: end of file" : "AutoFile::read: fread failed");
     }
 }
+void AutoFile::read(std::span<std::byte, 1> dst)
+{
+    if (detail_fread(dst) != 1) {
+        throw std::ios_base::failure(feof() ? "AutoFile::read: end of file" : "AutoFile::read: fread failed");
+    }
+}
 
 void AutoFile::ignore(size_t nSize)
 {
@@ -95,6 +101,25 @@ void AutoFile::write(std::span<const std::byte> src)
         }
     }
 }
+void AutoFile::write(std::span<const std::byte, 1> src)
+{
+    if (!m_file) throw std::ios_base::failure("AutoFile::write: file handle is nullptr");
+    if (!m_obfuscation) {
+        if (std::fwrite(src.data(), 1, 1, m_file) != 1) {
+            throw std::ios_base::failure("AutoFile::write: write failed");
+        }
+        if (m_position.has_value()) *m_position += 1;
+    } else {
+        if (!m_position.has_value()) throw std::ios_base::failure("AutoFile::write: position unknown");
+        std::byte temp_byte = src[0];
+        std::span val(&temp_byte, 1);
+        m_obfuscation(val, *m_position);
+        if (fwrite(val.data(), 1, 1, m_file) != 1) {
+            throw std::ios_base::failure{"XorFile::write: failed"};
+        }
+        *m_position += 1;
+    }
+}
 
 void AutoFile::write_buffer(std::span<std::byte> src)
 {
diff --git a/src/streams.h b/src/streams.h
index 81f4dc8c78e..5e2bd68af0b 100644
--- a/src/streams.h
+++ b/src/streams.h
@@ -62,6 +62,17 @@ public:
         }
         nPos += src.size();
     }
+    void write(std::span<const std::byte, 1> src)
+    {
+        assert(nPos <= vchData.size());
+        const auto byte{*UCharCast(&src[0])};
+        if (nPos < vchData.size()) {
+            vchData[nPos] = byte;
+        } else {
+            vchData.push_back(byte);
+        }
+        nPos += 1;
+    }
     template <typename T>
     VectorWriter& operator<<(const T& obj)
     {
@@ -233,6 +244,10 @@ public:
         // Write to the end of the buffer
         vch.insert(vch.end(), src.begin(), src.end());
     }
+    void write(std::span<const value_type, 1> src)
+    {
+        vch.push_back(src[0]);
+    }
 
     template<typename T>
     DataStream& operator<<(const T& obj)
@@ -427,8 +442,10 @@ public:
     // Stream subset
     //
     void read(std::span<std::byte> dst);
+    void read(std::span<std::byte, 1> dst);
     void ignore(size_t nSize);
     void write(std::span<const std::byte> src);
+    void write(std::span<const std::byte, 1> src);
 
     template <typename T>
     AutoFile& operator<<(const T& obj)
diff --git a/src/test/crypto_tests.cpp b/src/test/crypto_tests.cpp
index 5588d4cdbc6..0aab9ef0e77 100644
--- a/src/test/crypto_tests.cpp
+++ b/src/test/crypto_tests.cpp
@@ -1079,7 +1079,7 @@ BOOST_AUTO_TEST_CASE(sha256d64)
             in[j] = m_rng.randbits(8);
         }
         for (int j = 0; j < i; ++j) {
-            CHash256().Write({in + 64 * j, 64}).Finalize({out1 + 32 * j, 32});
+            CHash256().Write(std::span{in + 64 * j, 64}).Finalize({out1 + 32 * j, 32});
         }
         SHA256D64(out2, in, i);
         BOOST_CHECK(memcmp(out1, out2, 32 * i) == 0);
diff --git a/src/test/fuzz/autofile.cpp b/src/test/fuzz/autofile.cpp
index c76899cda82..43ad9fab264 100644
--- a/src/test/fuzz/autofile.cpp
+++ b/src/test/fuzz/autofile.cpp
@@ -29,14 +29,14 @@ FUZZ_TARGET(autofile)
             [&] {
                 std::array<std::byte, 4096> arr{};
                 try {
-                    auto_file.read({arr.data(), fuzzed_data_provider.ConsumeIntegralInRange<size_t>(0, 4096)});
+                    auto_file.read(std::span{arr.data(), fuzzed_data_provider.ConsumeIntegralInRange<size_t>(0, 4096)});
                 } catch (const std::ios_base::failure&) {
                 }
             },
             [&] {
                 const std::array<std::byte, 4096> arr{};
                 try {
-                    auto_file.write({arr.data(), fuzzed_data_provider.ConsumeIntegralInRange<size_t>(0, 4096)});
+                    auto_file.write(std::span{arr.data(), fuzzed_data_provider.ConsumeIntegralInRange<size_t>(0, 4096)});
                 } catch (const std::ios_base::failure&) {
                 }
             },

From c15d13075220c3ce3b16485c4c630be00ca403bc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C5=91rinc?= <pap.lorinc@gmail.com>
Date: Sat, 18 Jan 2025 15:37:08 +0100
Subject: [PATCH 16/24] test: validate duplicate detection in
 `CheckTransaction`

The `CheckTransaction` validation function in https://github.com/bitcoin/bitcoin/blob/master/src/consensus/tx_check.cpp#L41-L45 relies on a correct ordering relation for detecting duplicate transaction inputs.

This update to the tests ensures that:
* Accurate detection of duplicates: Beyond trivial cases (e.g., two identical inputs), duplicates are detected correctly in more complex scenarios.
* Consistency across methods: Both sorted sets and hash-based sets behave identically when detecting duplicates for `COutPoint` and related values.
* Robust ordering and equality relations: The function maintains expected behavior for ordering and equality checks.

Using randomized testing with shuffled inputs (to avoid any remaining bias introduced), the enhanced test validates that `CheckTransaction` remains robust and reliable across various input configurations. It confirms identical behavior to a hashing-based duplicate detection mechanism, ensuring consistency and correctness.

To make sure the new branches in the follow-up commits will be covered, `basic_transaction_tests` was extended a randomized test one comparing against the old implementation (and also an alternative duplicate). The iterations and ranges were chosen such that every new branch is expected to be hit once.
---
 src/test/transaction_tests.cpp | 220 +++++++++++++++++++++++++++++++--
 src/test/util/setup_common.cpp |   5 +
 src/test/util/setup_common.h   |   1 +
 3 files changed, 217 insertions(+), 9 deletions(-)

diff --git a/src/test/transaction_tests.cpp b/src/test/transaction_tests.cpp
index 1375672a418..5ebf096a82f 100644
--- a/src/test/transaction_tests.cpp
+++ b/src/test/transaction_tests.cpp
@@ -406,20 +406,110 @@ BOOST_AUTO_TEST_CASE(tx_oversized)
     }
 }
 
-BOOST_AUTO_TEST_CASE(basic_transaction_tests)
+static CMutableTransaction CreateTransaction()
 {
-    // Random real transaction (e2769b09e784f32f62ef849763d4f45b98e07ba658647343b915ff832b110436)
-    unsigned char ch[] = {0x01, 0x00, 0x00, 0x00, 0x01, 0x6b, 0xff, 0x7f, 0xcd, 0x4f, 0x85, 0x65, 0xef, 0x40, 0x6d, 0xd5, 0xd6, 0x3d, 0x4f, 0xf9, 0x4f, 0x31, 0x8f, 0xe8, 0x20, 0x27, 0xfd, 0x4d, 0xc4, 0x51, 0xb0, 0x44, 0x74, 0x01, 0x9f, 0x74, 0xb4, 0x00, 0x00, 0x00, 0x00, 0x8c, 0x49, 0x30, 0x46, 0x02, 0x21, 0x00, 0xda, 0x0d, 0xc6, 0xae, 0xce, 0xfe, 0x1e, 0x06, 0xef, 0xdf, 0x05, 0x77, 0x37, 0x57, 0xde, 0xb1, 0x68, 0x82, 0x09, 0x30, 0xe3, 0xb0, 0xd0, 0x3f, 0x46, 0xf5, 0xfc, 0xf1, 0x50, 0xbf, 0x99, 0x0c, 0x02, 0x21, 0x00, 0xd2, 0x5b, 0x5c, 0x87, 0x04, 0x00, 0x76, 0xe4, 0xf2, 0x53, 0xf8, 0x26, 0x2e, 0x76, 0x3e, 0x2d, 0xd5, 0x1e, 0x7f, 0xf0, 0xbe, 0x15, 0x77, 0x27, 0xc4, 0xbc, 0x42, 0x80, 0x7f, 0x17, 0xbd, 0x39, 0x01, 0x41, 0x04, 0xe6, 0xc2, 0x6e, 0xf6, 0x7d, 0xc6, 0x10, 0xd2, 0xcd, 0x19, 0x24, 0x84, 0x78, 0x9a, 0x6c, 0xf9, 0xae, 0xa9, 0x93, 0x0b, 0x94, 0x4b, 0x7e, 0x2d, 0xb5, 0x34, 0x2b, 0x9d, 0x9e, 0x5b, 0x9f, 0xf7, 0x9a, 0xff, 0x9a, 0x2e, 0xe1, 0x97, 0x8d, 0xd7, 0xfd, 0x01, 0xdf, 0xc5, 0x22, 0xee, 0x02, 0x28, 0x3d, 0x3b, 0x06, 0xa9, 0xd0, 0x3a, 0xcf, 0x80, 0x96, 0x96, 0x8d, 0x7d, 0xbb, 0x0f, 0x91, 0x78, 0xff, 0xff, 0xff, 0xff, 0x02, 0x8b, 0xa7, 0x94, 0x0e, 0x00, 0x00, 0x00, 0x00, 0x19, 0x76, 0xa9, 0x14, 0xba, 0xde, 0xec, 0xfd, 0xef, 0x05, 0x07, 0x24, 0x7f, 0xc8, 0xf7, 0x42, 0x41, 0xd7, 0x3b, 0xc0, 0x39, 0x97, 0x2d, 0x7b, 0x88, 0xac, 0x40, 0x94, 0xa8, 0x02, 0x00, 0x00, 0x00, 0x00, 0x19, 0x76, 0xa9, 0x14, 0xc1, 0x09, 0x32, 0x48, 0x3f, 0xec, 0x93, 0xed, 0x51, 0xf5, 0xfe, 0x95, 0xe7, 0x25, 0x59, 0xf2, 0xcc, 0x70, 0x43, 0xf9, 0x88, 0xac, 0x00, 0x00, 0x00, 0x00, 0x00};
-    std::vector<unsigned char> vch(ch, ch + sizeof(ch) -1);
-    DataStream stream(vch);
+    // Serialized random real transaction (e2769b09e784f32f62ef849763d4f45b98e07ba658647343b915ff832b110436)
+    static constexpr auto ser_tx{"01000000016bff7fcd4f8565ef406dd5d63d4ff94f318fe82027fd4dc451b04474019f74b4000000008c493046022100da0dc6aecefe1e06efdf05773757deb168820930e3b0d03f46f5fcf150bf990c022100d25b5c87040076e4f253f8262e763e2dd51e7ff0be157727c4bc42807f17bd39014104e6c26ef67dc610d2cd192484789a6cf9aea9930b944b7e2db5342b9d9e5b9ff79aff9a2ee1978dd7fd01dfc522ee02283d3b06a9d03acf8096968d7dbb0f9178ffffffff028ba7940e000000001976a914badeecfdef0507247fc8f74241d73bc039972d7b88ac4094a802000000001976a914c10932483fec93ed51f5fe95e72559f2cc7043f988ac0000000000"_hex};
     CMutableTransaction tx;
-    stream >> TX_WITH_WITNESS(tx);
+    DataStream(ser_tx) >> TX_WITH_WITNESS(tx);
+    return tx;
+}
+
+BOOST_AUTO_TEST_CASE(transaction_duplicate_input_test)
+{
+    auto tx{CreateTransaction()};
+
     TxValidationState state;
     BOOST_CHECK_MESSAGE(CheckTransaction(CTransaction(tx), state) && state.IsValid(), "Simple deserialized transaction should be valid.");
 
-    // Check that duplicate txins fail
-    tx.vin.push_back(tx.vin[0]);
-    BOOST_CHECK_MESSAGE(!CheckTransaction(CTransaction(tx), state) || !state.IsValid(), "Transaction with duplicate txins should be invalid.");
+    // Add duplicate input
+    tx.vin.emplace_back(tx.vin[0]);
+    std::ranges::shuffle(tx.vin, m_rng);
+    BOOST_CHECK_MESSAGE(!CheckTransaction(CTransaction(tx), state) || !state.IsValid(), "Transaction with 2 duplicate txins should be invalid.");
+
+    // ... add a valid input for more complex check
+    tx.vin.emplace_back(COutPoint(Txid::FromUint256(uint256{1}), 1));
+    std::ranges::shuffle(tx.vin, m_rng);
+    BOOST_CHECK_MESSAGE(!CheckTransaction(CTransaction(tx), state) || !state.IsValid(), "Transaction with 3 inputs (2 valid, 1 duplicate) should be invalid.");
+}
+
+BOOST_AUTO_TEST_CASE(transaction_duplicate_detection_test)
+{
+    // Randomized testing against hash- and tree-based duplicate check
+    auto reference_duplicate_check_hash{[](const std::vector<CTxIn>& vin) {
+        std::unordered_set<COutPoint, SaltedOutpointHasher> vInOutPoints;
+        for (const auto& txin : vin) {
+            if (!vInOutPoints.insert(txin.prevout).second) {
+                return false;
+            }
+        }
+        return true;
+    }};
+    auto reference_duplicate_check_tree{[](const std::vector<CTxIn>& vin) {
+        std::set<COutPoint> vInOutPoints;
+        for (const auto& txin : vin) {
+            if (!vInOutPoints.insert(txin.prevout).second) {
+                return false;
+            }
+        }
+        return true;
+    }};
+
+    std::vector<Txid> hashes;
+    std::vector<uint32_t> ns;
+    for (int i = 0; i < 10; ++i) {
+        hashes.emplace_back(Txid::FromUint256(m_rng.rand256()));
+        ns.emplace_back(m_rng.rand32());
+    }
+    auto tx{CreateTransaction()};
+    TxValidationState state;
+    for (int i{0}; i < 100; ++i) {
+        if (m_rng.randbool()) {
+            tx.vin.clear();
+        }
+        for (int j{0}, num_inputs{1 + m_rng.randrange(5)}; j < num_inputs; ++j) {
+            if (COutPoint outpoint(hashes[m_rng.randrange(hashes.size())], ns[m_rng.randrange(ns.size())]); !outpoint.IsNull()) {
+                tx.vin.emplace_back(outpoint);
+            }
+        }
+        std::ranges::shuffle(tx.vin, m_rng);
+
+        bool actual{CheckTransaction(CTransaction(tx), state)};
+        BOOST_CHECK_EQUAL(actual, reference_duplicate_check_hash(tx.vin));
+        BOOST_CHECK_EQUAL(actual, reference_duplicate_check_tree(tx.vin));
+    }
+}
+
+BOOST_AUTO_TEST_CASE(transaction_null_prevout_detection_test)
+{
+    // Randomized testing against linear null prevout check
+    auto reference_null_prevout_check_hash{[](const std::vector<CTxIn>& vin) {
+        for (const auto& txin : vin) {
+            if (txin.prevout.IsNull()) {
+                return false;
+            }
+        }
+        return true;
+    }};
+
+    auto tx{CreateTransaction()};
+    TxValidationState state;
+    for (int i{0}; i < 100; ++i) {
+        if (m_rng.randbool()) {
+            tx.vin.clear();
+        }
+        for (int j{0}, num_inputs{1 + m_rng.randrange(5)}; j < num_inputs; ++j) {
+            switch (m_rng.randrange(5)) {
+            case 0: tx.vin.emplace_back(COutPoint()); break;                                               // Null prevout
+            case 1: tx.vin.emplace_back(Txid::FromUint256(uint256::ZERO), m_rng.rand32()); break;          // Null hash, random index
+            case 2: tx.vin.emplace_back(Txid::FromUint256(m_rng.rand256()), COutPoint::NULL_INDEX); break; // Random hash, Null index
+            default: tx.vin.emplace_back(Txid::FromUint256(m_rng.rand256()), m_rng.rand32());              // Random prevout
+            }
+        }
+        std::ranges::shuffle(tx.vin, m_rng);
+
+        BOOST_CHECK_EQUAL(CheckTransaction(CTransaction(tx), state), reference_null_prevout_check_hash(tx.vin));
+    }
 }
 
 BOOST_AUTO_TEST_CASE(test_Get)
@@ -1048,4 +1138,116 @@ BOOST_AUTO_TEST_CASE(test_IsStandard)
     CheckIsNotStandard(t, "dust");
 }
 
+BOOST_AUTO_TEST_CASE(test_uint256_sorting)
+{
+    // Sorting
+    std::vector original{
+        uint256{1},
+        uint256{2},
+        uint256{3}
+    };
+
+    std::vector shuffled{original};
+    std::ranges::shuffle(shuffled, m_rng);
+    std::sort(shuffled.begin(), shuffled.end());
+
+    BOOST_CHECK_EQUAL_COLLECTIONS(original.begin(), original.end(), shuffled.begin(), shuffled.end());
+
+    // Operators
+    constexpr auto a{uint256{1}},
+                   b{uint256{2}},
+                   c{uint256{3}};
+
+    BOOST_CHECK(a == a);
+    BOOST_CHECK(a == uint256{1});
+    BOOST_CHECK(b == b);
+    BOOST_CHECK(c == c);
+    BOOST_CHECK(a != b);
+    BOOST_CHECK(a != uint256{10});
+    BOOST_CHECK(a != c);
+    BOOST_CHECK(b != c);
+
+    BOOST_CHECK(a < b);
+    BOOST_CHECK(a < uint256{10});
+    BOOST_CHECK(b < c);
+    BOOST_CHECK(a < c);
+}
+
+BOOST_AUTO_TEST_CASE(test_transaction_identifier_sorting)
+{
+    std::vector original{
+        Txid::FromUint256(uint256{1}),
+        Txid::FromUint256(uint256{2}),
+        Txid::FromUint256(uint256{3})
+    };
+
+    std::vector shuffled{original};
+    std::ranges::shuffle(shuffled, m_rng);
+    std::sort(shuffled.begin(), shuffled.end());
+
+    BOOST_CHECK_EQUAL_COLLECTIONS(original.begin(), original.end(), shuffled.begin(), shuffled.end());
+
+    // Operators
+    const auto a(Txid::FromUint256(uint256{1})),
+               b(Txid::FromUint256(uint256{2})),
+               c(Txid::FromUint256(uint256{3}));
+
+    BOOST_CHECK(a == uint256{1});
+
+    BOOST_CHECK(a == a);
+    BOOST_CHECK(a == Txid::FromUint256(uint256{1}));
+    BOOST_CHECK(b == b);
+    BOOST_CHECK(c == c);
+    BOOST_CHECK(a != b);
+    BOOST_CHECK(a != Txid::FromUint256(uint256{10}));
+    BOOST_CHECK(a != c);
+    BOOST_CHECK(b != c);
+
+    BOOST_CHECK(a < b);
+    BOOST_CHECK(a < Txid::FromUint256(uint256{10}));
+    BOOST_CHECK(b < c);
+    BOOST_CHECK(a < c);
+}
+
+BOOST_AUTO_TEST_CASE(test_coutpoint_sorting)
+{
+    // Sorting
+    std::vector original{
+        COutPoint(Txid::FromUint256(uint256{1}), 1),
+        COutPoint(Txid::FromUint256(uint256{1}), 2),
+        COutPoint(Txid::FromUint256(uint256{1}), 3),
+        COutPoint(Txid::FromUint256(uint256{2}), 1),
+        COutPoint(Txid::FromUint256(uint256{2}), 2),
+        COutPoint(Txid::FromUint256(uint256{2}), 3),
+        COutPoint(Txid::FromUint256(uint256{3}), 1),
+        COutPoint(Txid::FromUint256(uint256{3}), 2),
+        COutPoint(Txid::FromUint256(uint256{3}), 3)
+    };
+
+    std::vector shuffled{original};
+    std::ranges::shuffle(shuffled, m_rng);
+    std::sort(shuffled.begin(), shuffled.end());
+
+    BOOST_CHECK_EQUAL_COLLECTIONS(original.begin(), original.end(), shuffled.begin(), shuffled.end());
+
+    // Operators
+    const auto a{COutPoint(Txid::FromUint256(uint256{1}), 1)},
+               b{COutPoint(Txid::FromUint256(uint256{1}), 2)},
+               c{COutPoint(Txid::FromUint256(uint256{2}), 1)};
+
+    BOOST_CHECK(a == a);
+    BOOST_CHECK(a == COutPoint(Txid::FromUint256(uint256{1}), 1));
+    BOOST_CHECK(b == b);
+    BOOST_CHECK(c == c);
+    BOOST_CHECK(a != b);
+    BOOST_CHECK(a != COutPoint(Txid::FromUint256(uint256{1}), 10));
+    BOOST_CHECK(a != c);
+    BOOST_CHECK(b != c);
+
+    BOOST_CHECK(a < b);
+    BOOST_CHECK(a < COutPoint(Txid::FromUint256(uint256{1}), 10));
+    BOOST_CHECK(b < c);
+    BOOST_CHECK(a < c);
+}
+
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/test/util/setup_common.cpp b/src/test/util/setup_common.cpp
index 87f0b31cad8..66c36151023 100644
--- a/src/test/util/setup_common.cpp
+++ b/src/test/util/setup_common.cpp
@@ -628,3 +628,8 @@ std::ostream& operator<<(std::ostream& os, const uint256& num)
 {
     return os << num.ToString();
 }
+
+std::ostream& operator<<(std::ostream& os, const COutPoint& outpoint)
+{
+    return os << outpoint.hash << ", " << outpoint.n;
+}
diff --git a/src/test/util/setup_common.h b/src/test/util/setup_common.h
index 57bea9086b9..d10da4a726e 100644
--- a/src/test/util/setup_common.h
+++ b/src/test/util/setup_common.h
@@ -291,6 +291,7 @@ inline std::ostream& operator<<(std::ostream& os, const std::optional<T>& v)
 std::ostream& operator<<(std::ostream& os, const arith_uint256& num);
 std::ostream& operator<<(std::ostream& os, const uint160& num);
 std::ostream& operator<<(std::ostream& os, const uint256& num);
+std::ostream& operator<<(std::ostream& os, const COutPoint& outpoint);
 // @}
 
 /**

From bf580d7b4e6376c69dc2a7cabe25f2223b2ffb04 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C5=91rinc?= <pap.lorinc@gmail.com>
Date: Sat, 18 Jan 2025 11:43:26 +0100
Subject: [PATCH 17/24] bench: measure `CheckBlock` speed separately from
 serialization

> cmake -B build -DBUILD_BENCH=ON -DCMAKE_BUILD_TYPE=Release && cmake --build build -j$(nproc) && build/src/bench/bench_bitcoin -filter='CheckBlockBench|DuplicateInputs' -min-time=10000

> C++ compiler .......................... AppleClang 16.0.0.16000026

|            ns/block |             block/s |    err% |     total | benchmark
|--------------------:|--------------------:|--------:|----------:|:----------
|          372,743.63 |            2,682.81 |    1.1% |     10.99 | `CheckBlockBench`

|               ns/op |                op/s |    err% |     total | benchmark
|--------------------:|--------------------:|--------:|----------:|:----------
|        3,304,694.54 |              302.60 |    0.5% |     11.05 | `DuplicateInputs`

> C++ compiler .......................... GNU 13.3.0

|            ns/block |             block/s |    err% |       ins/block |       cyc/block |    IPC |      bra/block |   miss% |     total | benchmark
|--------------------:|--------------------:|--------:|----------------:|----------------:|-------:|---------------:|--------:|----------:|:----------
|        1,096,261.84 |              912.19 |    0.1% |    7,963,390.88 |    3,487,375.26 |  2.283 |   1,266,941.00 |    1.8% |     11.03 | `CheckBlockBench`

|               ns/op |                op/s |    err% |          ins/op |          cyc/op |    IPC |         bra/op |   miss% |     total | benchmark
|--------------------:|--------------------:|--------:|----------------:|----------------:|-------:|---------------:|--------:|----------:|:----------
|        8,366,309.48 |              119.53 |    0.0% |   23,865,177.67 |   26,620,160.23 |  0.897 |   5,972,887.41 |    4.0% |     10.78 | `DuplicateInputs`
---
 src/bench/checkblock.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/bench/checkblock.cpp b/src/bench/checkblock.cpp
index 2d9eac12339..c5ddebfd0ad 100644
--- a/src/bench/checkblock.cpp
+++ b/src/bench/checkblock.cpp
@@ -21,7 +21,7 @@
 #include <optional>
 #include <vector>
 
-static void SizeComputerBlock(benchmark::Bench& bench) {
+static void SizeComputerBlockBench(benchmark::Bench& bench) {
     CBlock block;
     DataStream(benchmark::data::block413567) >> TX_WITH_WITNESS(block);
 
@@ -32,7 +32,7 @@ static void SizeComputerBlock(benchmark::Bench& bench) {
     });
 }
 
-static void SerializeBlock(benchmark::Bench& bench) {
+static void SerializeBlockBench(benchmark::Bench& bench) {
     CBlock block;
     DataStream(benchmark::data::block413567) >> TX_WITH_WITNESS(block);
 
@@ -48,7 +48,7 @@ static void SerializeBlock(benchmark::Bench& bench) {
 // a block off the wire, but before we can relay the block on to peers using
 // compact block relay.
 
-static void DeserializeBlock(benchmark::Bench& bench)
+static void DeserializeBlockBench(benchmark::Bench& bench)
 {
     DataStream stream(benchmark::data::block413567);
     std::byte a{0};
@@ -83,7 +83,7 @@ static void DeserializeAndCheckBlock(benchmark::Bench& bench)
     });
 }
 
-BENCHMARK(SizeComputerBlock, benchmark::PriorityLevel::HIGH);
-BENCHMARK(SerializeBlock, benchmark::PriorityLevel::HIGH);
-BENCHMARK(DeserializeBlock, benchmark::PriorityLevel::HIGH);
+BENCHMARK(SizeComputerBlockBench, benchmark::PriorityLevel::HIGH);
+BENCHMARK(SerializeBlockBench, benchmark::PriorityLevel::HIGH);
+BENCHMARK(DeserializeBlockBench, benchmark::PriorityLevel::HIGH);
 BENCHMARK(DeserializeAndCheckBlock, benchmark::PriorityLevel::HIGH);

From a300325c5ba30a3a4d709c93fa879afa8a7354f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C5=91rinc?= <pap.lorinc@gmail.com>
Date: Tue, 21 Jan 2025 14:01:31 +0100
Subject: [PATCH 18/24] bench: add `ProcessTransactionBench` to measure
 `CheckBlock` in context

The newly introduced `ProcessTransactionBench` incorporates multiple steps in the validation pipeline, offering a more comprehensive view of `CheckBlock` performance within a realistic transaction validation context.

Previous microbenchmarks, such as DeserializeAndCheckBlockTest and DuplicateInputs, focused on isolated aspects of transaction and block validation. While these tests provided valuable insights for targeted profiling, they lacked context regarding the broader validation process, where interactions between components play a critical role.

> cmake -B build -DBUILD_BENCH=ON -DCMAKE_BUILD_TYPE=Release && cmake --build build -j$(nproc) && build/src/bench/bench_bitcoin -filter='ProcessTransactionBench' -min-time=10000

> C++ compiler .......................... AppleClang 16.0.0.16000026

|               ns/op |                op/s |    err% |     total | benchmark
|--------------------:|--------------------:|--------:|----------:|:----------
|            9,585.10 |          104,328.55 |    0.1% |     11.03 | `ProcessTransactionBench`

> C++ compiler .......................... GNU 13.3.0

|               ns/op |                op/s |    err% |          ins/op |          cyc/op |    IPC |         bra/op |   miss% |     total | benchmark
|--------------------:|--------------------:|--------:|----------------:|----------------:|-------:|---------------:|--------:|----------:|:----------
|           56,199.57 |           17,793.73 |    0.1% |      229,263.01 |      178,766.31 |  1.282 |      15,509.97 |    0.5% |     10.91 | `ProcessTransactionBench`
---
 src/bench/mempool_stress.cpp | 57 ++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/src/bench/mempool_stress.cpp b/src/bench/mempool_stress.cpp
index fbac25db5f5..2fe0622d18b 100644
--- a/src/bench/mempool_stress.cpp
+++ b/src/bench/mempool_stress.cpp
@@ -13,10 +13,19 @@
 #include <test/util/txmempool.h>
 #include <txmempool.h>
 #include <validation.h>
+#include <bench/data/block413567.raw.h>
+#include <node/context.h>
+#include <node/miner.h>
+#include <primitives/block.h>
+#include <test/util/script.h>
+#include <util/check.h>
 
+#include <array>
+#include <cassert>
 #include <cstddef>
 #include <cstdint>
 #include <memory>
+#include <streams.h>
 #include <vector>
 
 class CCoinsViewCache;
@@ -126,5 +135,53 @@ static void MempoolCheck(benchmark::Bench& bench)
     });
 }
 
+static void ProcessTransactionBench(benchmark::Bench& bench)
+{
+    const auto testing_setup{MakeNoLogFileContext<const TestingSetup>()};
+    CTxMemPool& pool{*Assert(testing_setup->m_node.mempool)};
+    ChainstateManager& chainman{*testing_setup->m_node.chainman};
+
+    CBlock block;
+    DataStream(benchmark::data::block413567) >> TX_WITH_WITNESS(block);
+
+    std::vector<CTransactionRef> txs(block.vtx.size() - 1);
+    for (size_t i{1}; i < block.vtx.size(); ++i) {
+        CMutableTransaction mtx{*block.vtx[i]};
+        for (auto& txin : mtx.vin) {
+            txin.nSequence = CTxIn::SEQUENCE_FINAL;
+            txin.scriptSig.clear();
+            txin.scriptWitness.stack = {WITNESS_STACK_ELEM_OP_TRUE};
+        }
+        txs[i - 1] = MakeTransactionRef(std::move(mtx));
+    }
+
+    CCoinsViewCache* coins_tip{nullptr};
+    size_t cached_coin_count{0};
+    {
+        LOCK(cs_main);
+        coins_tip = &chainman.ActiveChainstate().CoinsTip();
+        for (const auto& tx : txs) {
+            const Coin coin(CTxOut(2 * tx->GetValueOut(), P2WSH_OP_TRUE), 1, /*fCoinBaseIn=*/false);
+            for (const auto& in : tx->vin) {
+                coins_tip->AddCoin(in.prevout, Coin{coin}, /*possible_overwrite=*/false);
+                cached_coin_count++;
+            }
+        }
+    }
+
+    bench.batch(txs.size()).run([&] {
+        LOCK2(cs_main, pool.cs);
+        assert(coins_tip->GetCacheSize() == cached_coin_count);
+        for (const auto& tx : txs) pool.removeRecursive(*tx, MemPoolRemovalReason::REPLACED);
+        assert(pool.size() == 0);
+
+        for (const auto& tx : txs) {
+            const auto res{chainman.ProcessTransaction(tx, /*test_accept=*/true)};
+            assert(res.m_result_type == MempoolAcceptResult::ResultType::VALID);
+        }
+    });
+}
+
 BENCHMARK(ComplexMemPool, benchmark::PriorityLevel::HIGH);
 BENCHMARK(MempoolCheck, benchmark::PriorityLevel::HIGH);
+BENCHMARK(ProcessTransactionBench, benchmark::PriorityLevel::HIGH);

From 7b576a440d675dd29525efd6b3254552ce049834 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C5=91rinc?= <pap.lorinc@gmail.com>
Date: Sat, 25 Jan 2025 14:07:24 +0100
Subject: [PATCH 19/24] optimization: move duplicate checks outside of coinbase
 branch

`IsCoinBase` means single input with NULL prevout, so it makes sense to restrict duplicate check to non-coinbase transactions only.
The behavior is the same as before, except that single-input-transactions aren't checked for duplicates anymore (~70-90% of the cases, see https://transactionfee.info/charts/transactions-1in).
I've added braces to the conditions and loops to simplify review of followup commits.

> cmake -B build -DBUILD_BENCH=ON -DCMAKE_BUILD_TYPE=Release && cmake --build build -j$(nproc) && build/src/bench/bench_bitcoin -filter='CheckBlockBench|DuplicateInputs|ProcessTransactionBench' -min-time=10000

> C++ compiler .......................... AppleClang 16.0.0.16000026

|            ns/block |             block/s |    err% |     total | benchmark
|--------------------:|--------------------:|--------:|----------:|:----------
|          335,917.12 |            2,976.92 |    1.3% |     11.01 | `CheckBlockBench`

|               ns/op |                op/s |    err% |     total | benchmark
|--------------------:|--------------------:|--------:|----------:|:----------
|        3,286,337.42 |              304.29 |    1.1% |     10.90 | `DuplicateInputs`
|            9,561.02 |          104,591.35 |    0.2% |     11.02 | `ProcessTransactionBench`
---
 src/consensus/tx_check.cpp | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/src/consensus/tx_check.cpp b/src/consensus/tx_check.cpp
index b3fee1e8b1a..050bca84d47 100644
--- a/src/consensus/tx_check.cpp
+++ b/src/consensus/tx_check.cpp
@@ -38,22 +38,25 @@ bool CheckTransaction(const CTransaction& tx, TxValidationState& state)
     // of a tx as spent, it does not check if the tx has duplicate inputs.
     // Failure to run this check will result in either a crash or an inflation bug, depending on the implementation of
     // the underlying coins database.
-    std::set<COutPoint> vInOutPoints;
-    for (const auto& txin : tx.vin) {
-        if (!vInOutPoints.insert(txin.prevout).second)
-            return state.Invalid(TxValidationResult::TX_CONSENSUS, "bad-txns-inputs-duplicate");
-    }
+    if (tx.vin.size() == 1) {
+        if (tx.IsCoinBase()) {
+            if (tx.vin[0].scriptSig.size() < 2 || tx.vin[0].scriptSig.size() > 100) {
+                return state.Invalid(TxValidationResult::TX_CONSENSUS, "bad-cb-length");
+            }
+        }
+    } else {
+        std::set<COutPoint> vInOutPoints;
+        for (const auto& txin : tx.vin) {
+            if (!vInOutPoints.insert(txin.prevout).second) {
+                return state.Invalid(TxValidationResult::TX_CONSENSUS, "bad-txns-inputs-duplicate");
+            }
+        }
 
-    if (tx.IsCoinBase())
-    {
-        if (tx.vin[0].scriptSig.size() < 2 || tx.vin[0].scriptSig.size() > 100)
-            return state.Invalid(TxValidationResult::TX_CONSENSUS, "bad-cb-length");
-    }
-    else
-    {
-        for (const auto& txin : tx.vin)
-            if (txin.prevout.IsNull())
+        for (const auto& txin : tx.vin) {
+            if (txin.prevout.IsNull()) {
                 return state.Invalid(TxValidationResult::TX_CONSENSUS, "bad-txns-prevout-null");
+            }
+        }
     }
 
     return true;

From 765b71b90b0333c3531f313781edd7ad77abebdb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C5=91rinc?= <pap.lorinc@gmail.com>
Date: Fri, 17 Jan 2025 22:39:59 +0100
Subject: [PATCH 20/24] optimization: simplify duplicate checks for trivial
 inputs

No need to create a set for checking duplicates for two-input-transactions.

> cmake -B build -DBUILD_BENCH=ON -DCMAKE_BUILD_TYPE=Release && cmake --build build -j$(nproc) && build/src/bench/bench_bitcoin -filter='CheckBlockBench|DuplicateInputs|ProcessTransactionBench' -min-time=10000

> C++ compiler .......................... AppleClang 16.0.0.16000026

|            ns/block |             block/s |    err% |     total | benchmark
|--------------------:|--------------------:|--------:|----------:|:----------
|          314,137.30 |            3,183.32 |    1.2% |     11.04 | `CheckBlockBench`

|               ns/op |                op/s |    err% |     total | benchmark
|--------------------:|--------------------:|--------:|----------:|:----------
|        3,220,592.73 |              310.50 |    1.3% |     10.92 | `DuplicateInputs`
|            9,425.98 |          106,089.77 |    0.3% |     11.00 | `ProcessTransactionBench`
---
 src/consensus/tx_check.cpp | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/consensus/tx_check.cpp b/src/consensus/tx_check.cpp
index 050bca84d47..cd465e56d44 100644
--- a/src/consensus/tx_check.cpp
+++ b/src/consensus/tx_check.cpp
@@ -45,11 +45,17 @@ bool CheckTransaction(const CTransaction& tx, TxValidationState& state)
             }
         }
     } else {
-        std::set<COutPoint> vInOutPoints;
-        for (const auto& txin : tx.vin) {
-            if (!vInOutPoints.insert(txin.prevout).second) {
+        if (tx.vin.size() == 2) {
+            if (tx.vin[0].prevout == tx.vin[1].prevout) {
                 return state.Invalid(TxValidationResult::TX_CONSENSUS, "bad-txns-inputs-duplicate");
             }
+        } else {
+            std::set<COutPoint> vInOutPoints;
+            for (const auto& txin : tx.vin) {
+                if (!vInOutPoints.insert(txin.prevout).second) {
+                    return state.Invalid(TxValidationResult::TX_CONSENSUS, "bad-txns-inputs-duplicate");
+                }
+            }
         }
 
         for (const auto& txin : tx.vin) {

From cb8c012b87ac45cf17dafe13914767c0dfd9071c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C5=91rinc?= <pap.lorinc@gmail.com>
Date: Sat, 18 Jan 2025 11:54:47 +0100
Subject: [PATCH 21/24] optimization: replace tree with sorted vector

A pre-sized vector retains locality (enabling SIMD operations), speeding up sorting and equality checks.
It's also simpler (therefore more reliable) than a sorted set. It also causes less memory fragmentation.

> cmake -B build -DBUILD_BENCH=ON -DCMAKE_BUILD_TYPE=Release && cmake --build build -j$(nproc) && build/src/bench/bench_bitcoin -filter='CheckBlockBench|DuplicateInputs|ProcessTransactionBench' -min-time=10000

> C++ compiler .......................... AppleClang 16.0.0.16000026

|            ns/block |             block/s |    err% |     total | benchmark
|--------------------:|--------------------:|--------:|----------:|:----------
|          181,922.54 |            5,496.85 |    0.2% |     10.98 | `CheckBlockBench`

|               ns/op |                op/s |    err% |     total | benchmark
|--------------------:|--------------------:|--------:|----------:|:----------
|          997,739.30 |            1,002.27 |    1.0% |     10.94 | `DuplicateInputs`
|            9,449.28 |          105,828.15 |    0.3% |     10.99 | `ProcessTransactionBench`

Co-authored-by: Pieter Wuille <pieter@wuille.net>
---
 src/consensus/tx_check.cpp | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/consensus/tx_check.cpp b/src/consensus/tx_check.cpp
index cd465e56d44..9457596fbb9 100644
--- a/src/consensus/tx_check.cpp
+++ b/src/consensus/tx_check.cpp
@@ -50,11 +50,14 @@ bool CheckTransaction(const CTransaction& tx, TxValidationState& state)
                 return state.Invalid(TxValidationResult::TX_CONSENSUS, "bad-txns-inputs-duplicate");
             }
         } else {
-            std::set<COutPoint> vInOutPoints;
+            std::vector<COutPoint> sortedPrevouts;
+            sortedPrevouts.reserve(tx.vin.size());
             for (const auto& txin : tx.vin) {
-                if (!vInOutPoints.insert(txin.prevout).second) {
-                    return state.Invalid(TxValidationResult::TX_CONSENSUS, "bad-txns-inputs-duplicate");
-                }
+                sortedPrevouts.push_back(txin.prevout);
+            }
+            std::sort(sortedPrevouts.begin(), sortedPrevouts.end());
+            if (std::ranges::adjacent_find(sortedPrevouts) != sortedPrevouts.end()) {
+                return state.Invalid(TxValidationResult::TX_CONSENSUS, "bad-txns-inputs-duplicate");
             }
         }
 

From f07c79f034f72cb7d0885444854f162dc5c2df55 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C5=91rinc?= <pap.lorinc@gmail.com>
Date: Sat, 25 Jan 2025 14:24:02 +0100
Subject: [PATCH 22/24] optimization: look for NULL prevouts in the sorted
 values

For the 2 input case we simply check them both, like we did with equality.

For the general case, we take advantage of sorting, making invalid value detection constant time instead of linear in the worst case.

> cmake -B build -DBUILD_BENCH=ON -DCMAKE_BUILD_TYPE=Release && cmake --build build -j$(nproc) && build/src/bench/bench_bitcoin -filter='CheckBlockBench|DuplicateInputs|ProcessTransactionBench' -min-time=10000

> C++ compiler .......................... AppleClang 16.0.0.16000026

|            ns/block |             block/s |    err% |     total | benchmark
|--------------------:|--------------------:|--------:|----------:|:----------
|          179,971.00 |            5,556.45 |    0.3% |     11.02 | `CheckBlockBench`

|               ns/op |                op/s |    err% |     total | benchmark
|--------------------:|--------------------:|--------:|----------:|:----------
|          963,177.98 |            1,038.23 |    1.7% |     10.92 | `DuplicateInputs`
|            9,410.90 |          106,259.75 |    0.3% |     11.01 | `ProcessTransactionBench`

> C++ compiler .......................... GNU 13.3.0

|            ns/block |             block/s |    err% |       ins/block |       cyc/block |    IPC |      bra/block |   miss% |     total | benchmark
|--------------------:|--------------------:|--------:|----------------:|----------------:|-------:|---------------:|--------:|----------:|:----------
|          834,855.94 |            1,197.81 |    0.0% |    6,518,548.86 |    2,656,039.78 |  2.454 |     919,160.84 |    1.5% |     10.78 | `CheckBlockBench`

|               ns/op |                op/s |    err% |          ins/op |          cyc/op |    IPC |         bra/op |   miss% |     total | benchmark
|--------------------:|--------------------:|--------:|----------------:|----------------:|-------:|---------------:|--------:|----------:|:----------
|        4,261,492.75 |              234.66 |    0.0% |   17,379,823.40 |   13,559,793.33 |  1.282 |   4,265,714.28 |    3.4% |     11.00 | `DuplicateInputs`
|           55,819.53 |           17,914.88 |    0.1% |      227,828.15 |      177,520.09 |  1.283 |      15,184.36 |    0.4% |     10.91 | `ProcessTransactionBench`
---
 src/consensus/tx_check.cpp | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/src/consensus/tx_check.cpp b/src/consensus/tx_check.cpp
index 9457596fbb9..0a471deda24 100644
--- a/src/consensus/tx_check.cpp
+++ b/src/consensus/tx_check.cpp
@@ -44,25 +44,27 @@ bool CheckTransaction(const CTransaction& tx, TxValidationState& state)
                 return state.Invalid(TxValidationResult::TX_CONSENSUS, "bad-cb-length");
             }
         }
+    } else if (tx.vin.size() == 2) {
+        if (tx.vin[0].prevout == tx.vin[1].prevout) {
+            return state.Invalid(TxValidationResult::TX_CONSENSUS, "bad-txns-inputs-duplicate");
+        }
+        if (tx.vin[0].prevout.IsNull() || tx.vin[1].prevout.IsNull()) {
+            return state.Invalid(TxValidationResult::TX_CONSENSUS, "bad-txns-prevout-null");
+        }
     } else {
-        if (tx.vin.size() == 2) {
-            if (tx.vin[0].prevout == tx.vin[1].prevout) {
-                return state.Invalid(TxValidationResult::TX_CONSENSUS, "bad-txns-inputs-duplicate");
-            }
-        } else {
-            std::vector<COutPoint> sortedPrevouts;
-            sortedPrevouts.reserve(tx.vin.size());
-            for (const auto& txin : tx.vin) {
-                sortedPrevouts.push_back(txin.prevout);
-            }
-            std::sort(sortedPrevouts.begin(), sortedPrevouts.end());
-            if (std::ranges::adjacent_find(sortedPrevouts) != sortedPrevouts.end()) {
-                return state.Invalid(TxValidationResult::TX_CONSENSUS, "bad-txns-inputs-duplicate");
-            }
+        std::vector<COutPoint> sortedPrevouts;
+        sortedPrevouts.reserve(tx.vin.size());
+        for (const auto& txin : tx.vin) {
+            sortedPrevouts.push_back(txin.prevout);
+        }
+        std::sort(sortedPrevouts.begin(), sortedPrevouts.end());
+        if (std::ranges::adjacent_find(sortedPrevouts) != sortedPrevouts.end()) {
+            return state.Invalid(TxValidationResult::TX_CONSENSUS, "bad-txns-inputs-duplicate");
         }
 
-        for (const auto& txin : tx.vin) {
-            if (txin.prevout.IsNull()) {
+        for (const auto& in : sortedPrevouts) {
+            if (!in.hash.IsNull()) break; // invalid values can only be at the beginning
+            if (in.IsNull()) {
                 return state.Invalid(TxValidationResult::TX_CONSENSUS, "bad-txns-prevout-null");
             }
         }

From b5dc42874d20a23429293d9037a1fda99e2282c3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C5=91rinc?= <pap.lorinc@gmail.com>
Date: Tue, 15 Apr 2025 12:31:20 +0200
Subject: [PATCH 23/24] test: assert CScript allocation characteristics
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Verifies that script types are correctly allocated using prevector's direct (stack) or indirect (heap) storage based on their size:

Direct (stack) allocated script types (size ≤ 28 bytes):
* OP_RETURN (small)
* P2WPKH
* P2SH
* P2PKH

Indirect (heap) allocated script types (size > 28 bytes):
* P2WSH
* P2TR
* P2PK
* MULTISIG (small)

This test provides a baseline for verifying changes to prevector's inline capacity.
---
 src/test/script_tests.cpp | 85 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 85 insertions(+)

diff --git a/src/test/script_tests.cpp b/src/test/script_tests.cpp
index 4a5dc561cc3..700a0fd3427 100644
--- a/src/test/script_tests.cpp
+++ b/src/test/script_tests.cpp
@@ -1129,6 +1129,91 @@ BOOST_AUTO_TEST_CASE(script_CHECKMULTISIG23)
     BOOST_CHECK_MESSAGE(err == SCRIPT_ERR_INVALID_STACK_OPERATION, ScriptErrorString(err));
 }
 
+BOOST_AUTO_TEST_CASE(script_size_and_capacity_test)
+{
+    BOOST_CHECK_EQUAL(sizeof(prevector<28, unsigned char>), 32);
+    BOOST_CHECK_EQUAL(sizeof(CScriptBase), 32);
+    BOOST_CHECK_EQUAL(sizeof(CScript), 32);
+    BOOST_CHECK_EQUAL(sizeof(CTxOut), 40);
+
+    CKey dummyKey;
+    dummyKey.MakeNewKey(true);
+
+    std::vector<std::vector<uint8_t>> dummyVSolutions;
+
+    // Small OP_RETURN is stack allocated
+    {
+        const auto scriptSmallOpReturn{CScript() << OP_RETURN << std::vector<uint8_t>(10, 0xaa)};
+        BOOST_CHECK_EQUAL(Solver(scriptSmallOpReturn, dummyVSolutions), TxoutType::NULL_DATA);
+        BOOST_CHECK_EQUAL(scriptSmallOpReturn.size(), 12);
+        BOOST_CHECK_EQUAL(scriptSmallOpReturn.capacity(), 28);
+        BOOST_CHECK_EQUAL(scriptSmallOpReturn.allocated_memory(), 0);
+    }
+
+    // P2WPKH is stack allocated
+    {
+        const auto scriptP2WPKH{GetScriptForDestination(WitnessV0KeyHash{PKHash{CKeyID{CPubKey{dummyKey.GetPubKey()}.GetID()}}})};
+        BOOST_CHECK_EQUAL(Solver(scriptP2WPKH, dummyVSolutions), TxoutType::WITNESS_V0_KEYHASH);
+        BOOST_CHECK_EQUAL(scriptP2WPKH.size(), 22);
+        BOOST_CHECK_EQUAL(scriptP2WPKH.capacity(), 28);
+        BOOST_CHECK_EQUAL(scriptP2WPKH.allocated_memory(), 0);
+    }
+
+    // P2SH is stack allocated
+    {
+        const auto scriptP2SH{GetScriptForDestination(ScriptHash{CScript{} << OP_TRUE})};
+        BOOST_CHECK(scriptP2SH.IsPayToScriptHash());
+        BOOST_CHECK_EQUAL(scriptP2SH.size(), 23);
+        BOOST_CHECK_EQUAL(scriptP2SH.capacity(), 28);
+        BOOST_CHECK_EQUAL(scriptP2SH.allocated_memory(), 0);
+    }
+
+    // P2PKH is stack allocated
+    {
+        const auto scriptP2PKH{GetScriptForDestination(PKHash{CKeyID{CPubKey{dummyKey.GetPubKey()}.GetID()}})};
+        BOOST_CHECK_EQUAL(Solver(scriptP2PKH, dummyVSolutions), TxoutType::PUBKEYHASH);
+        BOOST_CHECK_EQUAL(scriptP2PKH.size(), 25);
+        BOOST_CHECK_EQUAL(scriptP2PKH.capacity(), 28);
+        BOOST_CHECK_EQUAL(scriptP2PKH.allocated_memory(), 0);
+    }
+
+    // P2WSH is heap allocated
+    {
+        const auto scriptP2WSH{GetScriptForDestination(WitnessV0ScriptHash{CScript{} << OP_TRUE})};
+        BOOST_CHECK(scriptP2WSH.IsPayToWitnessScriptHash());
+        BOOST_CHECK_EQUAL(scriptP2WSH.size(), 34);
+        BOOST_CHECK_EQUAL(scriptP2WSH.capacity(), 34);
+        BOOST_CHECK_EQUAL(scriptP2WSH.allocated_memory(), 34);
+    }
+
+    // P2TR is heap allocated
+    {
+        const auto scriptTaproot{GetScriptForDestination(WitnessV1Taproot{XOnlyPubKey{CPubKey{dummyKey.GetPubKey()}}})};
+        BOOST_CHECK_EQUAL(Solver(scriptTaproot, dummyVSolutions), TxoutType::WITNESS_V1_TAPROOT);
+        BOOST_CHECK_EQUAL(scriptTaproot.size(), 34);
+        BOOST_CHECK_EQUAL(scriptTaproot.capacity(), 34);
+        BOOST_CHECK_EQUAL(scriptTaproot.allocated_memory(), 34);
+    }
+
+    // P2PK is heap allocated
+    {
+        const auto scriptPubKey{GetScriptForRawPubKey(CPubKey{dummyKey.GetPubKey()})};
+        BOOST_CHECK_EQUAL(Solver(scriptPubKey, dummyVSolutions), TxoutType::PUBKEY);
+        BOOST_CHECK_EQUAL(scriptPubKey.size(), 35);
+        BOOST_CHECK_EQUAL(scriptPubKey.capacity(), 35);
+        BOOST_CHECK_EQUAL(scriptPubKey.allocated_memory(), 35);
+    }
+
+    // MULTISIG is always heap allocated
+    {
+        const auto scriptMultisig{GetScriptForMultisig(1, std::vector{2, CPubKey{dummyKey.GetPubKey()}})};
+        BOOST_CHECK_EQUAL(Solver(scriptMultisig, dummyVSolutions), TxoutType::MULTISIG);
+        BOOST_CHECK_EQUAL(scriptMultisig.size(), 71);
+        BOOST_CHECK_EQUAL(scriptMultisig.capacity(), 103);
+        BOOST_CHECK_EQUAL(scriptMultisig.allocated_memory(), 103);
+    }
+}
+
 /* Wrapper around ProduceSignature to combine two scriptsigs */
 SignatureData CombineSignatures(const CTxOut& txout, const CMutableTransaction& tx, const SignatureData& scriptSig1, const SignatureData& scriptSig2)
 {

From b6b4235c143eda6f0fa4ed86f2dcd47709b3b6ad Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C5=91rinc?= <pap.lorinc@gmail.com>
Date: Thu, 27 Mar 2025 18:40:08 +0100
Subject: [PATCH 24/24] Allocate `P2WSH`/`P2TR`/`P2PK` scripts on stack

The current `prevector` size of 28 bytes (chosen to fill the `sizeof(CScript)` aligned size) was introduced in 2015 (https://github.com/bitcoin/bitcoin/pull/6914) before SegWit and TapRoot.
However, the increasingly common `P2WSH` and `P2TR` scripts are both 34 bytes, and are forced to use heap (re)allocation rather than efficient inline storage.

The core trade-off of this change is to eliminate heap allocations for common 34-36 byte scripts at the cost of increasing the base memory footprint of all `CScript` objects by 8 bytes (while still respecting peak memory usage defined by `-dbcache`).

Increasing the `prevector` size allows these scripts to be stored on the stack, avoiding heap allocations, reducing potential memory fragmentation, and improving performance during cache flushes. Massif analysis confirms a lower stable memory usage after flushing, suggesting the elimination of heap allocations outweighs the larger base size for common workloads.

Due to memory alignment, increasing the `prevector` size to 36 bytes doesn't change the overall `sizeof(CScript)` compared to an increase to 34 bytes, allowing us to include `P2PK` scripts as well at no additional memory cost.

Performance benchmarks for AssumeUTXO load and flush show:
- Small dbcache (450MB): ~1% performance penalty due to more frequent flushes
- Large dbcache (4500-4500MB+): ~6-7% performance improvement due to fewer heap allocations

Full IBD and reindex-chainstate with larger `dbcache` values also show an overall ~3% speedup.

Co-authored-by: Ava Chow <github@achow101.com>
Co-authored-by: Andrew Toth <andrewstoth@gmail.com>
---
 src/bench/checkqueue.cpp            |  2 +-
 src/bench/prevector.cpp             | 36 ++++++++++++++---------------
 src/script/script.h                 |  2 +-
 src/test/script_tests.cpp           | 34 +++++++++++++--------------
 src/test/validation_flush_tests.cpp |  6 ++---
 5 files changed, 39 insertions(+), 41 deletions(-)

diff --git a/src/bench/checkqueue.cpp b/src/bench/checkqueue.cpp
index 8134154eb11..72eaa6588ad 100644
--- a/src/bench/checkqueue.cpp
+++ b/src/bench/checkqueue.cpp
@@ -16,7 +16,7 @@
 
 static const size_t BATCHES = 101;
 static const size_t BATCH_SIZE = 30;
-static const int PREVECTOR_SIZE = 28;
+static const int PREVECTOR_SIZE = 36;
 static const unsigned int QUEUE_BATCH_SIZE = 128;
 
 // This Benchmark tests the CheckQueue with a slightly realistic workload,
diff --git a/src/bench/prevector.cpp b/src/bench/prevector.cpp
index 60c43e86e1b..97e897519d8 100644
--- a/src/bench/prevector.cpp
+++ b/src/bench/prevector.cpp
@@ -27,22 +27,22 @@ template <typename T>
 static void PrevectorDestructor(benchmark::Bench& bench)
 {
     bench.batch(2).run([&] {
-        prevector<28, T> t0;
-        prevector<28, T> t1;
-        t0.resize(28);
-        t1.resize(29);
+        prevector<36, T> t0;
+        prevector<36, T> t1;
+        t0.resize(36);
+        t1.resize(37);
     });
 }
 
 template <typename T>
 static void PrevectorClear(benchmark::Bench& bench)
 {
-    prevector<28, T> t0;
-    prevector<28, T> t1;
+    prevector<36, T> t0;
+    prevector<36, T> t1;
     bench.batch(2).run([&] {
-        t0.resize(28);
+        t0.resize(36);
         t0.clear();
-        t1.resize(29);
+        t1.resize(37);
         t1.clear();
     });
 }
@@ -50,12 +50,12 @@ static void PrevectorClear(benchmark::Bench& bench)
 template <typename T>
 static void PrevectorResize(benchmark::Bench& bench)
 {
-    prevector<28, T> t0;
-    prevector<28, T> t1;
+    prevector<36, T> t0;
+    prevector<36, T> t1;
     bench.batch(4).run([&] {
-        t0.resize(28);
+        t0.resize(36);
         t0.resize(0);
-        t1.resize(29);
+        t1.resize(37);
         t1.resize(0);
     });
 }
@@ -64,8 +64,8 @@ template <typename T>
 static void PrevectorDeserialize(benchmark::Bench& bench)
 {
     DataStream s0{};
-    prevector<28, T> t0;
-    t0.resize(28);
+    prevector<36, T> t0;
+    t0.resize(36);
     for (auto x = 0; x < 900; ++x) {
         s0 << t0;
     }
@@ -74,7 +74,7 @@ static void PrevectorDeserialize(benchmark::Bench& bench)
         s0 << t0;
     }
     bench.batch(1000).run([&] {
-        prevector<28, T> t1;
+        prevector<36, T> t1;
         for (auto x = 0; x < 1000; ++x) {
             s0 >> t1;
         }
@@ -86,7 +86,7 @@ template <typename T>
 static void PrevectorFillVectorDirect(benchmark::Bench& bench)
 {
     bench.run([&] {
-        std::vector<prevector<28, T>> vec;
+        std::vector<prevector<36, T>> vec;
         vec.reserve(260);
         for (size_t i = 0; i < 260; ++i) {
             vec.emplace_back();
@@ -99,11 +99,11 @@ template <typename T>
 static void PrevectorFillVectorIndirect(benchmark::Bench& bench)
 {
     bench.run([&] {
-        std::vector<prevector<28, T>> vec;
+        std::vector<prevector<36, T>> vec;
         vec.reserve(260);
         for (size_t i = 0; i < 260; ++i) {
             // force allocation
-            vec.emplace_back(29, T{});
+            vec.emplace_back(37, T{});
         }
     });
 }
diff --git a/src/script/script.h b/src/script/script.h
index f4579849803..9c6d25eb157 100644
--- a/src/script/script.h
+++ b/src/script/script.h
@@ -406,7 +406,7 @@ private:
  * Tests in October 2015 showed use of this reduced dbcache memory usage by 23%
  *  and made an initial sync 13% faster.
  */
-typedef prevector<28, unsigned char> CScriptBase;
+typedef prevector<36, unsigned char> CScriptBase;
 
 bool GetScriptOp(CScriptBase::const_iterator& pc, CScriptBase::const_iterator end, opcodetype& opcodeRet, std::vector<unsigned char>* pvchRet);
 
diff --git a/src/test/script_tests.cpp b/src/test/script_tests.cpp
index 700a0fd3427..f240fe8949b 100644
--- a/src/test/script_tests.cpp
+++ b/src/test/script_tests.cpp
@@ -1131,10 +1131,10 @@ BOOST_AUTO_TEST_CASE(script_CHECKMULTISIG23)
 
 BOOST_AUTO_TEST_CASE(script_size_and_capacity_test)
 {
-    BOOST_CHECK_EQUAL(sizeof(prevector<28, unsigned char>), 32);
-    BOOST_CHECK_EQUAL(sizeof(CScriptBase), 32);
-    BOOST_CHECK_EQUAL(sizeof(CScript), 32);
-    BOOST_CHECK_EQUAL(sizeof(CTxOut), 40);
+    BOOST_CHECK_EQUAL(sizeof(prevector<34, uint8_t>), sizeof(prevector<36, uint8_t>));
+    BOOST_CHECK_EQUAL(sizeof(CScriptBase), 40);
+    BOOST_CHECK_EQUAL(sizeof(CScript), 40);
+    BOOST_CHECK_EQUAL(sizeof(CTxOut), 48);
 
     CKey dummyKey;
     dummyKey.MakeNewKey(true);
@@ -1146,7 +1146,7 @@ BOOST_AUTO_TEST_CASE(script_size_and_capacity_test)
         const auto scriptSmallOpReturn{CScript() << OP_RETURN << std::vector<uint8_t>(10, 0xaa)};
         BOOST_CHECK_EQUAL(Solver(scriptSmallOpReturn, dummyVSolutions), TxoutType::NULL_DATA);
         BOOST_CHECK_EQUAL(scriptSmallOpReturn.size(), 12);
-        BOOST_CHECK_EQUAL(scriptSmallOpReturn.capacity(), 28);
+        BOOST_CHECK_EQUAL(scriptSmallOpReturn.capacity(), 36);
         BOOST_CHECK_EQUAL(scriptSmallOpReturn.allocated_memory(), 0);
     }
 
@@ -1155,7 +1155,7 @@ BOOST_AUTO_TEST_CASE(script_size_and_capacity_test)
         const auto scriptP2WPKH{GetScriptForDestination(WitnessV0KeyHash{PKHash{CKeyID{CPubKey{dummyKey.GetPubKey()}.GetID()}}})};
         BOOST_CHECK_EQUAL(Solver(scriptP2WPKH, dummyVSolutions), TxoutType::WITNESS_V0_KEYHASH);
         BOOST_CHECK_EQUAL(scriptP2WPKH.size(), 22);
-        BOOST_CHECK_EQUAL(scriptP2WPKH.capacity(), 28);
+        BOOST_CHECK_EQUAL(scriptP2WPKH.capacity(), 36);
         BOOST_CHECK_EQUAL(scriptP2WPKH.allocated_memory(), 0);
     }
 
@@ -1164,7 +1164,7 @@ BOOST_AUTO_TEST_CASE(script_size_and_capacity_test)
         const auto scriptP2SH{GetScriptForDestination(ScriptHash{CScript{} << OP_TRUE})};
         BOOST_CHECK(scriptP2SH.IsPayToScriptHash());
         BOOST_CHECK_EQUAL(scriptP2SH.size(), 23);
-        BOOST_CHECK_EQUAL(scriptP2SH.capacity(), 28);
+        BOOST_CHECK_EQUAL(scriptP2SH.capacity(), 36);
         BOOST_CHECK_EQUAL(scriptP2SH.allocated_memory(), 0);
     }
 
@@ -1173,35 +1173,35 @@ BOOST_AUTO_TEST_CASE(script_size_and_capacity_test)
         const auto scriptP2PKH{GetScriptForDestination(PKHash{CKeyID{CPubKey{dummyKey.GetPubKey()}.GetID()}})};
         BOOST_CHECK_EQUAL(Solver(scriptP2PKH, dummyVSolutions), TxoutType::PUBKEYHASH);
         BOOST_CHECK_EQUAL(scriptP2PKH.size(), 25);
-        BOOST_CHECK_EQUAL(scriptP2PKH.capacity(), 28);
+        BOOST_CHECK_EQUAL(scriptP2PKH.capacity(), 36);
         BOOST_CHECK_EQUAL(scriptP2PKH.allocated_memory(), 0);
     }
 
-    // P2WSH is heap allocated
+    // P2WSH is stack allocated
     {
         const auto scriptP2WSH{GetScriptForDestination(WitnessV0ScriptHash{CScript{} << OP_TRUE})};
         BOOST_CHECK(scriptP2WSH.IsPayToWitnessScriptHash());
         BOOST_CHECK_EQUAL(scriptP2WSH.size(), 34);
-        BOOST_CHECK_EQUAL(scriptP2WSH.capacity(), 34);
-        BOOST_CHECK_EQUAL(scriptP2WSH.allocated_memory(), 34);
+        BOOST_CHECK_EQUAL(scriptP2WSH.capacity(), 36);
+        BOOST_CHECK_EQUAL(scriptP2WSH.allocated_memory(), 0);
     }
 
-    // P2TR is heap allocated
+    // P2TR is stack allocated
     {
         const auto scriptTaproot{GetScriptForDestination(WitnessV1Taproot{XOnlyPubKey{CPubKey{dummyKey.GetPubKey()}}})};
         BOOST_CHECK_EQUAL(Solver(scriptTaproot, dummyVSolutions), TxoutType::WITNESS_V1_TAPROOT);
         BOOST_CHECK_EQUAL(scriptTaproot.size(), 34);
-        BOOST_CHECK_EQUAL(scriptTaproot.capacity(), 34);
-        BOOST_CHECK_EQUAL(scriptTaproot.allocated_memory(), 34);
+        BOOST_CHECK_EQUAL(scriptTaproot.capacity(), 36);
+        BOOST_CHECK_EQUAL(scriptTaproot.allocated_memory(), 0);
     }
 
-    // P2PK is heap allocated
+    // P2PK is stack allocated
     {
         const auto scriptPubKey{GetScriptForRawPubKey(CPubKey{dummyKey.GetPubKey()})};
         BOOST_CHECK_EQUAL(Solver(scriptPubKey, dummyVSolutions), TxoutType::PUBKEY);
         BOOST_CHECK_EQUAL(scriptPubKey.size(), 35);
-        BOOST_CHECK_EQUAL(scriptPubKey.capacity(), 35);
-        BOOST_CHECK_EQUAL(scriptPubKey.allocated_memory(), 35);
+        BOOST_CHECK_EQUAL(scriptPubKey.capacity(), 36);
+        BOOST_CHECK_EQUAL(scriptPubKey.allocated_memory(), 0);
     }
 
     // MULTISIG is always heap allocated
diff --git a/src/test/validation_flush_tests.cpp b/src/test/validation_flush_tests.cpp
index c325f7deb2b..b109c5be078 100644
--- a/src/test/validation_flush_tests.cpp
+++ b/src/test/validation_flush_tests.cpp
@@ -26,10 +26,8 @@ BOOST_AUTO_TEST_CASE(getcoinscachesizestate)
     LOCK(::cs_main);
     auto& view = chainstate.CoinsTip();
 
-    // The number of bytes consumed by coin's heap data, i.e. CScript
-    // (prevector<28, unsigned char>) when assigned 56 bytes of data per above.
-    //
-    // See also: Coin::DynamicMemoryUsage().
+    // The number of bytes consumed by coin's heap data, i.e. CScript (prevector<36, unsigned char>)
+    // when assigned 56 bytes of data per above. See also: Coin::DynamicMemoryUsage().
     constexpr unsigned int COIN_SIZE = is_64_bit ? 80 : 64;
 
     auto print_view_mem_usage = [](CCoinsViewCache& view) {