test: complete BDB parser (handle internal/overflow pages, support all page sizes)

This aims to complete our test framework BDB parser to reflect our read-only BDB parser in the wallet codebase. This could be useful both for making review of #26606 easier and to also possibly improve our functional tests for the BDB parser by comparing with an alternative implementation.
2025-04-29 14:59:39 -04:00 · 2024-05-16 16:17:05 +02:00 · 2024-05-16 16:17:05 +02:00 · 01ddd9f646
commit 01ddd9f646
parent d94adc7270
1 changed files with 95 additions and 36 deletions
--- a/test/functional/test_framework/bdb.py
+++ b/test/functional/test_framework/bdb.py
@ -6,20 +6,18 @@
 Utilities for working directly with the wallet's BDB database file
 This is specific to the configuration of BDB used in this project:
    - pagesize: 4096 bytes
    - Outer database contains single subdatabase named 'main'
    - btree
-    - btree leaf pages
+    - btree internal, leaf and overflow pages
-Each key-value pair is two entries in a btree leaf. The first is the key, the one that follows
+Each key-value pair is two entries in a btree leaf, which optionally refers to overflow pages
 if the data doesn't fit into a single page. The first entry is the key, the one that follows
 is the value. And so on. Note that the entry data is itself not in the correct order. Instead
 entry offsets are stored in the correct order and those offsets are needed to then retrieve
-the data itself.
+the data itself. Note that this implementation currently only supports reading databases that
 are in the same endianness as the host.
 Page format can be found in BDB source code dbinc/db_page.h
 This only implements the deserialization of btree metadata pages and normal btree pages. Overflow
 pages are not implemented but may be needed in the future if dealing with wallets with large
 transactions.
 `db_dump -da wallet.dat` is useful to see the data in a wallet.dat BDB file
 """
@ -27,23 +25,36 @@ transactions.
 import struct
 # Important constants
-PAGESIZE = 4096
+PAGE_HEADER_SIZE = 26
 OUTER_META_PAGE = 0
 INNER_META_PAGE = 2
 # Page type values
 BTREE_INTERNAL = 3
 BTREE_LEAF = 5
 OVERFLOW_DATA = 7
 BTREE_META = 9
 # Record type values
 RECORD_KEYDATA = 1
 RECORD_OVERFLOW_DATA = 3
 # Some magic numbers for sanity checking
 BTREE_MAGIC = 0x053162
 DB_VERSION = 9
 SUBDATABASE_NAME = b'main'
-# Deserializes a leaf page into a dict.
+# Deserializes an internal, leaf or overflow page into a dict.
-# Btree internal pages have the same header, for those, return None.
+# In addition to the common page header fields, the result contains an 'entries'
-# For the btree leaf pages, deserialize them and put all the data into a dict
+# array of dicts with the following fields, depending on the page type:
-def dump_leaf_page(data):
+#     internal page [BTREE_INTERNAL]:
 #         - 'page_num': referenced page number (used to find further pages to process)
 #     leaf page [BTREE_LEAF]:
 #         - 'record_type': record type, must be RECORD_KEYDATA or RECORD_OVERFLOW_DATA
 #         - 'data': binary data (key or value payload),  if record type is RECORD_KEYDATA
 #         - 'page_num': referenced overflow page number, if record type is RECORD_OVERFLOW_DATA
 #     overflow page [OVERFLOW_DATA]:
 #         - 'data': binary data (part of key or value payload)
 def dump_page(data):
    page_info = {}
    page_header = data[0:26]
    _, pgno, prev_pgno, next_pgno, entries, hf_offset, level, pg_type = struct.unpack('QIIIHHBB', page_header)
@ -56,20 +67,35 @@ def dump_leaf_page(data):
    page_info['entry_offsets'] = struct.unpack('{}H'.format(entries), data[26:26 + entries * 2])
    page_info['entries'] = []
-    if pg_type == BTREE_INTERNAL:
+    assert pg_type in (BTREE_INTERNAL, BTREE_LEAF, OVERFLOW_DATA)
        # Skip internal pages. These are the internal nodes of the btree and don't contain anything relevant to us
        return None
-    assert pg_type == BTREE_LEAF, 'A non-btree leaf page has been encountered while dumping leaves'
+    if pg_type == OVERFLOW_DATA:
        assert entries == 1
        page_info['entries'].append({'data': data[26:26 + hf_offset]})
        return page_info
    for i in range(0, entries):
        entry = {}
        offset = page_info['entry_offsets'][i]
-        entry = {'offset': offset}
+        record_header = data[offset:offset + 3]
-        page_data_header = data[offset:offset + 3]
+        offset += 3
-        e_len, pg_type = struct.unpack('HB', page_data_header)
+        e_len, record_type = struct.unpack('HB', record_header)
-        entry['len'] = e_len
+
-        entry['pg_type'] = pg_type
+        if pg_type == BTREE_INTERNAL:
-        entry['data'] = data[offset + 3:offset + 3 + e_len]
+            assert record_type == RECORD_KEYDATA
            internal_record_data = data[offset:offset + 9]
            _, page_num, _ = struct.unpack('=BII', internal_record_data)
            entry['page_num'] = page_num
        elif pg_type == BTREE_LEAF:
            assert record_type in (RECORD_KEYDATA, RECORD_OVERFLOW_DATA)
            entry['record_type'] = record_type
            if record_type == RECORD_KEYDATA:
                entry['data'] = data[offset:offset + e_len]
            elif record_type == RECORD_OVERFLOW_DATA:
                overflow_record_data = data[offset:offset + 9]
                _, page_num, _ = struct.unpack('=BII', overflow_record_data)
                entry['page_num'] = page_num
        page_info['entries'].append(entry)
    return page_info
@ -115,16 +141,27 @@ def dump_meta_page(page):
    return metadata
 # Given the dict from dump_leaf_page, get the key-value pairs and put them into a dict
-def extract_kv_pairs(page_data):
+def extract_kv_pairs(page_data, pages):
    out = {}
    last_key = None
    for i, entry in enumerate(page_data['entries']):
        data = b''
        if entry['record_type'] == RECORD_KEYDATA:
            data = entry['data']
        elif entry['record_type'] == RECORD_OVERFLOW_DATA:
            next_page = entry['page_num']
            while next_page != 0:
                opage = pages[next_page]
                opage_info = dump_page(opage)
                data += opage_info['entries'][0]['data']
                next_page = opage_info['next_pgno']
        # By virtue of these all being pairs, even number entries are keys, and odd are values
        if i % 2 == 0:
            out[entry['data']] = b''
-            last_key = entry['data']
+            last_key = data
        else:
-            out[last_key] = entry['data']
+            out[last_key] = data
    return out
 # Extract the key-value pairs of the BDB file given in filename
@ -132,20 +169,42 @@ def dump_bdb_kv(filename):
    # Read in the BDB file and start deserializing it
    pages = []
    with open(filename, 'rb') as f:
-        data = f.read(PAGESIZE)
+        # Determine pagesize first
        data = f.read(PAGE_HEADER_SIZE)
        pagesize = struct.unpack('I', data[20:24])[0]
        assert pagesize in (512, 1024, 2048, 4096, 8192, 16384, 32768, 65536)
        # Read rest of first page
        data += f.read(pagesize - PAGE_HEADER_SIZE)
        assert len(data) == pagesize
        # Read all remaining pages
        while len(data) > 0:
            pages.append(data)
-            data = f.read(PAGESIZE)
+            data = f.read(pagesize)
-    # Sanity check the meta pages
+    # Sanity check the meta pages, read root page
-    dump_meta_page(pages[OUTER_META_PAGE])
+    outer_meta_info = dump_meta_page(pages[OUTER_META_PAGE])
-    dump_meta_page(pages[INNER_META_PAGE])
+    root_page_info = dump_page(pages[outer_meta_info['root']])
    assert root_page_info['pg_type'] == BTREE_LEAF
    assert len(root_page_info['entries']) == 2
    assert root_page_info['entries'][0]['data'] == SUBDATABASE_NAME
    assert len(root_page_info['entries'][1]['data']) == 4
    inner_meta_page = int.from_bytes(root_page_info['entries'][1]['data'], 'big')
    inner_meta_info = dump_meta_page(pages[inner_meta_page])
-    # Fetch the kv pairs from the leaf pages
+    # Fetch the kv pairs from the pages
    kv = {}
-    for i in range(3, len(pages)):
+    pages_to_process = [inner_meta_info['root']]
-        info = dump_leaf_page(pages[i])
+    while len(pages_to_process) > 0:
-        if info is not None:
+        curr_page_no = pages_to_process.pop()
-            info_kv = extract_kv_pairs(info)
+        assert curr_page_no <= outer_meta_info['last_pgno']
        info = dump_page(pages[curr_page_no])
        assert info['pg_type'] in (BTREE_INTERNAL, BTREE_LEAF)
        if info['pg_type'] == BTREE_INTERNAL:
            for entry in info['entries']:
                pages_to_process.append(entry['page_num'])
        elif info['pg_type'] == BTREE_LEAF:
            info_kv = extract_kv_pairs(info, pages)
            kv = {**kv, **info_kv}
    return kv