test: complete BDB parser (handle internal/overflow pages, support all page sizes)
This aims to complete our test framework BDB parser to match the read-only BDB parser in the wallet codebase. This could be useful both for making review of #26606 easier and for possibly improving our functional tests for the BDB parser by comparing it against an alternative implementation.
parent d94adc7270
commit 01ddd9f646

1 changed file with 95 additions and 36 deletions: test/functional/test_framework/bdb.py
@@ -6,20 +6,18 @@
 Utilities for working directly with the wallet's BDB database file
 
 This is specific to the configuration of BDB used in this project:
-- pagesize: 4096 bytes
 - Outer database contains single subdatabase named 'main'
 - btree
-- btree leaf pages
+- btree internal, leaf and overflow pages
 
-Each key-value pair is two entries in a btree leaf. The first is the key, the one that follows
+Each key-value pair is two entries in a btree leaf, which optionally refers to overflow pages
+if the data doesn't fit into a single page. The first entry is the key, the one that follows
 is the value. And so on. Note that the entry data is itself not in the correct order. Instead
 entry offsets are stored in the correct order and those offsets are needed to then retrieve
-the data itself.
+the data itself. Note that this implementation currently only supports reading databases that
+are in the same endianness as the host.
 
 Page format can be found in BDB source code dbinc/db_page.h
-This only implements the deserialization of btree metadata pages and normal btree pages. Overflow
-pages are not implemented but may be needed in the future if dealing with wallets with large
-transactions.
 
 `db_dump -da wallet.dat` is useful to see the data in a wallet.dat BDB file
 """
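
To make the docstring's offset indirection concrete: the entry-offset table sits
directly after the common 26-byte page header, and it, not the entry data, is
stored in logical order. A minimal sketch (not part of the commit) that pulls
the table out of the raw bytes of one btree page, reusing the parser's own
format strings:

    import struct

    def entry_offsets(page):
        # common 26-byte page header, as described in BDB's dbinc/db_page.h
        _, pgno, prev_pgno, next_pgno, entries, hf_offset, level, pg_type = \
            struct.unpack('QIIIHHBB', page[0:26])
        # offsets are stored in logical order; the data they point at is not,
        # so all lookups have to go through this table
        return struct.unpack('{}H'.format(entries), page[26:26 + entries * 2])
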
@@ -27,23 +25,36 @@ transactions.
 import struct
 
 # Important constants
-PAGESIZE = 4096
+PAGE_HEADER_SIZE = 26
 OUTER_META_PAGE = 0
-INNER_META_PAGE = 2
 
 # Page type values
 BTREE_INTERNAL = 3
 BTREE_LEAF = 5
+OVERFLOW_DATA = 7
 BTREE_META = 9
 
+# Record type values
+RECORD_KEYDATA = 1
+RECORD_OVERFLOW_DATA = 3
+
 # Some magic numbers for sanity checking
 BTREE_MAGIC = 0x053162
 DB_VERSION = 9
+SUBDATABASE_NAME = b'main'
 
-# Deserializes a leaf page into a dict.
-# Btree internal pages have the same header, for those, return None.
-# For the btree leaf pages, deserialize them and put all the data into a dict
-def dump_leaf_page(data):
+# Deserializes an internal, leaf or overflow page into a dict.
+# In addition to the common page header fields, the result contains an 'entries'
+# array of dicts with the following fields, depending on the page type:
+#   internal page [BTREE_INTERNAL]:
+#     - 'page_num': referenced page number (used to find further pages to process)
+#   leaf page [BTREE_LEAF]:
+#     - 'record_type': record type, must be RECORD_KEYDATA or RECORD_OVERFLOW_DATA
+#     - 'data': binary data (key or value payload), if record type is RECORD_KEYDATA
+#     - 'page_num': referenced overflow page number, if record type is RECORD_OVERFLOW_DATA
+#   overflow page [OVERFLOW_DATA]:
+#     - 'data': binary data (part of key or value payload)
+def dump_page(data):
     page_info = {}
     page_header = data[0:26]
     _, pgno, prev_pgno, next_pgno, entries, hf_offset, level, pg_type = struct.unpack('QIIIHHBB', page_header)
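
For orientation, a leaf page holding one small key-value pair would come back
from dump_page shaped roughly like this (illustrative field values only, not
the output of an actual run):

    {
        'pgno': 4, 'prev_pgno': 0, 'next_pgno': 0,
        'hf_offset': 4082, 'level': 1, 'pg_type': BTREE_LEAF,
        'entry_offsets': (4090, 4082),
        'entries': [
            {'record_type': RECORD_KEYDATA, 'data': b'key'},
            {'record_type': RECORD_KEYDATA, 'data': b'value'},
        ],
    }
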
@@ -56,20 +67,35 @@ def dump_leaf_page(data):
     page_info['entry_offsets'] = struct.unpack('{}H'.format(entries), data[26:26 + entries * 2])
     page_info['entries'] = []
 
-    if pg_type == BTREE_INTERNAL:
-        # Skip internal pages. These are the internal nodes of the btree and don't contain anything relevant to us
-        return None
+    assert pg_type in (BTREE_INTERNAL, BTREE_LEAF, OVERFLOW_DATA)
 
-    assert pg_type == BTREE_LEAF, 'A non-btree leaf page has been encountered while dumping leaves'
+    if pg_type == OVERFLOW_DATA:
+        assert entries == 1
+        page_info['entries'].append({'data': data[26:26 + hf_offset]})
+        return page_info
 
     for i in range(0, entries):
+        entry = {}
         offset = page_info['entry_offsets'][i]
-        entry = {'offset': offset}
-        page_data_header = data[offset:offset + 3]
-        e_len, pg_type = struct.unpack('HB', page_data_header)
-        entry['len'] = e_len
-        entry['pg_type'] = pg_type
-        entry['data'] = data[offset + 3:offset + 3 + e_len]
+        record_header = data[offset:offset + 3]
+        offset += 3
+        e_len, record_type = struct.unpack('HB', record_header)
+
+        if pg_type == BTREE_INTERNAL:
+            assert record_type == RECORD_KEYDATA
+            internal_record_data = data[offset:offset + 9]
+            _, page_num, _ = struct.unpack('=BII', internal_record_data)
+            entry['page_num'] = page_num
+        elif pg_type == BTREE_LEAF:
+            assert record_type in (RECORD_KEYDATA, RECORD_OVERFLOW_DATA)
+            entry['record_type'] = record_type
+            if record_type == RECORD_KEYDATA:
+                entry['data'] = data[offset:offset + e_len]
+            elif record_type == RECORD_OVERFLOW_DATA:
+                overflow_record_data = data[offset:offset + 9]
+                _, page_num, _ = struct.unpack('=BII', overflow_record_data)
+                entry['page_num'] = page_num
+
         page_info['entries'].append(entry)
 
     return page_info
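
One way to sanity-check the new overflow handling in isolation is to hand
dump_page a synthetic page. A minimal sketch (not part of the commit; assumes
the module is importable as test_framework.bdb, as in the functional test
framework):

    import struct
    from test_framework import bdb

    payload = b'overflowed-data'
    # 26-byte common header: lsn, pgno, prev_pgno, next_pgno, entries,
    # hf_offset (carries the payload length), level, pg_type
    header = struct.pack('QIIIHHBB', 0, 7, 0, 0, 1, len(payload), 0, bdb.OVERFLOW_DATA)
    page_info = bdb.dump_page(header + payload)
    assert page_info['entries'][0]['data'] == payload
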
@@ -115,16 +141,27 @@ def dump_meta_page(page):
     return metadata
 
 # Given the dict from dump_leaf_page, get the key-value pairs and put them into a dict
-def extract_kv_pairs(page_data):
+def extract_kv_pairs(page_data, pages):
     out = {}
     last_key = None
     for i, entry in enumerate(page_data['entries']):
+        data = b''
+        if entry['record_type'] == RECORD_KEYDATA:
+            data = entry['data']
+        elif entry['record_type'] == RECORD_OVERFLOW_DATA:
+            next_page = entry['page_num']
+            while next_page != 0:
+                opage = pages[next_page]
+                opage_info = dump_page(opage)
+                data += opage_info['entries'][0]['data']
+                next_page = opage_info['next_pgno']
+
         # By virtue of these all being pairs, even number entries are keys, and odd are values
         if i % 2 == 0:
             out[entry['data']] = b''
-            last_key = entry['data']
+            last_key = data
         else:
-            out[last_key] = entry['data']
+            out[last_key] = data
     return out
 
 # Extract the key-value pairs of the BDB file given in filename
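
The overflow branch above walks a linked list: each overflow page's next_pgno
field points at the page holding the next chunk of the payload, and a
next_pgno of 0 terminates the chain. The same walk over hand-built pages, as a
self-contained sketch (the make_overflow_page helper is hypothetical, not part
of the commit):

    import struct

    OVERFLOW_DATA = 7

    def make_overflow_page(pgno, next_pgno, chunk):
        # 26-byte common header followed directly by the raw chunk bytes;
        # hf_offset carries the chunk length, entries is always 1
        return struct.pack('QIIIHHBB', 0, pgno, 0, next_pgno, 1, len(chunk), 0, OVERFLOW_DATA) + chunk

    # the index in the pages list doubles as the page number, as in dump_bdb_kv
    pages = [b'', make_overflow_page(1, 2, b'hello '), make_overflow_page(2, 0, b'world')]
    data, next_page = b'', 1
    while next_page != 0:
        page = pages[next_page]
        _, _, _, next_page, _, hf_offset, _, _ = struct.unpack('QIIIHHBB', page[0:26])
        data += page[26:26 + hf_offset]
    assert data == b'hello world'
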
@@ -132,20 +169,42 @@ def dump_bdb_kv(filename):
     # Read in the BDB file and start deserializing it
     pages = []
     with open(filename, 'rb') as f:
-        data = f.read(PAGESIZE)
+        # Determine pagesize first
+        data = f.read(PAGE_HEADER_SIZE)
+        pagesize = struct.unpack('I', data[20:24])[0]
+        assert pagesize in (512, 1024, 2048, 4096, 8192, 16384, 32768, 65536)
+
+        # Read rest of first page
+        data += f.read(pagesize - PAGE_HEADER_SIZE)
+        assert len(data) == pagesize
+
+        # Read all remaining pages
         while len(data) > 0:
             pages.append(data)
-            data = f.read(PAGESIZE)
+            data = f.read(pagesize)
 
-    # Sanity check the meta pages
-    dump_meta_page(pages[OUTER_META_PAGE])
-    dump_meta_page(pages[INNER_META_PAGE])
+    # Sanity check the meta pages, read root page
+    outer_meta_info = dump_meta_page(pages[OUTER_META_PAGE])
+    root_page_info = dump_page(pages[outer_meta_info['root']])
+    assert root_page_info['pg_type'] == BTREE_LEAF
+    assert len(root_page_info['entries']) == 2
+    assert root_page_info['entries'][0]['data'] == SUBDATABASE_NAME
+    assert len(root_page_info['entries'][1]['data']) == 4
+    inner_meta_page = int.from_bytes(root_page_info['entries'][1]['data'], 'big')
+    inner_meta_info = dump_meta_page(pages[inner_meta_page])
 
-    # Fetch the kv pairs from the leaf pages
+    # Fetch the kv pairs from the pages
     kv = {}
-    for i in range(3, len(pages)):
-        info = dump_leaf_page(pages[i])
-        if info is not None:
-            info_kv = extract_kv_pairs(info)
+    pages_to_process = [inner_meta_info['root']]
+    while len(pages_to_process) > 0:
+        curr_page_no = pages_to_process.pop()
+        assert curr_page_no <= outer_meta_info['last_pgno']
+        info = dump_page(pages[curr_page_no])
+        assert info['pg_type'] in (BTREE_INTERNAL, BTREE_LEAF)
+        if info['pg_type'] == BTREE_INTERNAL:
+            for entry in info['entries']:
+                pages_to_process.append(entry['page_num'])
+        elif info['pg_type'] == BTREE_LEAF:
+            info_kv = extract_kv_pairs(info, pages)
             kv = {**kv, **info_kv}
     return kv
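
With this, the parser first reads the pagesize out of the outer metadata page,
locates the inner 'main' subdatabase via the root page, and then walks the
btree from its root instead of scanning fixed 4096-byte pages. Typical use, as
a sketch (the filename is an example):

    from test_framework.bdb import dump_bdb_kv

    kv = dump_bdb_kv('wallet.dat')  # dict of raw key bytes -> raw value bytes
    for key, value in kv.items():
        print(key.hex(), value.hex())

The result can then be cross-checked against `db_dump -da wallet.dat`, or
against the read-only BDB parser in the wallet codebase (see #26606).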