Merge 10845cd7cc into c5e44a0435

2025-04-29 14:59:39 -04:00 · 2025-04-29 12:02:22 +02:00 · 2025-04-29 12:02:22 +02:00 · 1ee8ad43c1
commit 1ee8ad43c1
parent c5e44a0435 10845cd7cc
4 changed files with 215 additions and 29 deletions
--- a/test/functional/feature_framework_startup_failures.py
+++ b/test/functional/feature_framework_startup_failures.py
@ -0,0 +1,172 @@
+#!/usr/bin/env python3
+# Copyright (c) 2025-present The Bitcoin Core developers
+# Distributed under the MIT software license, see the accompanying
+# file COPYING or http://www.opensource.org/licenses/mit-license.php.
+"""
+Verify framework startup failures only raise one exception since
+multiple exceptions being raised muddies the waters of what actually
+went wrong. We should maintain this bar of only raising one exception as
+long as additional maintenance and complexity is low.
+
+Test relaunches itself into child processes in order to trigger failures
+without the parent process' BitcoinTestFramework also failing.
+"""
+
+from test_framework.util import (
+    assert_raises_message,
+    rpc_port,
+)
+from test_framework.test_framework import BitcoinTestFramework
+
+from hashlib import md5
+from os import linesep
+import re
+import subprocess
+import sys
+import time
+
+class FeatureFrameworkStartupFailures(BitcoinTestFramework):
+    def set_test_params(self):
+        self.num_nodes = 1
+
+    def setup_network(self):
+        # Don't start the node yet, as we want to measure during run_test.
+        self.add_nodes(self.num_nodes, self.extra_args)
+
+    # Launches a child test process that runs this same file, but instantiates
+    # a child test. Verifies that it raises only the expected exception, once.
+    def _verify_startup_failure(self, test, internal_args, expected_exception):
+        # Inherit args from parent, only modifying tmpdir so children don't fail
+        # as a cause of colliding with the parent dir.
+        parent_args = sys.argv.copy()
+        assert self.options.tmpdir, "Framework should always set tmpdir."
+        i, path = next(((i, m[1]) for i, arg in enumerate(parent_args) if (m := re.match(r'--tm(?:p(?:d(?:ir?)?)?)?=(.+)', arg))),
+                       (len(parent_args), self.options.tmpdir))
+        subdir = md5(expected_exception.encode('utf-8')).hexdigest()[:8]
+        parent_args[i:i + 1] = [f"--tmpdir={path}/{subdir}"]
+        args = [sys.executable] + parent_args + [f"--internal_test={test.__name__}"] + internal_args
+
+        try:
+            # The subprocess encoding argument gives different results for e.output
+            # on Linux/Windows, so handle decoding by ourselves for consistency.
+            output = subprocess.run(args, timeout=60 * self.options.timeout_factor,
+                                    stdout=subprocess.PIPE, stderr=subprocess.STDOUT).stdout.decode("utf-8")
+        except subprocess.TimeoutExpired as e:
+            print("Unexpected child process timeout!\n"
+                  "WARNING: Timeouts like this halt execution of TestNode logic, "
+                  "meaning dangling bitcoind processes are to be expected.\n"
+                  f"<CHILD OUTPUT BEGIN>\n{e.output.decode('utf-8')}\n<CHILD OUTPUT END>",
+                  file=sys.stderr)
+            raise
+
+        errors = []
+        if (n := output.count("Traceback")) != 1:
+            errors.append(f"Found {n}/1 tracebacks - expecting exactly one with no knock-on exceptions.")
+        if (n := len(re.findall(expected_exception, output))) != 1:
+            errors.append(f"Found {n}/1 occurrences of the specific exception: {expected_exception}")
+        if (n := output.count("Test failed. Test logging available at")) != 1:
+            errors.append(f"Found {n}/1 test failure output messages.")
+
+        assert not errors, f"Child test didn't contain (only) expected errors:\n{linesep.join(errors)}\n<CHILD OUTPUT BEGIN>\n{output}\n<CHILD OUTPUT END>\n"
+
+    def run_test(self):
+        self.log.info("Verifying _verify_startup_failure() functionality (self-check).")
+        assert_raises_message(
+            AssertionError,
+            ("Child test didn't contain (only) expected errors:\n" +
+             linesep.join(["Found 0/1 tracebacks - expecting exactly one with no knock-on exceptions.",
+                           "Found 0/1 occurrences of the specific exception: NonExistentError",
+                           "Found 0/1 test failure output messages."])).encode("unicode_escape").decode("utf-8"),
+            self._verify_startup_failure,
+            TestSuccess, [],
+            "NonExistentError",
+        )
+
+        self.log.info("Parent process is measuring node startup duration in order to obtain a reasonable timeout value for later test...")
+        node_start_time = time.time()
+        self.nodes[0].start()
+        self.nodes[0].wait_for_rpc_connection()
+        node_start_duration = time.time() - node_start_time
+        self.nodes[0].stop_node()
+        self.log.info(f"...measured {node_start_duration:.1f}s.")
+
+        self.log.info("Verifying inability to connect to bitcoind's RPC interface due to wrong port results in one exception containing at least one OSError.")
+        self._verify_startup_failure(
+            TestWrongRpcPortStartupFailure, [f"--internal_node_start_duration={node_start_duration}"],
+            r"AssertionError: \[node 0\] Unable to connect to bitcoind after \d+s \(ignored errors: {[^}]*'OSError \w+'?: \d+[^}]*}, latest error: \w+\([^)]+\)\)"
+        )
+
+        self.log.info("Verifying startup failure due to invalid arg results in only one exception.")
+        self._verify_startup_failure(
+            TestInitErrorStartupFailure, [],
+            r"FailedToStartError: \[node 0\] bitcoind exited with status 1 during initialization\. Error: Error parsing command line arguments: Invalid parameter -nonexistentarg"
+        )
+
+        self.log.info("Verifying start() then stop_node() on a node without wait_for_rpc_connection() in between raises an assert.")
+        self._verify_startup_failure(
+            TestStartStopStartupFailure, [],
+            r"AssertionError: \[node 0\] Should only call stop_node\(\) on a running node after wait_for_rpc_connection\(\) succeeded\. Did you forget to call the latter after start\(\)\? Not connected to process: \d+"
+        )
+
+class InternalTestMixin:
+    def add_options(self, parser):
+        # Just here to silence unrecognized argument error, we actually read the value in the if-main at the bottom.
+        parser.add_argument("--internal_test", dest="internal_never_read", help="ONLY TO BE USED WHEN TEST RELAUNCHES ITSELF")
+
+class TestWrongRpcPortStartupFailure(InternalTestMixin, BitcoinTestFramework):
+    def add_options(self, parser):
+        parser.add_argument("--internal_node_start_duration", dest="node_start_duration", help="ONLY TO BE USED WHEN TEST RELAUNCHES ITSELF", type=float)
+        InternalTestMixin.add_options(self, parser)
+
+    def set_test_params(self):
+        self.num_nodes = 1
+        # Override RPC listen port to something TestNode isn't expecting so that
+        # we are unable to establish an RPC connection.
+        self.extra_args = [[f"-rpcport={rpc_port(2)}"]]
+        # Override the timeout to avoid waiting unnecessarily long to realize
+        # nothing is on that port. Divide by timeout_factor to counter
+        # multiplication in base, 2 * node_start_duration should be enough.
+        self.rpc_timeout = max(3, 2 * self.options.node_start_duration) / self.options.timeout_factor
+
+    def run_test(self):
+        assert False, "Should have failed earlier during startup."
+
+class TestInitErrorStartupFailure(InternalTestMixin, BitcoinTestFramework):
+    def set_test_params(self):
+        self.num_nodes = 1
+        self.extra_args = [["-nonexistentarg"]]
+
+    def run_test(self):
+        assert False, "Should have failed earlier during startup."
+
+class TestStartStopStartupFailure(InternalTestMixin, BitcoinTestFramework):
+    def set_test_params(self):
+        self.num_nodes = 1
+
+    def setup_network(self):
+        self.add_nodes(self.num_nodes, self.extra_args)
+        self.nodes[0].start()
+        self.nodes[0].stop_node()
+        assert False, "stop_node() should raise an exception when we haven't called wait_for_rpc_connection()"
+
+    def run_test(self):
+        assert False, "Should have failed earlier during startup."
+
+class TestSuccess(InternalTestMixin, BitcoinTestFramework):
+    def set_test_params(self):
+        self.num_nodes = 1
+
+    def setup_network(self):
+        pass # Don't need to start our node.
+
+    def run_test(self):
+        pass # Just succeed.
+
+
+if __name__ == '__main__':
+    if class_name := next((m[1] for arg in sys.argv[1:] if (m := re.match(r'--internal_test=(.+)', arg))), None):
+        internal_test = globals().get(class_name)
+        assert internal_test, f"Unrecognized test: {class_name}"
+        internal_test(__file__).main()
+    else:
+        FeatureFrameworkStartupFailures(__file__).main()
--- a/test/functional/test_framework/test_node.py
+++ b/test/functional/test_framework/test_node.py
@ -8,7 +8,6 @@ import contextlib
 import decimal
 import errno
 from enum import Enum
-import http.client
 import json
 import logging
 import os
@ -264,6 +263,8 @@ class TestNode():
        """Sets up an RPC connection to the bitcoind process. Returns False if unable to connect."""
        # Poll at a rate of four times per second
        poll_per_s = 4
+        suppressed_errors = collections.defaultdict(int)
+        latest_error = ""
        for _ in range(poll_per_s * self.rpc_timeout):
            if self.process.poll() is not None:
                # Attach abrupt shutdown error/s to the exception message
@ -304,33 +305,46 @@ class TestNode():
                    # overhead is trivial, and the added guarantees are worth
                    # the minimal performance cost.
                self.log.debug("RPC successfully started")
+                # Set rpc_connected even if we are in use_cli mode so that we know we can call self.stop() if needed.
+                self.rpc_connected = True
                if self.use_cli:
                    return
                self.rpc = rpc
-                self.rpc_connected = True
                self.url = self.rpc.rpc_url
                return
-            except JSONRPCException as e:  # Initialization phase
+            except JSONRPCException as e:
+                # Suppress these as they are expected during initialization.
                # -28 RPC in warmup
-                # -342 Service unavailable, RPC server started but is shutting down due to error
-                if e.error['code'] != -28 and e.error['code'] != -342:
+                # -342 Service unavailable, could be starting up or shutting down
+                if e.error['code'] not in [-28, -342]:
                    raise  # unknown JSON RPC exception
-            except ConnectionResetError:
-                # This might happen when the RPC server is in warmup, but shut down before the call to getblockcount
-                # succeeds. Try again to properly raise the FailedToStartError
-                pass
+                suppressed_errors[f"JSONRPCException {e.error['code']}"] += 1
+                latest_error = repr(e)
            except OSError as e:
-                if e.errno == errno.ETIMEDOUT:
-                    pass  # Treat identical to ConnectionResetError
-                elif e.errno == errno.ECONNREFUSED:
-                    pass  # Port not yet open?
-                else:
+                error_num = e.errno
+                # Work around issue where socket timeouts don't have errno set.
+                # https://github.com/python/cpython/issues/109601
+                if error_num is None and isinstance(e, TimeoutError):
+                    error_num = errno.ETIMEDOUT
+
+                # Suppress similarly to the above JSONRPCException errors.
+                if error_num not in [
+                    errno.ECONNRESET,   # This might happen when the RPC server is in warmup,
+                                        # but shut down before the call to getblockcount succeeds.
+                    errno.ETIMEDOUT,    # Treat identical to ECONNRESET
+                    errno.ECONNREFUSED  # Port not yet open?
+                ]:
                    raise  # unknown OS error
-            except ValueError as e:  # cookie file not found and no rpcuser or rpcpassword; bitcoind is still starting
+                suppressed_errors[f"OSError {errno.errorcode[error_num]}"] += 1
+                latest_error = repr(e)
+            except ValueError as e:
+                # Suppress if cookie file isn't generated yet and no rpcuser or rpcpassword; bitcoind may be starting.
                if "No RPC credentials" not in str(e):
                    raise
+                suppressed_errors["missing_credentials"] += 1
+                latest_error = repr(e)
            time.sleep(1.0 / poll_per_s)
-        self._raise_assertion_error("Unable to connect to bitcoind after {}s".format(self.rpc_timeout))
+        self._raise_assertion_error(f"Unable to connect to bitcoind after {self.rpc_timeout}s (ignored errors: {str(dict(suppressed_errors))}, latest error: {latest_error})")

    def wait_for_cookie_credentials(self):
        """Ensures auth cookie credentials can be read, e.g. for testing CLI with -rpcwait before RPC connection is up."""
@ -387,15 +401,15 @@ class TestNode():
        """Stop the node."""
        if not self.running:
            return
+        assert self.rpc_connected, self._node_msg(
+            "Should only call stop_node() on a running node after wait_for_rpc_connection() succeeded. "
+            f"Did you forget to call the latter after start()? Not connected to process: {self.process.pid}")
        self.log.debug("Stopping node")
-        try:
-            # Do not use wait argument when testing older nodes, e.g. in wallet_backwards_compatibility.py
-            if self.version_is_at_least(180000):
-                self.stop(wait=wait)
-            else:
-                self.stop()
-        except http.client.CannotSendRequest:
-            self.log.exception("Unable to stop node.")
+        # Do not use wait argument when testing older nodes, e.g. in wallet_backwards_compatibility.py
+        if self.version_is_at_least(180000):
+            self.stop(wait=wait)
+        else:
+            self.stop()

        # If there are any running perf processes, stop them.
        for profile_name in tuple(self.perf_subprocesses.keys()):
--- a/test/functional/test_framework/util.py
+++ b/test/functional/test_framework/util.py
@ -101,10 +101,9 @@ def assert_raises_message(exc, message, fun, *args, **kwds):
    except JSONRPCException:
        raise AssertionError("Use assert_raises_rpc_error() to test RPC failures")
    except exc as e:
-        if message is not None and message not in e.error['message']:
-            raise AssertionError(
-                "Expected substring not found in error message:\nsubstring: '{}'\nerror message: '{}'.".format(
-                    message, e.error['message']))
+        if message is not None and message not in repr(e):
+            raise AssertionError("Expected substring not found in exception:\n"
+                                 f"substring: '{message}'\nexception: {repr(e)}.")
    except Exception as e:
        raise AssertionError("Unexpected exception raised: " + type(e).__name__)
    else:
--- a/test/functional/test_runner.py
+++ b/test/functional/test_runner.py
@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# Copyright (c) 2014-2022 The Bitcoin Core developers
+# Copyright (c) 2014-present The Bitcoin Core developers
 # Distributed under the MIT software license, see the accompanying
 # file COPYING or http://www.opensource.org/licenses/mit-license.php.
 """Run regression test suite.
@ -357,6 +357,7 @@ BASE_SCRIPTS = [
    'p2p_handshake.py --v2transport',
    'feature_dirsymlinks.py',
    'feature_help.py',
+    'feature_framework_startup_failures.py',
    'feature_shutdown.py',
    'wallet_migration.py',
    'p2p_ibd_txrelay.py',