handle worker crash (#158)

xoviat · Michael Howitz · web-flow · commit 51c8d36a33b9 · 2021-09-17T08:01:37.000+02:00
Rerun test on crash with xdist.

If a test crashes the worker while rerunfailures is running with a supported xdist, then the test will now be rerun.

This new behavior:
1. requires pytest 6 or newer and a supported xdist
2. doesn't increase the number of times a worker can crash
3. allows rerunning a timed-out test when combined with pytest-timeout

Co-authored-by: Michael Howitz <mh@gocept.com>
Co-authored-by: xoviat <xoviat@users.noreply.github.com>
diff --git a/.gitignore b/.gitignore
@@ -4,6 +4,7 @@ pytest_rerunfailures.egg-info/
 .Python
 .cache/
 .idea/
+.vscode/
 .python-version
 .tox*
 bin/
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -4,6 +4,10 @@ Changelog
 10.2 (unreleased)
 -----------------
 
+Features
+++++++++
+
+- Allow recovery from crashed tests with pytest-xdist.
 - Add support for Python 3.10 (as of Python 3.10.rc2).
   (Thanks to `@hugovk <https://github.com/hugovk>`_ for the PR.)
 
diff --git a/README.rst b/README.rst
@@ -22,6 +22,11 @@ You will need the following prerequisites in order to use pytest-rerunfailures:
 - Python 3.6, up to 3.10, or PyPy3
 - pytest 5.3 or newer
 
+This plugin can recover from a hard crash with the following optional
+prerequisites:
+
+- pytest-xdist 2.3.0 or newer
+
 This package is currently tested against the last 5 minor pytest releases. In
 case you work with an older version of pytest you should consider updating or
 use one of the earlier versions of this package.
@@ -35,6 +40,17 @@ To install pytest-rerunfailures:
 
   $ pip install pytest-rerunfailures
 
+Recover from hard crashes
+-------------------------
+
+If one or more tests trigger a hard crash (for example: segfault), this plugin
+will ordinarily be unable to rerun the test. However, if a compatible version of
+pytest-xdist is installed, and the tests are run within pytest-xdist using the ``-n``
+flag, this plugin will be able to rerun crashed tests, assuming the workers and
+controller are on the same LAN (this assumption is valid for almost all cases
+because most of the time the workers and controller are on the same computer).
+If this assumption does not hold, then this functionality may not work.
+
 Re-run all failures
 -------------------
 
diff --git a/pytest_rerunfailures.py b/pytest_rerunfailures.py
@@ -1,10 +1,14 @@
+import hashlib
 import os
 import platform
 import re
+import socket
 import sys
+import threading
 import time
 import traceback
 import warnings
+from contextlib import suppress
 
 import pytest
 from _pytest.outcomes import fail
@@ -23,9 +27,16 @@
     # We have a pytest >= 6.1
     pass
 
+try:
+    from xdist.newhooks import pytest_handlecrashitem
 
-PYTEST_GTE_54 = parse_version(pytest.__version__) >= parse_version("5.4")
+    HAS_PYTEST_HANDLECRASHITEM = True
+    del pytest_handlecrashitem
+except ImportError:
+    HAS_PYTEST_HANDLECRASHITEM = False
 
+
+PYTEST_GTE_54 = parse_version(pytest.__version__) >= parse_version("5.4")
 PYTEST_GTE_63 = parse_version(pytest.__version__) >= parse_version("6.3.0.dev")
 
 
@@ -78,16 +89,6 @@ def pytest_addoption(parser):
     )
 
 
-def pytest_configure(config):
-    # add flaky marker
-    config.addinivalue_line(
-        "markers",
-        "flaky(reruns=1, reruns_delay=0): mark test to re-run up "
-        "to 'reruns' times. Add a delay of 'reruns_delay' seconds "
-        "between re-runs.",
-    )
-
-
 def _get_resultlog(config):
     if not HAS_RESULTLOG:
         return None
@@ -302,6 +303,167 @@ def _should_not_rerun(item, report, reruns):
     )
 
 
+def is_master(config):
+    return not (hasattr(config, "workerinput") or hasattr(config, "slaveinput"))
+
+
+def pytest_configure(config):
+    # add flaky marker
+    config.addinivalue_line(
+        "markers",
+        "flaky(reruns=1, reruns_delay=0): mark test to re-run up "
+        "to 'reruns' times. Add a delay of 'reruns_delay' seconds "
+        "between re-runs.",
+    )
+
+    if HAS_PYTEST_HANDLECRASHITEM:
+        if is_master(config):
+            config.failures_db = ServerStatusDB()
+        else:
+            config.failures_db = ClientStatusDB(config.workerinput["sock_port"])
+    else:
+        config.failures_db = StatusDB()  # no-op db
+
+
+if HAS_PYTEST_HANDLECRASHITEM:
+
+    def pytest_configure_node(node):
+        """xdist hook"""
+        node.workerinput["sock_port"] = node.config.failures_db.sock_port
+
+    def pytest_handlecrashitem(crashitem, report, sched):
+        """
+        Mark the crashitem as pending again if it still has reruns left.
+        """
+        db = sched.config.failures_db
+        reruns = db.get_test_reruns(crashitem)
+        if db.get_test_failures(crashitem) < reruns:
+            sched.mark_test_pending(crashitem)
+            report.outcome = "rerun"
+
+        db.add_test_failure(crashitem)
+
+
+# An in-memory db residing in the master that records
+# the number of reruns (set before test setup)
+# and failures (set after each failure or crash)
+# accessible from both the master and worker
+class StatusDB:
+    def __init__(self):
+        self.delim = b"\n"
+        self.hmap = {}
+
+    def _hash(self, crashitem: str) -> str:
+        if crashitem not in self.hmap:
+            self.hmap[crashitem] = hashlib.sha1(
+                crashitem.encode(),
+            ).hexdigest()[:10]
+
+        return self.hmap[crashitem]
+
+    def add_test_failure(self, crashitem):
+        hash = self._hash(crashitem)
+        failures = self._get(hash, "f")
+        failures += 1
+        self._set(hash, "f", failures)
+
+    def get_test_failures(self, crashitem):
+        hash = self._hash(crashitem)
+        return self._get(hash, "f")
+
+    def set_test_reruns(self, crashitem, reruns):
+        hash = self._hash(crashitem)
+        self._set(hash, "r", reruns)
+
+    def get_test_reruns(self, crashitem):
+        hash = self._hash(crashitem)
+        return self._get(hash, "r")
+
+    # i is a hash of the test name (e.g. t_f.py::test_t)
+    # k is f for failures or r for reruns
+    # v is the number of failures or reruns (an int)
+    def _set(self, i: str, k: str, v: int):
+        pass
+
+    def _get(self, i: str, k: str) -> int:
+        return 0
+
+
+class SocketDB(StatusDB):
+    def __init__(self):
+        super().__init__()
+        self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+        self.sock.setblocking(1)
+
+    def _sock_recv(self, conn) -> str:
+        buf = b""
+        while True:
+            b = conn.recv(1)
+            if b == self.delim:
+                break
+            buf += b
+
+        return buf.decode()
+
+    def _sock_send(self, conn, msg: str):
+        conn.send(msg.encode() + self.delim)
+
+
+class ServerStatusDB(SocketDB):
+    def __init__(self):
+        super().__init__()
+        self.sock.bind(("", 0))
+        self.sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+
+        self.rerunfailures_db = {}
+        t = threading.Thread(target=self.run_server, daemon=True)
+        t.start()
+
+    @property
+    def sock_port(self):
+        return self.sock.getsockname()[1]
+
+    def run_server(self):
+        self.sock.listen()
+        while True:
+            conn, _ = self.sock.accept()
+            t = threading.Thread(target=self.run_connection, args=(conn,), daemon=True)
+            t.start()
+
+    def run_connection(self, conn):
+        with suppress(ConnectionError):
+            while True:
+                op, i, k, v = self._sock_recv(conn).split("|")
+                if op == "set":
+                    self._set(i, k, int(v))
+                elif op == "get":
+                    self._sock_send(conn, str(self._get(i, k)))
+
+    def _set(self, i: str, k: str, v: int):
+        if i not in self.rerunfailures_db:
+            self.rerunfailures_db[i] = {}
+        self.rerunfailures_db[i][k] = v
+
+    def _get(self, i: str, k: str) -> int:
+        try:
+            return self.rerunfailures_db[i][k]
+        except KeyError:
+            return 0
+
+
+class ClientStatusDB(SocketDB):
+    def __init__(self, sock_port):
+        super().__init__()
+        self.sock.connect(("localhost", sock_port))
+
+    def _set(self, i: str, k: str, v: int):
+        self._sock_send(self.sock, "|".join(("set", i, k, str(v))))
+
+    def _get(self, i: str, k: str) -> int:
+        self._sock_send(self.sock, "|".join(("get", i, k, "")))
+        return int(self._sock_recv(self.sock))
+
+
 def pytest_runtest_protocol(item, nextitem):
     """
     Run the test protocol.
@@ -319,8 +481,14 @@ def pytest_runtest_protocol(item, nextitem):
     # first item if necessary
     check_options(item.session.config)
     delay = get_reruns_delay(item)
-    parallel = hasattr(item.config, "slaveinput") or hasattr(item.config, "workerinput")
-    item.execution_count = 0
+    parallel = not is_master(item.config)
+    item_location = (item.location[0] + "::" + item.location[2]).replace("\\", "/")
+    db = item.session.config.failures_db
+    item.execution_count = db.get_test_failures(item_location)
+    db.set_test_reruns(item_location, reruns)
+
+    if item.execution_count > reruns:
+        return True
 
     need_to_run = True
     while need_to_run:
diff --git a/test_pytest_rerunfailures.py b/test_pytest_rerunfailures.py
@@ -5,12 +5,14 @@
 import pytest
 from pkg_resources import parse_version
 
+from pytest_rerunfailures import HAS_PYTEST_HANDLECRASHITEM
+
 
 pytest_plugins = "pytester"
 
 PYTEST_GTE_60 = parse_version(pytest.__version__) >= parse_version("6.0")
-
 PYTEST_GTE_61 = parse_version(pytest.__version__) >= parse_version("6.1")
+has_xdist = HAS_PYTEST_HANDLECRASHITEM and PYTEST_GTE_61
 
 
 def temporary_failure(count=1):
@@ -23,6 +25,17 @@ def temporary_failure(count=1):
                 raise Exception('Failure: {{0}}'.format(count))"""
 
 
+def temporary_crash(count=1):
+    return f"""
+            import py
+            import os
+            path = py.path.local(__file__).dirpath().ensure('test.res')
+            count = path.read() or 1
+            if int(count) <= {count}:
+                path.write(int(count) + 1)
+                os._exit(1)"""
+
+
 def check_outcome_field(outcomes, field_name, expected_value):
     field_value = outcomes.get(field_name, 0)
     assert field_value == expected_value, (
@@ -168,6 +181,23 @@ def test_pass():
     assert_outcomes(result, passed=1, rerun=1)
 
 
+@pytest.mark.skipif(not has_xdist, reason="requires xdist with crashitem")
+def test_rerun_passes_after_temporary_test_crash(testdir):
+    # note: we need two tests because there is a bug where xdist
+    # cannot rerun the last test if it crashes. the bug exists only
+    # in xdist; there is no error in this plugin that causes the bug.
+    testdir.makepyfile(
+        f"""
+        def test_crash():
+            {temporary_crash()}
+
+        def test_pass():
+            pass"""
+    )
+    result = testdir.runpytest("-n", "1", "--reruns", "1", "-r", "R")
+    assert_outcomes(result, passed=2, rerun=1)
+
+
 def test_rerun_passes_after_temporary_test_failure_with_flaky_mark(testdir):
     testdir.makepyfile(
         f"""
diff --git a/tox.ini b/tox.ini
@@ -18,6 +18,7 @@ minversion = 3.17.1
 [testenv]
 commands = pytest test_pytest_rerunfailures.py {posargs}
 deps =
+    pytest-xdist
     pytest53: pytest==5.3.*
     pytest54: pytest==5.4.*
     pytest60: pytest==6.0.*