Merge pull request #3888 from Zac-HD/ghostwriter-operator-upgrades

Add a research note on the Ghostwriter + improve binop test generation
HypothesisWorks · Feb 18, 2024 · 314480c · 314480c
2 parents ed231ae + 35546c4
commit 314480c
Show file tree

Hide file tree

Showing 19 changed files with 281 additions and 91 deletions.
diff --git a/hypothesis-python/RELEASE.rst b/hypothesis-python/RELEASE.rst
@@ -0,0 +1,3 @@
+RELEASE_TYPE: patch
+
+This patch improves :doc:`the Ghostwriter <ghostwriter>` for binary operators.
diff --git a/hypothesis-python/docs/ghostwriter.rst b/hypothesis-python/docs/ghostwriter.rst
@@ -4,3 +4,62 @@ Ghostwriting tests for you
 
 .. automodule:: hypothesis.extra.ghostwriter
    :members:
+
+A note for test-generation researchers
+--------------------------------------
+
+Ghostwritten tests are intended as a *starting point for human authorship*,
+to demonstrate best practice, help novices past blank-page paralysis, and save time
+for experts.  They *may* be ready-to-run, or include placeholders and ``# TODO:``
+comments to fill in strategies for unknown types.  In either case, improving tests
+for their own code gives users a well-scoped and immediately rewarding context in
+which to explore property-based testing.
+
+By contrast, most test-generation tools aim to produce ready-to-run test suites...
+and implicitly assume that the current behavior is the desired behavior.
+However, the code might contain bugs, and we want our tests to fail if it does!
+Worse, such tools require that the code under test is finished and executable,
+making it impossible to generate tests as part of the development process.
+
+`Fraser 2013`_ found that evolving a high-coverage test suite (e.g. Randoop_, EvoSuite_, Pynguin_)
+"leads to clear improvements in commonly applied quality metrics such as code coverage
+[but] no measurable improvement in the number of bugs actually found by developers"
+and that "generating a set of test cases, even high coverage test cases,
+does not necessarily improve our ability to test software".
+Invariant detection (famously Daikon_; in PBT see e.g. `Alonso 2022`_,
+QuickSpec_, Speculate_) relies on code execution. Program slicing (e.g. FUDGE_,
+FuzzGen_, WINNIE_) requires existing downstream consumers of the code under test.
+
+Ghostwriter inspects the function name, argument names and types, and docstrings.
+It can be used on buggy or incomplete code, runs in a few seconds, and produces
+a single semantically-meaningful test per function or group of functions.
+Rather than detecting regressions, these tests check semantic properties such as
+`encode/decode or save/load round-trips <https://zhd.dev/ghostwriter/?q=gzip.compress>`__,
+`commutative, associative, and distributive operations
+<https://zhd.dev/ghostwriter/?q=operator.mul>`__,
+`equivalence between methods <https://zhd.dev/ghostwriter/?q=operator.add+numpy.add>`__,
+`array shapes <https://zhd.dev/ghostwriter/?q=numpy.matmul>`__,
+and idempotence.  Where no property is detected, we simply check for
+'no error on valid input' and allow the user to supply their own invariants.
+
+Evaluations such as the SBFT24_ competition_ measure performance on a task which
+the Ghostwriter is not intended to perform.  I'd love to see qualitative user
+studies, such as `PBT in Practice`_ for test generation, which could check
+whether the Ghostwriter is onto something or tilting at windmills.
+If you're interested in similar questions, `drop me an email`_!
+
+.. _Daikon: https://plse.cs.washington.edu/daikon/pubs/
+.. _Alonso 2022: https://doi.org/10.1145/3540250.3559080
+.. _QuickSpec: http://www.cse.chalmers.se/~nicsma/papers/quickspec2.pdf
+.. _Speculate: https://matela.com.br/paper/speculate.pdf
+.. _FUDGE: https://research.google/pubs/pub48314/
+.. _FuzzGen: https://www.usenix.org/conference/usenixsecurity20/presentation/ispoglou
+.. _WINNIE: https://www.ndss-symposium.org/wp-content/uploads/2021-334-paper.pdf
+.. _Fraser 2013: https://doi.org/10.1145/2483760.2483774
+.. _Randoop: https://homes.cs.washington.edu/~mernst/pubs/feedback-testgen-icse2007.pdf
+.. _EvoSuite: https://www.evosuite.org/wp-content/papercite-data/pdf/esecfse11.pdf
+.. _Pynguin: https://arxiv.org/abs/2007.14049
+.. _SBFT24: https://arxiv.org/abs/2401.15189
+.. _competition: https://github.com/ThunderKey/python-tool-competition-2024
+.. _PBT in Practice: https://harrisongoldste.in/papers/icse24-pbt-in-practice.pdf
+.. _drop me an email: mailto:zac@zhd.dev?subject=Hypothesis%20Ghostwriter%20research
diff --git a/hypothesis-python/src/hypothesis/extra/ghostwriter.py b/hypothesis-python/src/hypothesis/extra/ghostwriter.py
@@ -732,9 +732,10 @@ def _get_module_helper(obj):
 
     dots = [i for i, c in enumerate(module_name) if c == "."] + [None]
     for idx in dots:
-        if getattr(sys.modules.get(module_name[:idx]), obj.__name__, None) is obj:
-            KNOWN_FUNCTION_LOCATIONS[obj] = module_name[:idx]
-            return module_name[:idx]
+        for candidate in (module_name[:idx].lstrip("_"), module_name[:idx]):
+            if getattr(sys.modules.get(candidate), obj.__name__, None) is obj:
+                KNOWN_FUNCTION_LOCATIONS[obj] = candidate
+                return candidate
     return module_name
 
 
@@ -763,7 +764,7 @@ def _get_qualname(obj, *, include_module=False):
 
 
 def _write_call(
-    func: Callable, *pass_variables: str, except_: Except, assign: str = ""
+    func: Callable, *pass_variables: str, except_: Except = Exception, assign: str = ""
 ) -> str:
     """Write a call to `func` with explicit and implicit arguments.
 
@@ -1268,11 +1269,29 @@ def make_(how, *args, **kwargs):
         hints = get_type_hints(func)
         hints.pop("return", None)
         params = _get_params(func)
-        if len(hints) == len(params) == 2:
-            a, b = hints.values()
+        if (len(hints) == len(params) == 2) or (
+            _get_module(func) == "operator"
+            and "item" not in func.__name__
+            and tuple(params) in [("a", "b"), ("x", "y")]
+        ):
+            a, b = hints.values() or [Any, Any]
             arg1, arg2 = params
             if a == b and len(arg1) == len(arg2) <= 3:
-                make_(_make_binop_body, func, annotate=annotate)
+                # https://en.wikipedia.org/wiki/Distributive_property#Other_examples
+                known = {
+                    "mul": "add",
+                    "matmul": "add",
+                    "or_": "and_",
+                    "and_": "or_",
+                }.get(func.__name__, "")
+                distributes_over = getattr(sys.modules[_get_module(func)], known, None)
+                make_(
+                    _make_binop_body,
+                    func,
+                    commutative=func.__name__ != "matmul",
+                    distributes_over=distributes_over,
+                    annotate=annotate,
+                )
                 del by_name[name]
 
     # Look for Numpy ufuncs or gufuncs, and write array-oriented tests for them.
@@ -1477,10 +1496,17 @@ def roundtrip(
     return _make_test(*_make_roundtrip_body(funcs, except_, style, annotate))
 
 
-def _make_equiv_body(funcs, except_, style, annotate):
+def _get_varnames(funcs):
     var_names = [f"result_{f.__name__}" for f in funcs]
     if len(set(var_names)) < len(var_names):
-        var_names = [f"result_{i}_{ f.__name__}" for i, f in enumerate(funcs)]
+        var_names = [f"result_{f.__name__}_{_get_module(f)}" for f in funcs]
+    if len(set(var_names)) < len(var_names):
+        var_names = [f"result_{i}_{f.__name__}" for i, f in enumerate(funcs)]
+    return var_names
+
+
+def _make_equiv_body(funcs, except_, style, annotate):
+    var_names = _get_varnames(funcs)
     test_lines = [
         _write_call(f, assign=vname, except_=except_)
         for vname, f in zip(var_names, funcs)
@@ -1520,10 +1546,7 @@ def _make_equiv_body(funcs, except_, style, annotate):
 
 
 def _make_equiv_errors_body(funcs, except_, style, annotate):
-    var_names = [f"result_{f.__name__}" for f in funcs]
-    if len(set(var_names)) < len(var_names):
-        var_names = [f"result_{i}_{ f.__name__}" for i, f in enumerate(funcs)]
-
+    var_names = _get_varnames(funcs)
     first, *rest = funcs
     first_call = _write_call(first, assign=var_names[0], except_=except_)
     extra_imports, suppress = _exception_string(except_)
@@ -1723,27 +1746,20 @@ def maker(
         maker(
             "associative",
             "abc",
+            _write_call(func, "a", _write_call(func, "b", "c"), assign="left"),
             _write_call(
                 func,
-                "a",
-                _write_call(func, "b", "c", except_=Exception),
-                except_=Exception,
-                assign="left",
-            ),
-            _write_call(
-                func,
-                _write_call(func, "a", "b", except_=Exception),
+                _write_call(func, "a", "b"),
                 "c",
-                except_=Exception,
                 assign="right",
             ),
         )
     if commutative:
         maker(
             "commutative",
             "ab",
-            _write_call(func, "a", "b", except_=Exception, assign="left"),
-            _write_call(func, "b", "a", except_=Exception, assign="right"),
+            _write_call(func, "a", "b", assign="left"),
+            _write_call(func, "b", "a", assign="right"),
         )
     if identity is not None:
         # Guess that the identity element is the minimal example from our operands
@@ -1765,34 +1781,42 @@ def maker(
             compile(repr(identity), "<string>", "exec")
         except SyntaxError:
             identity = repr(identity)  # type: ignore
-        maker(
-            "identity",
-            "a",
+        identity_parts = [
+            f"{identity = }",
             _assert_eq(
                 style,
                 "a",
-                _write_call(func, "a", repr(identity), except_=Exception),
+                _write_call(func, "a", "identity"),
             ),
-        )
+            _assert_eq(
+                style,
+                "a",
+                _write_call(func, "identity", "a"),
+            ),
+        ]
+        maker("identity", "a", "\n".join(identity_parts))
     if distributes_over:
-        maker(
-            distributes_over.__name__ + "_distributes_over",
-            "abc",
+        do = distributes_over
+        dist_parts = [
+            _write_call(func, "a", _write_call(do, "b", "c"), assign="left"),
             _write_call(
-                distributes_over,
-                _write_call(func, "a", "b", except_=Exception),
-                _write_call(func, "a", "c", except_=Exception),
-                except_=Exception,
-                assign="left",
+                do,
+                _write_call(func, "a", "b"),
+                _write_call(func, "a", "c"),
+                assign="ldist",
             ),
+            _assert_eq(style, "ldist", "left"),
+            "\n",
+            _write_call(func, _write_call(do, "a", "b"), "c", assign="right"),
             _write_call(
-                func,
-                "a",
-                _write_call(distributes_over, "b", "c", except_=Exception),
-                except_=Exception,
-                assign="right",
+                do,
+                _write_call(func, "a", "c"),
+                _write_call(func, "b", "c"),
+                assign="rdist",
             ),
-        )
+            _assert_eq(style, "rdist", "right"),
+        ]
+        maker(do.__name__ + "_distributes_over", "abc", "\n".join(dist_parts))
 
     _, operands_repr = _valid_syntax_repr(operands)
     operands_repr = _st_strategy_names(operands_repr)

diff --git a/hypothesis-python/src/hypothesis/internal/conjecture/engine.py b/hypothesis-python/src/hypothesis/internal/conjecture/engine.py
@@ -47,6 +47,13 @@
 MIN_TEST_CALLS = 10
 BUFFER_SIZE = 8 * 1024
 
+# If the shrinking phase takes more than five minutes, abort it early and print
+# a warning.  Many CI systems will kill a build after around ten minutes with
+# no output, and appearing to hang isn't great for interactive use either -
+# showing partially-shrunk examples is better than quitting with no examples!
+# (A module-level constant, so rare users who must keep shrinking can monkeypatch it.)
+MAX_SHRINKING_SECONDS = 300
+
 
 @attr.s
 class HealthCheckState:
@@ -934,12 +941,7 @@ def shrink_interesting_examples(self):
             return
 
         self.debug("Shrinking interesting examples")
-
-        # If the shrinking phase takes more than five minutes, abort it early and print
-        # a warning.   Many CI systems will kill a build after around ten minutes with
-        # no output, and appearing to hang isn't great for interactive use either -
-        # showing partially-shrunk examples is better than quitting with no examples!
-        self.finish_shrinking_deadline = time.perf_counter() + 300
+        self.finish_shrinking_deadline = time.perf_counter() + MAX_SHRINKING_SECONDS
 
         for prev_data in sorted(
             self.interesting_examples.values(), key=lambda d: sort_key(d.buffer)

diff --git a/hypothesis-python/tests/ghostwriter/recorded/addition_op_magic.txt b/hypothesis-python/tests/ghostwriter/recorded/addition_op_magic.txt
@@ -23,4 +23,6 @@ def test_commutative_binary_operation_add(a: float, b: float) -> None:
 
 @given(a=add_operands)
 def test_identity_binary_operation_add(a: float) -> None:
-    assert a == test_expected_output.add(a=a, b=0.0)
+    identity = 0.0
+    assert a == test_expected_output.add(a=a, b=identity)
+    assert a == test_expected_output.add(a=identity, b=a)
diff --git a/hypothesis-python/tests/ghostwriter/recorded/addition_op_multimagic.txt b/hypothesis-python/tests/ghostwriter/recorded/addition_op_multimagic.txt
@@ -1,16 +1,22 @@
 # This test code was written by the `hypothesis.extra.ghostwriter` module
 # and is provided under the Creative Commons Zero public domain dedication.
 
-import _operator
 import numpy
+import operator
 import test_expected_output
 from hypothesis import given, strategies as st
 
 
 @given(a=st.floats(), b=st.floats())
 def test_equivalent_add_add_add(a: float, b: float) -> None:
-    result_0_add = _operator.add(a, b)
-    result_1_add = numpy.add(a, b)
-    result_2_add = test_expected_output.add(a=a, b=b)
-    assert result_0_add == result_1_add, (result_0_add, result_1_add)
-    assert result_0_add == result_2_add, (result_0_add, result_2_add)
+    result_add_numpy = numpy.add(a, b)
+    result_add_operator = operator.add(a, b)
+    result_add_test_expected_output = test_expected_output.add(a=a, b=b)
+    assert result_add_numpy == result_add_operator, (
+        result_add_numpy,
+        result_add_operator,
+    )
+    assert result_add_numpy == result_add_test_expected_output, (
+        result_add_numpy,
+        result_add_test_expected_output,
+    )
diff --git a/hypothesis-python/tests/ghostwriter/recorded/division_binop_error_handler.txt b/hypothesis-python/tests/ghostwriter/recorded/division_binop_error_handler.txt
@@ -23,4 +23,6 @@ def test_commutative_binary_operation_divide(a: int, b: int) -> None:
 
 @given(a=divide_operands)
 def test_identity_binary_operation_divide(a: int) -> None:
-    assert a == test_expected_output.divide(a=a, b=1)
+    identity = 1
+    assert a == test_expected_output.divide(a=a, b=identity)
+    assert a == test_expected_output.divide(a=identity, b=a)
diff --git a/hypothesis-python/tests/ghostwriter/recorded/division_operator.txt b/hypothesis-python/tests/ghostwriter/recorded/division_operator.txt
@@ -1,7 +1,7 @@
 # This test code was written by the `hypothesis.extra.ghostwriter` module
 # and is provided under the Creative Commons Zero public domain dedication.
 
-import _operator
+import operator
 from hypothesis import given, strategies as st
 
 # TODO: replace st.nothing() with an appropriate strategy
@@ -11,4 +11,6 @@ truediv_operands = st.nothing()
 
 @given(a=truediv_operands)
 def test_identity_binary_operation_truediv(a):
-    assert a == _operator.truediv(a, "identity element here")
+    identity = "identity element here"
+    assert a == operator.truediv(a, identity)
+    assert a == operator.truediv(identity, a)
diff --git a/hypothesis-python/tests/ghostwriter/recorded/division_operator_with_annotations.txt b/hypothesis-python/tests/ghostwriter/recorded/division_operator_with_annotations.txt
@@ -1,7 +1,7 @@
 # This test code was written by the `hypothesis.extra.ghostwriter` module
 # and is provided under the Creative Commons Zero public domain dedication.
 
-import _operator
+import operator
 from hypothesis import given, strategies as st
 
 # TODO: replace st.nothing() with an appropriate strategy
@@ -11,4 +11,6 @@ truediv_operands = st.nothing()
 
 @given(a=truediv_operands)
 def test_identity_binary_operation_truediv(a) -> None:
-    assert a == _operator.truediv(a, "identity element here")
+    identity = "identity element here"
+    assert a == operator.truediv(a, identity)
+    assert a == operator.truediv(identity, a)
diff --git a/hypothesis-python/tests/ghostwriter/recorded/division_roundtrip_arithmeticerror_handler.txt b/hypothesis-python/tests/ghostwriter/recorded/division_roundtrip_arithmeticerror_handler.txt
@@ -1,7 +1,7 @@
 # This test code was written by the `hypothesis.extra.ghostwriter` module
 # and is provided under the Creative Commons Zero public domain dedication.
 
-import _operator
+import operator
 import test_expected_output
 from hypothesis import given, reject, strategies as st
 
@@ -10,7 +10,7 @@ from hypothesis import given, reject, strategies as st
 def test_roundtrip_divide_mul(a: int, b: int) -> None:
     try:
         value0 = test_expected_output.divide(a=a, b=b)
-        value1 = _operator.mul(value0, b)
+        value1 = operator.mul(value0, b)
     except ArithmeticError:
         reject()
     assert a == value1, (a, value1)
diff --git a/hypothesis-python/tests/ghostwriter/recorded/division_roundtrip_error_handler.txt b/hypothesis-python/tests/ghostwriter/recorded/division_roundtrip_error_handler.txt
@@ -1,7 +1,7 @@
 # This test code was written by the `hypothesis.extra.ghostwriter` module
 # and is provided under the Creative Commons Zero public domain dedication.
 
-import _operator
+import operator
 import test_expected_output
 from hypothesis import given, reject, strategies as st
 
@@ -12,5 +12,5 @@ def test_roundtrip_divide_mul(a: int, b: int) -> None:
         value0 = test_expected_output.divide(a=a, b=b)
     except ZeroDivisionError:
         reject()
-    value1 = _operator.mul(value0, b)
+    value1 = operator.mul(value0, b)
     assert a == value1, (a, value1)