Skip to content

Commit

Permalink
Handle very large .emb files. (#215)
Browse files Browse the repository at this point in the history
This change switches parse tree handling to use iteration (with an
explicit stack) instead of recursion, which:

*   Allows large (>~1000 entity) `.emb` files to be formatted.
*   Allows very large (>~16k entity) `.emb` files to be compiled.

The difference in sizes in the previous code was due to `module_ir.py`
hackily increasing the recursion limit: while this more or less worked,
it was a little dangerous (it ran the risk of blowing out the C stack,
depending on the platform), and it merely raised the limit rather than
removing it.  This change removes the limit entirely (at least, up to
the available memory on the system).
  • Loading branch information
reventlov authored Dec 21, 2024
1 parent 615a955 commit 1827594
Show file tree
Hide file tree
Showing 7 changed files with 259 additions and 93 deletions.
2 changes: 2 additions & 0 deletions compiler/front_end/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ py_library(
"//compiler/util:ir_data",
"//compiler/util:name_conversion",
"//compiler/util:parser_types",
"//compiler/util:parser_util",
],
)

Expand Down Expand Up @@ -436,6 +437,7 @@ py_library(
":module_ir",
":tokenizer",
"//compiler/util:parser_types",
"//compiler/util:parser_util",
],
)

Expand Down
24 changes: 12 additions & 12 deletions compiler/front_end/format_emb.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from compiler.front_end import module_ir
from compiler.front_end import tokenizer
from compiler.util import parser_types
from compiler.util import parser_util


class Config(collections.namedtuple("Config", ["indent_width", "show_line_types"])):
Expand Down Expand Up @@ -67,18 +68,17 @@ def format_emboss_parse_tree(parse_tree, config, used_productions=None):
Returns:
A string of the reformatted source text.
"""
if hasattr(parse_tree, "children"):
parsed_children = [
format_emboss_parse_tree(child, config, used_productions)
for child in parse_tree.children
]
args = parsed_children + [config]
if used_productions is not None:
used_productions.add(parse_tree.production)
return _formatters[parse_tree.production](*args)
else:
assert isinstance(parse_tree, parser_types.Token), str(parse_tree)
return parse_tree.text
formatters = {}
for production, handler in _formatters.items():
# An extra layer of indirection is required here so that the resulting
# lambda does not capture the local variable `handler`.
def wrapped_handler(handler):
return lambda _, *args: handler(*(args + (config,)))

formatters[production] = wrapped_handler(handler)
return parser_util.transform_parse_tree(
parse_tree, lambda n: n.text, formatters, used_productions
)


def sanity_check_format_result(formatted_text, original_text):
Expand Down
16 changes: 15 additions & 1 deletion compiler/front_end/format_emb_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,21 @@ def test_eol_missing(self):


class FormatEmbTest(unittest.TestCase):
pass

def test_very_long_emb(self):
"""Checks that very long inputs do not hit the Python recursion limit."""
emb = ["enum Test:\n"]
# Enough entities to blow through the default recursion limit and the
# bumped limit that was previously in place.
for i in range(max(sys.getrecursionlimit(), 16 * 1024) * 2):
emb.append(f" VALUE_{i} = {i}\n")
parsed_unformatted = parser.parse_module(
tokenizer.tokenize("".join(emb), "long.emb")[0]
)
formatted_text = format_emb.format_emboss_parse_tree(
parsed_unformatted.parse_tree,
format_emb.Config(indent_width=2),
)


def _make_golden_file_tests():
Expand Down
151 changes: 71 additions & 80 deletions compiler/front_end/module_ir.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from compiler.util import ir_data_utils
from compiler.util import name_conversion
from compiler.util import parser_types
from compiler.util import parser_util


# Intermediate types; should not be found in the final IR.
Expand Down Expand Up @@ -82,88 +83,78 @@ def __init__(self, field, subtypes=None):
def build_ir(parse_tree, used_productions=None):
r"""Builds a module-level intermediate representation from a valid parse tree.
The parse tree is precisely dictated by the exact productions in the grammar
used by the parser, with no semantic information. _really_build_ir transforms
this "raw" form into a stable, cooked representation, thereby isolating
subsequent steps from the exact details of the grammar.
(Probably incomplete) list of transformations:
* ParseResult and Token nodes are replaced with Module, Attribute, Struct,
Type, etc. objects.
* Purely syntactic tokens ('"["', '"struct"', etc.) are discarded.
* Repeated elements are transformed from tree form to list form:
a*
/ \
b a*
The parse tree is precisely dictated by the exact productions in the grammar
used by the parser, with no semantic information. _really_build_ir
transforms this "raw" form into a stable, cooked representation, thereby
isolating subsequent steps from the exact details of the grammar.
(Probably incomplete) list of transformations:
* ParseResult and Token nodes are replaced with Module, Attribute, Struct,
Type, etc. objects.
* Purely syntactic tokens ('"["', '"struct"', etc.) are discarded.
* Repeated elements are transformed from tree form to list form:
a*
/ \
c a*
b a*
/ \
d a*
(where b, c, and d are nodes of type "a") becomes [b, c, d].
* The values of numeric constants (Number, etc. tokens) are parsed.
* Different classes of names (snake_names, CamelNames, ShoutyNames) are
folded into a single "Name" type, since they are guaranteed to appear in
the correct places in the parse tree.
Arguments:
parse_tree: A parse tree. Each leaf node should be a parser_types.Token
object, and each non-leaf node should have a 'symbol' attribute specifying
which grammar symbol it represents, and a 'children' attribute containing
a list of child nodes. This is the format returned by the parsers
produced by the lr1 module, when run against tokens from the tokenizer
module.
used_productions: If specified, used_productions.add() will be called with
each production actually used in parsing. This can be useful when
developing the grammar and writing tests; in particular, it can be used to
figure out which productions are *not* used when parsing a particular
file.
Returns:
A module-level intermediate representation (module IR) for an Emboss module
(source file). This IR will not have symbols resolved; that must be done on
a forest of module IRs so that names from other modules can be resolved.
"""

# TODO(b/140259131): Refactor _really_build_ir to be less recursive/use an
# explicit stack.
old_recursion_limit = sys.getrecursionlimit()
sys.setrecursionlimit(16 * 1024) # ~8000 top-level entities in one module.
try:
result = _really_build_ir(parse_tree, used_productions)
finally:
sys.setrecursionlimit(old_recursion_limit)
return result


def _really_build_ir(parse_tree, used_productions):
"""Real implementation of build_ir()."""
if used_productions is None:
used_productions = set()
if hasattr(parse_tree, "children"):
parsed_children = [
_really_build_ir(child, used_productions) for child in parse_tree.children
]
used_productions.add(parse_tree.production)
result = _handlers[parse_tree.production](*parsed_children)
if parse_tree.source_location:
if isinstance(result, tuple):
result = result._replace(source_location=parse_tree.source_location)
else:
result.source_location = parse_tree.source_location
return result
else:
# For leaf nodes, the temporary "IR" is just the token. Higher-level rules
# will translate it to a real IR.
assert isinstance(parse_tree, parser_types.Token), str(parse_tree)
return parse_tree
c a*
/ \
d a*
(where b, c, and d are nodes of type "a") becomes [b, c, d].
* The values of numeric constants (Number, etc. tokens) are parsed.
* Different classes of names (snake_names, CamelNames, ShoutyNames) are
folded into a single "Name" type, since they are guaranteed to appear in
the correct places in the parse tree.
Arguments:
parse_tree: A parse tree. Each leaf node should be a parser_types.Token
object, and each non-leaf node should have a 'symbol' attribute
specifying which grammar symbol it represents, and a 'children'
attribute containing a list of child nodes. This is the format
returned by the parsers produced by the lr1 module, when run
against tokens from the tokenizer module.
used_productions: If specified, used_productions.add() will be called
with each production actually used in parsing. This can be useful
when developing the grammar and writing tests; in particular, it
can be used to figure out which productions are *not* used when
parsing a particular file.
Returns:
A module-level intermediate representation (module IR) for an Emboss
module (source file). This IR will not have symbols resolved,
constraints checked, fields synthesized, etc.; it will only be a
representation of the syntactic elements of the source.
"""
handlers = {}
for production, handler in _handlers.items():
# An extra layer of indirection is required here so that the resulting
# lambda does not capture the local variable `handler`.
def wrapped_handler(handler):
def wrapped_handler(node, *args):
module_node = handler(*args)
if node.source_location:
if isinstance(module_node, tuple):
module_node = module_node._replace(
source_location=node.source_location
)
else:
module_node.source_location = node.source_location
return module_node

return wrapped_handler

handlers[production] = wrapped_handler(handler)
return parser_util.transform_parse_tree(
parse_tree, lambda n: n, handlers, used_productions
)


# Map of productions to their handlers.
Expand Down
13 changes: 13 additions & 0 deletions compiler/front_end/module_ir_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import collections
import pkgutil
import sys
import unittest

from compiler.front_end import module_ir
Expand Down Expand Up @@ -4057,6 +4058,18 @@ def test_double_negative_non_compilation(self):
parse_result = parser.parse_module(tokenizer.tokenize(example, "")[0])
self.assertFalse(parse_result.error)

def test_long_input(self):
"""Checks that very long inputs do not hit the Python recursion limit."""
emb = ["enum Test:\n"]
# Enough entities to blow through the default recursion limit and the
# bumped limit that was previously in place.
for i in range(max(sys.getrecursionlimit(), 16 * 1024) * 2):
emb.append(f" VALUE_{i} = {i}\n")
parse_result = parser.parse_module(
tokenizer.tokenize("".join(emb), "long.emb")[0]
)
module_ir.build_ir(parse_result.parse_tree)


def _make_superset_tests():

Expand Down
8 changes: 8 additions & 0 deletions compiler/util/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,14 @@ py_test(
],
)

py_library(
name = "parser_util",
srcs = ["parser_util.py"],
deps = [
":parser_types",
],
)

py_library(
name = "error",
srcs = [
Expand Down
Loading

0 comments on commit 1827594

Please sign in to comment.