Skip to content

Commit

Permalink
Handle very large .emb files. (#215)
Browse files Browse the repository at this point in the history
This change switches parse tree handling to use iteration (with an
explicit stack) instead of recursion, which:

*   Allows large (>~1000 entity) `.emb` files to be formatted.
*   Allows very large (>~16k entity) `.emb` files to be compiled.

The difference in sizes in the previous code was due to `module_ir.py`
hackily increasing the recursion limit: while this more or less worked,
it was a little dangerous (it ran the risk of blowing out the C stack,
depending on the platform), and it merely raised the limit rather than
removing it.  This change removes the limit entirely (at least, up to
the available memory on the system).
  • Loading branch information
reventlov authored Dec 21, 2024
1 parent 615a955 commit 1827594
Show file tree
Hide file tree
Showing 7 changed files with 259 additions and 93 deletions.
2 changes: 2 additions & 0 deletions compiler/front_end/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ py_library(
"//compiler/util:ir_data",
"//compiler/util:name_conversion",
"//compiler/util:parser_types",
"//compiler/util:parser_util",
],
)

Expand Down Expand Up @@ -436,6 +437,7 @@ py_library(
":module_ir",
":tokenizer",
"//compiler/util:parser_types",
"//compiler/util:parser_util",
],
)

Expand Down
24 changes: 12 additions & 12 deletions compiler/front_end/format_emb.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from compiler.front_end import module_ir
from compiler.front_end import tokenizer
from compiler.util import parser_types
from compiler.util import parser_util


class Config(collections.namedtuple("Config", ["indent_width", "show_line_types"])):
Expand Down Expand Up @@ -67,18 +68,17 @@ def format_emboss_parse_tree(parse_tree, config, used_productions=None):
Returns:
A string of the reformatted source text.
"""
if hasattr(parse_tree, "children"):
parsed_children = [
format_emboss_parse_tree(child, config, used_productions)
for child in parse_tree.children
]
args = parsed_children + [config]
if used_productions is not None:
used_productions.add(parse_tree.production)
return _formatters[parse_tree.production](*args)
else:
assert isinstance(parse_tree, parser_types.Token), str(parse_tree)
return parse_tree.text
formatters = {}
for production, handler in _formatters.items():
# An extra layer of indirection is required here so that the resulting
# lambda does not capture the local variable `handler`.
def wrapped_handler(handler):
return lambda _, *args: handler(*(args + (config,)))

formatters[production] = wrapped_handler(handler)
return parser_util.transform_parse_tree(
parse_tree, lambda n: n.text, formatters, used_productions
)


def sanity_check_format_result(formatted_text, original_text):
Expand Down
16 changes: 15 additions & 1 deletion compiler/front_end/format_emb_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,21 @@ def test_eol_missing(self):


class FormatEmbTest(unittest.TestCase):
pass

def test_very_long_emb(self):
"""Checks that very long inputs do not hit the Python recursion limit."""
emb = ["enum Test:\n"]
# Enough entities to blow through the default recursion limit and the
# bumped limit that was previously in place.
for i in range(max(sys.getrecursionlimit(), 16 * 1024) * 2):
emb.append(f" VALUE_{i} = {i}\n")
parsed_unformatted = parser.parse_module(
tokenizer.tokenize("".join(emb), "long.emb")[0]
)
formatted_text = format_emb.format_emboss_parse_tree(
parsed_unformatted.parse_tree,
format_emb.Config(indent_width=2),
)


def _make_golden_file_tests():
Expand Down
151 changes: 71 additions & 80 deletions compiler/front_end/module_ir.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from compiler.util import ir_data_utils
from compiler.util import name_conversion
from compiler.util import parser_types
from compiler.util import parser_util


# Intermediate types; should not be found in the final IR.
Expand Down Expand Up @@ -82,88 +83,78 @@ def __init__(self, field, subtypes=None):
def build_ir(parse_tree, used_productions=None):
r"""Builds a module-level intermediate representation from a valid parse tree.
The parse tree is precisely dictated by the exact productions in the grammar
used by the parser, with no semantic information. _really_build_ir transforms
this "raw" form into a stable, cooked representation, thereby isolating
subsequent steps from the exact details of the grammar.
(Probably incomplete) list of transformations:
* ParseResult and Token nodes are replaced with Module, Attribute, Struct,
Type, etc. objects.
* Purely syntactic tokens ('"["', '"struct"', etc.) are discarded.
* Repeated elements are transformed from tree form to list form:
a*
/ \
b a*
The parse tree is precisely dictated by the exact productions in the grammar
used by the parser, with no semantic information. _really_build_ir
transforms this "raw" form into a stable, cooked representation, thereby
isolating subsequent steps from the exact details of the grammar.
(Probably incomplete) list of transformations:
* ParseResult and Token nodes are replaced with Module, Attribute, Struct,
Type, etc. objects.
* Purely syntactic tokens ('"["', '"struct"', etc.) are discarded.
* Repeated elements are transformed from tree form to list form:
a*
/ \
c a*
b a*
/ \
d a*
(where b, c, and d are nodes of type "a") becomes [b, c, d].
* The values of numeric constants (Number, etc. tokens) are parsed.
* Different classes of names (snake_names, CamelNames, ShoutyNames) are
folded into a single "Name" type, since they are guaranteed to appear in
the correct places in the parse tree.
Arguments:
parse_tree: A parse tree. Each leaf node should be a parser_types.Token
object, and each non-leaf node should have a 'symbol' attribute specifying
which grammar symbol it represents, and a 'children' attribute containing
a list of child nodes. This is the format returned by the parsers
produced by the lr1 module, when run against tokens from the tokenizer
module.
used_productions: If specified, used_productions.add() will be called with
each production actually used in parsing. This can be useful when
developing the grammar and writing tests; in particular, it can be used to
figure out which productions are *not* used when parsing a particular
file.
Returns:
A module-level intermediate representation (module IR) for an Emboss module
(source file). This IR will not have symbols resolved; that must be done on
a forest of module IRs so that names from other modules can be resolved.
"""

# TODO(b/140259131): Refactor _really_build_ir to be less recursive/use an
# explicit stack.
old_recursion_limit = sys.getrecursionlimit()
sys.setrecursionlimit(16 * 1024) # ~8000 top-level entities in one module.
try:
result = _really_build_ir(parse_tree, used_productions)
finally:
sys.setrecursionlimit(old_recursion_limit)
return result


def _really_build_ir(parse_tree, used_productions):
"""Real implementation of build_ir()."""
if used_productions is None:
used_productions = set()
if hasattr(parse_tree, "children"):
parsed_children = [
_really_build_ir(child, used_productions) for child in parse_tree.children
]
used_productions.add(parse_tree.production)
result = _handlers[parse_tree.production](*parsed_children)
if parse_tree.source_location:
if isinstance(result, tuple):
result = result._replace(source_location=parse_tree.source_location)
else:
result.source_location = parse_tree.source_location
return result
else:
# For leaf nodes, the temporary "IR" is just the token. Higher-level rules
# will translate it to a real IR.
assert isinstance(parse_tree, parser_types.Token), str(parse_tree)
return parse_tree
c a*
/ \
d a*
(where b, c, and d are nodes of type "a") becomes [b, c, d].
* The values of numeric constants (Number, etc. tokens) are parsed.
* Different classes of names (snake_names, CamelNames, ShoutyNames) are
folded into a single "Name" type, since they are guaranteed to appear in
the correct places in the parse tree.
Arguments:
parse_tree: A parse tree. Each leaf node should be a parser_types.Token
object, and each non-leaf node should have a 'symbol' attribute
specifying which grammar symbol it represents, and a 'children'
attribute containing a list of child nodes. This is the format
returned by the parsers produced by the lr1 module, when run
against tokens from the tokenizer module.
used_productions: If specified, used_productions.add() will be called
with each production actually used in parsing. This can be useful
when developing the grammar and writing tests; in particular, it
can be used to figure out which productions are *not* used when
parsing a particular file.
Returns:
A module-level intermediate representation (module IR) for an Emboss
module (source file). This IR will not have symbols resolved,
constraints checked, fields synthesized, etc.; it will only be a
representation of the syntactic elements of the source.
"""
handlers = {}
for production, handler in _handlers.items():
# An extra layer of indirection is required here so that the resulting
# lambda does not capture the local variable `handler`.
def wrapped_handler(handler):
def wrapped_handler(node, *args):
module_node = handler(*args)
if node.source_location:
if isinstance(module_node, tuple):
module_node = module_node._replace(
source_location=node.source_location
)
else:
module_node.source_location = node.source_location
return module_node

return wrapped_handler

handlers[production] = wrapped_handler(handler)
return parser_util.transform_parse_tree(
parse_tree, lambda n: n, handlers, used_productions
)


# Map of productions to their handlers.
Expand Down
13 changes: 13 additions & 0 deletions compiler/front_end/module_ir_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import collections
import pkgutil
import sys
import unittest

from compiler.front_end import module_ir
Expand Down Expand Up @@ -4057,6 +4058,18 @@ def test_double_negative_non_compilation(self):
parse_result = parser.parse_module(tokenizer.tokenize(example, "")[0])
self.assertFalse(parse_result.error)

def test_long_input(self):
"""Checks that very long inputs do not hit the Python recursion limit."""
emb = ["enum Test:\n"]
# Enough entities to blow through the default recursion limit and the
# bumped limit that was previously in place.
for i in range(max(sys.getrecursionlimit(), 16 * 1024) * 2):
emb.append(f" VALUE_{i} = {i}\n")
parse_result = parser.parse_module(
tokenizer.tokenize("".join(emb), "long.emb")[0]
)
module_ir.build_ir(parse_result.parse_tree)


def _make_superset_tests():

Expand Down
8 changes: 8 additions & 0 deletions compiler/util/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,14 @@ py_test(
],
)

py_library(
name = "parser_util",
srcs = ["parser_util.py"],
deps = [
":parser_types",
],
)

py_library(
name = "error",
srcs = [
Expand Down
Loading

0 comments on commit 1827594

Please sign in to comment.