Convert Location to a namedtuple, and associated cleanup (#205)

This change makes the `Location` dataclass, which does not change frequently, into a new `SourceLocation` namedtuple, and changes the `SourceLocation` serialization. As a result, with this change: * `embossc` runs about 25% faster on a large (7kLOC) input; `python3 -OO emboss` runs about 19% faster on the same input. * Serialized IR is about 45% smaller. Details: * Replace the `ir_data.Location` dataclass with a new `parser_types.SourceLocation` namedtuple. The rename helps clarify the difference between a location within source code (`SourceLocation`) and a location within a structure (`FieldLocation`). * Similarly, replace `ir_data.Position` with `parser_types.SourcePosition`. * Update any place that edits a `SourceLocation` with an appropriate assignment; e.g., `x.source_location.end = y` becomes `x.source_location = x.source_location._replace(end=y)`. In most cases, several fields were updated consecutively; those updates are been merged. * Update the JSON serialization to use the compact format. * Replace `format_location()` and `format_position()` with `__str__()` methods on `SourceLocation` and `SourcePosition`, respectively. * Replace `parse_location()` and `parse_position()` with `from_str()` class methods on `SourceLocation` and `SourcePosition`, respectively. * Move the `make_location()` functionality into `SourceLocation.__new__()`. * Update `_to_dict` and `_from_dict` in `IrDataSerializer` to stringify and destringify `SourceLocation`. It is tempting to try to do this during the JSON serialization step (with a `default=` parameter to `json.dumps` and an `object_hook=` parameter to `json.loads`), but it is tricky to get the `object_hook` to know when to convert.
google · Oct 17, 2024 · 73cbd98 · 73cbd98
1 parent 46423da
commit 73cbd98
Show file tree

Hide file tree

Showing 29 changed files with 780 additions and 1,523 deletions.
diff --git a/compiler/back_end/cpp/header_generator.py b/compiler/back_end/cpp/header_generator.py
@@ -1747,9 +1747,14 @@ def _offset_source_location_column(source_location, offset):
         original start column.
     """
 
-    new_location = ir_data_utils.copy(source_location)
-    new_location.start.column = source_location.start.column + offset[0]
-    new_location.end.column = source_location.start.column + offset[1]
+    new_location = source_location._replace(
+        start=source_location.start._replace(
+            column=source_location.start.column + offset[0]
+        ),
+        end=source_location.start._replace(
+            column=source_location.start.column + offset[1]
+        ),
+    )
 
     return new_location
 

diff --git a/compiler/back_end/cpp/header_generator_test.py b/compiler/back_end/cpp/header_generator_test.py
@@ -126,12 +126,11 @@ def test_rejects_bad_enum_case_at_start(self):
         )
         attr = ir.module[0].type[0].attribute[0]
 
-        bad_case_source_location = ir_data.Location()
-        bad_case_source_location = ir_data_utils.builder(bad_case_source_location)
-        bad_case_source_location.CopyFrom(attr.value.source_location)
-        # Location of SHORTY_CASE in the attribute line.
-        bad_case_source_location.start.column = 30
-        bad_case_source_location.end.column = 41
+        # SourceLocation of SHORTY_CASE in the attribute line.
+        bad_case_source_location = attr.value.source_location._replace(
+            start=attr.value.source_location.start._replace(column=30),
+            end=attr.value.source_location.end._replace(column=41),
+        )
 
         self.assertEqual(
             [
@@ -156,12 +155,11 @@ def test_rejects_bad_enum_case_in_middle(self):
         )
         attr = ir.module[0].type[0].attribute[0]
 
-        bad_case_source_location = ir_data.Location()
-        bad_case_source_location = ir_data_utils.builder(bad_case_source_location)
-        bad_case_source_location.CopyFrom(attr.value.source_location)
-        # Location of bad_CASE in the attribute line.
-        bad_case_source_location.start.column = 43
-        bad_case_source_location.end.column = 51
+        # SourceLocation of bad_CASE in the attribute line.
+        bad_case_source_location = attr.value.source_location._replace(
+            start=attr.value.source_location.start._replace(column=43),
+            end=attr.value.source_location.end._replace(column=51),
+        )
 
         self.assertEqual(
             [
@@ -186,12 +184,11 @@ def test_rejects_bad_enum_case_at_end(self):
         )
         attr = ir.module[0].type[0].attribute[0]
 
-        bad_case_source_location = ir_data.Location()
-        bad_case_source_location = ir_data_utils.builder(bad_case_source_location)
-        bad_case_source_location.CopyFrom(attr.value.source_location)
-        # Location of BAD_case in the attribute line.
-        bad_case_source_location.start.column = 55
-        bad_case_source_location.end.column = 63
+        # SourceLocation of BAD_case in the attribute line.
+        bad_case_source_location = attr.value.source_location._replace(
+            start=attr.value.source_location.start._replace(column=55),
+            end=attr.value.source_location.end._replace(column=63),
+        )
 
         self.assertEqual(
             [
@@ -216,12 +213,11 @@ def test_rejects_duplicate_enum_case(self):
         )
         attr = ir.module[0].type[0].attribute[0]
 
-        bad_case_source_location = ir_data.Location()
-        bad_case_source_location = ir_data_utils.builder(bad_case_source_location)
-        bad_case_source_location.CopyFrom(attr.value.source_location)
-        # Location of the second SHOUTY_CASE in the attribute line.
-        bad_case_source_location.start.column = 43
-        bad_case_source_location.end.column = 54
+        # SourceLocation of the second SHOUTY_CASE in the attribute line.
+        bad_case_source_location = attr.value.source_location._replace(
+            start=attr.value.source_location.start._replace(column=43),
+            end=attr.value.source_location.end._replace(column=54),
+        )
 
         self.assertEqual(
             [
@@ -246,12 +242,11 @@ def test_rejects_empty_enum_case(self):
         )
         attr = ir.module[0].type[0].attribute[0]
 
-        bad_case_source_location = ir_data.Location()
-        bad_case_source_location = ir_data_utils.builder(bad_case_source_location)
-        bad_case_source_location.CopyFrom(attr.value.source_location)
-        # Location of excess comma.
-        bad_case_source_location.start.column = 42
-        bad_case_source_location.end.column = 42
+        # SourceLocation of excess comma.
+        bad_case_source_location = attr.value.source_location._replace(
+            start=attr.value.source_location.start._replace(column=42),
+            end=attr.value.source_location.end._replace(column=42),
+        )
 
         self.assertEqual(
             [
@@ -274,8 +269,10 @@ def test_rejects_empty_enum_case(self):
             "  BAZ = 2\n"
         )
 
-        bad_case_source_location.start.column = 30
-        bad_case_source_location.end.column = 30
+        bad_case_source_location = attr.value.source_location._replace(
+            start=attr.value.source_location.start._replace(column=30),
+            end=attr.value.source_location.end._replace(column=30),
+        )
 
         self.assertEqual(
             [
@@ -298,8 +295,10 @@ def test_rejects_empty_enum_case(self):
             "  BAZ = 2\n"
         )
 
-        bad_case_source_location.start.column = 54
-        bad_case_source_location.end.column = 54
+        bad_case_source_location = attr.value.source_location._replace(
+            start=attr.value.source_location.start._replace(column=54),
+            end=attr.value.source_location.end._replace(column=54),
+        )
 
         self.assertEqual(
             [
@@ -322,8 +321,10 @@ def test_rejects_empty_enum_case(self):
             "  BAZ = 2\n"
         )
 
-        bad_case_source_location.start.column = 45
-        bad_case_source_location.end.column = 45
+        bad_case_source_location = attr.value.source_location._replace(
+            start=attr.value.source_location.start._replace(column=45),
+            end=attr.value.source_location.end._replace(column=45),
+        )
 
         self.assertEqual(
             [
@@ -346,8 +347,10 @@ def test_rejects_empty_enum_case(self):
             "  BAZ = 2\n"
         )
 
-        bad_case_source_location.start.column = 30
-        bad_case_source_location.end.column = 30
+        bad_case_source_location = attr.value.source_location._replace(
+            start=attr.value.source_location.start._replace(column=30),
+            end=attr.value.source_location.end._replace(column=30),
+        )
 
         self.assertEqual(
             [
@@ -370,8 +373,10 @@ def test_rejects_empty_enum_case(self):
             "  BAZ = 2\n"
         )
 
-        bad_case_source_location.start.column = 35
-        bad_case_source_location.end.column = 35
+        bad_case_source_location = attr.value.source_location._replace(
+            start=attr.value.source_location.start._replace(column=35),
+            end=attr.value.source_location.end._replace(column=35),
+        )
 
         self.assertEqual(
             [
@@ -394,8 +399,10 @@ def test_rejects_empty_enum_case(self):
             "  BAZ = 2\n"
         )
 
-        bad_case_source_location.start.column = 31
-        bad_case_source_location.end.column = 31
+        bad_case_source_location = attr.value.source_location._replace(
+            start=attr.value.source_location.start._replace(column=31),
+            end=attr.value.source_location.end._replace(column=31),
+        )
 
         self.assertEqual(
             [

diff --git a/compiler/front_end/glue.py b/compiler/front_end/glue.py
@@ -203,7 +203,7 @@ def parse_module(file_name, file_reader):
     """
     source_code, errors = file_reader(file_name)
     if errors:
-        location = parser_types.make_location((1, 1), (1, 1))
+        location = parser_types.SourceLocation((1, 1), (1, 1))
         return (
             None,
             None,

diff --git a/compiler/front_end/glue_test.py b/compiler/front_end/glue_test.py
@@ -24,7 +24,7 @@
 from compiler.util import parser_types
 from compiler.util import test_util
 
-_location = parser_types.make_location
+_location = parser_types.SourceLocation
 
 _ROOT_PACKAGE = "testdata.golden"
 _GOLDEN_PATH = ""
@@ -232,7 +232,9 @@ def test_synthetic_error(self):
         self.assertFalse(errors)
         # Artificially mark the first field as is_synthetic.
         first_field = ir.module[0].type[0].structure.field[0]
-        first_field.source_location.is_synthetic = True
+        first_field.source_location = first_field.source_location._replace(
+            is_synthetic=True
+        )
         ir, errors = glue.process_ir(ir, None)
         self.assertTrue(errors)
         self.assertEqual(
@@ -259,8 +261,12 @@ def test_suppressed_synthetic_error(self):
         self.assertFalse(errors)
         # Artificially mark the name of the second field as is_synthetic.
         second_field = ir.module[0].type[0].structure.field[1]
-        second_field.name.source_location.is_synthetic = True
-        second_field.name.name.source_location.is_synthetic = True
+        second_field.name.source_location = second_field.name.source_location._replace(
+            is_synthetic=True
+        )
+        second_field.name.name.source_location = (
+            second_field.name.name.source_location._replace(is_synthetic=True)
+        )
         ir, errors = glue.process_ir(ir, None)
         self.assertEqual(1, len(errors))
         self.assertEqual("Duplicate name 'field'", errors[0][0].message)

diff --git a/compiler/front_end/lr1.py b/compiler/front_end/lr1.py
@@ -140,7 +140,9 @@ def __str__(self):
 
 # ANY_TOKEN is used by mark_error as a "wildcard" token that should be replaced
 # by every other token.
-ANY_TOKEN = parser_types.Token(object(), "*", parser_types.parse_location("0:0-0:0"))
+ANY_TOKEN = parser_types.Token(
+    object(), "*", parser_types.SourceLocation.from_str("0:0-0:0")
+)
 
 
 class Reduction(
@@ -690,26 +692,7 @@ def state():
                 # children, setting the source_location to None in that case.
                 start_position = None
                 end_position = None
-                for child in children:
-                    if (
-                        hasattr(child, "source_location")
-                        and child.source_location is not None
-                    ):
-                        start_position = child.source_location.start
-                        break
-                for child in reversed(children):
-                    if (
-                        hasattr(child, "source_location")
-                        and child.source_location is not None
-                    ):
-                        end_position = child.source_location.end
-                        break
-                if start_position is None:
-                    source_location = None
-                else:
-                    source_location = parser_types.make_location(
-                        start_position, end_position
-                    )
+                source_location = parser_types.merge_source_locations(*children)
                 reduction = Reduction(
                     next_action.rule.lhs, children, next_action.rule, source_location
                 )

diff --git a/compiler/front_end/lr1_test.py b/compiler/front_end/lr1_test.py
@@ -34,7 +34,7 @@ def _tokenize(text):
     result = []
     for i in range(len(text)):
         result.append(
-            Token(text[i], parser_types.make_location((1, i + 1), (1, i + 2)))
+            Token(text[i], parser_types.SourceLocation((1, i + 1), (1, i + 2)))
         )
     return result
 
@@ -209,7 +209,7 @@ def test_goto_table(self):
 
     def test_successful_parse(self):
         parser = _alsu_grammar.parser()
-        loc = parser_types.parse_location
+        loc = parser_types.SourceLocation.from_str
         s_to_c_c = parser_types.Production.parse("S -> C C")
         c_to_c_c = parser_types.Production.parse("C -> c C")
         c_to_d = parser_types.Production.parse("C -> d")