From 7e4f8f4b8dafbfbb90ca3f9638fa31024c4eb2db Mon Sep 17 00:00:00 2001
From: Vincent Fazio <vfazio@gmail.com>
Date: Sun, 29 Dec 2024 09:01:20 -0600
Subject: [PATCH] Make {eol_}comments_re read-only and non-init arguments in
 `ParserConfig` (#352)

* [buffering] drop forced multiline match for string patterns

Previously, when scanning for matches to a regex, if the type of the
pattern was `str`, the pattern was always compiled with `re.MULTILINE`.

Recent changes to `ParserConfig` [0] changed the type used for regex
matches in generated code from `str` to `re.Pattern`. This could lead to
a difference in behavior from previous versions, where a user-defined
`comments` or `eol_comments` pattern may have been implicitly relying on
the `re.MULTILINE` flag.

After discussion [1], it has been determined that usage of `re` flags
within TatSu should be deprecated in favor of users specifying the
necessary flags within patterns.

As such, drop the `re.MULTILINE` flag for strings compiled on the fly.

[0]: https://github.com/neogeny/TatSu/pull/338
[1]: https://github.com/neogeny/TatSu/issues/351#issuecomment-2563635784

* [grammar] make eol_comments multiline match

Make the default eol_comments regex use multiline matching.

Recent changes to `ParserConfig` [0] now use a precompiled regex (an
`re.Pattern`) instead of compiling the `str` regex on the fly.

When compiling a pattern, the `Tokenizer` previously assumed that all
`str` type regexes should be `re.MULTILINE`, regardless of the options
defined in the regex itself. The flag is no longer applied
automatically, so configurations must now specify the option in the
pattern.

[0]: https://github.com/neogeny/TatSu/pull/338

* [infos] make {eol_}comments_re read-only attributes

Previously, the `eol_comments_re` and `comments_re` attributes were
public init arguments, were modifiable, and could thus become out of
sync with the `eol_comments` and `comments` attributes.

Also, with recent changes to `ParserConfig` [0], there were two ways to
initialize the regex values for the comments and eol_comments
directives: either via the constructor using the *_re arguments, or by
using the sister string arguments and relying on `__post_init__` to
compile the values, which trumped any explicit *_re argument values.

Now, the constructor interface has been simplified to not take either
`eol_comments_re` or `comments_re` as arguments. Callers may only use
`eol_comments` and `comments`.

The `eol_comments_re` and `comments_re` attributes are still
public, but are read-only so they are always a reflection of their
sister string values passed into the constructor.

[0]: https://github.com/neogeny/TatSu/pull/200

* [codegen] migrate to {eol_}comments

* [ngcodegen] migrate to {eol_}comments

* [bootstrap] migrate to {eol_}comments

* [lint] resolve errors

* [docs] note {eol_}comments directive behavior changes

* [docs] update syntax to reflect {eol_}comments arguments

* [test] fix test_parse_hash to use eol_comments

* [test] explicitly use multiline match in test_patterns_with_newlines
---
 docs/directives.rst             |  4 ++++
 docs/syntax.rst                 |  8 ++++----
 grammar/tatsu.ebnf              |  2 +-
 tatsu/bootstrap.py              |  8 ++++----
 tatsu/buffering.py              |  2 +-
 tatsu/codegen/objectmodel.py    |  6 +++---
 tatsu/codegen/python.py         | 16 ++++++++--------
 tatsu/g2e/semantics.py          |  2 +-
 tatsu/grammars.py               |  2 +-
 tatsu/infos.py                  | 34 ++++++++++++++++++++++++++-------
 tatsu/ngcodegen/python.py       |  4 ++--
 tatsu/util/_common.py           |  2 +-
 tatsu/walkers.py                |  2 +-
 test/grammar/pattern_test.py    |  2 +-
 test/grammar/syntax_test.py     |  2 +-
 test/parser_equivalence_test.py |  7 +++----
 16 files changed, 63 insertions(+), 40 deletions(-)

diff --git a/docs/directives.rst b/docs/directives.rst
index 82852984..4a765dfe 100644
--- a/docs/directives.rst
+++ b/docs/directives.rst
@@ -29,6 +29,8 @@ Specifies a regular expression to identify and exclude inline (bracketed) commen
 
     @@comments :: /\(\*((?:.|\n)*?)\*\)/
 
+.. note::
+   Prior to 5.12.1, comments implicitly had the `(?m) <https://docs.python.org/3/library/re.html#re.MULTILINE>`_ option defined. This is no longer the case.
 
 ``@@eol_comments :: <regexp>``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -39,6 +41,8 @@ Specifies a regular expression to identify and exclude end-of-line comments befo
 
     @@eol_comments :: /#([^\n]*?)$/
 
+.. note::
+   Prior to 5.12.1, eol_comments implicitly had the `(?m) <https://docs.python.org/3/library/re.html#re.MULTILINE>`_ option defined. This is no longer the case.
 
 ``@@ignorecase :: <bool>``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/docs/syntax.rst b/docs/syntax.rst
index 1a5ad10c..b1dcc603 100644
--- a/docs/syntax.rst
+++ b/docs/syntax.rst
@@ -735,11 +735,11 @@ Comments
 ~~~~~~~~
 
 Parsers will skip over comments specified as a regular expression using
-the ``comments_re`` parameter:
+the ``comments`` parameter:
 
 .. code:: python
 
-   parser = MyParser(text, comments_re="\(\*.*?\*\)")
+   parser = MyParser(text, comments="\(\*.*?\*\)")
 
 For more complex comment handling, you can override the
 ``Buffer.eat_comments()`` method.
@@ -751,8 +751,8 @@ comments separately:
 
    parser = MyParser(
        text,
-       comments_re="\(\*.*?\*\)",
-       eol_comments_re="#.*?$"
+       comments="\(\*.*?\*\)",
+       eol_comments="#.*?$"
    )
 
 Both patterns may also be specified within a grammar using the
diff --git a/grammar/tatsu.ebnf b/grammar/tatsu.ebnf
index 870caae7..b955d6a2 100644
--- a/grammar/tatsu.ebnf
+++ b/grammar/tatsu.ebnf
@@ -1,7 +1,7 @@
 @@grammar :: TatSu
 @@whitespace :: /\s+/
 @@comments :: ?"(?sm)[(][*](?:.|\n)*?[*][)]"
-@@eol_comments :: ?"#[^\n]*$"
+@@eol_comments :: ?"(?m)#[^\n]*$"
 @@parseinfo :: True
 @@left_recursion :: False
 
diff --git a/tatsu/bootstrap.py b/tatsu/bootstrap.py
index 4f656b2a..87c925c7 100644
--- a/tatsu/bootstrap.py
+++ b/tatsu/bootstrap.py
@@ -35,8 +35,8 @@ def __init__(self, text, /, config: ParserConfig | None = None, **settings):
             ignorecase=False,
             namechars='',
             parseinfo=True,
-            comments_re='(?sm)[(][*](?:.|\\n)*?[*][)]',
-            eol_comments_re='#[^\\n]*$',
+            comments='(?sm)[(][*](?:.|\\n)*?[*][)]',
+            eol_comments='(?m)#[^\\n]*$',
             keywords=KEYWORDS,
             start='start',
         )
@@ -55,8 +55,8 @@ def __init__(self, /, config: ParserConfig | None = None, **settings):
             ignorecase=False,
             namechars='',
             parseinfo=True,
-            comments_re='(?sm)[(][*](?:.|\\n)*?[*][)]',
-            eol_comments_re='#[^\\n]*$',
+            comments='(?sm)[(][*](?:.|\\n)*?[*][)]',
+            eol_comments='(?m)#[^\\n]*$',
             keywords=KEYWORDS,
             start='start',
         )
diff --git a/tatsu/buffering.py b/tatsu/buffering.py
index 87358d99..5a2a91fd 100644
--- a/tatsu/buffering.py
+++ b/tatsu/buffering.py
@@ -357,7 +357,7 @@ def _scanre(self, pattern):
         if isinstance(pattern, RETYPE):
             cre = pattern
         else:
-            cre = re.compile(pattern, re.MULTILINE)
+            cre = re.compile(pattern)
         return cre.match(self.text, self.pos)
 
     @property
diff --git a/tatsu/codegen/objectmodel.py b/tatsu/codegen/objectmodel.py
index d52ea9df..bc787f59 100644
--- a/tatsu/codegen/objectmodel.py
+++ b/tatsu/codegen/objectmodel.py
@@ -67,11 +67,11 @@ def _get_full_name(cls):
     # Try to reference the class
     try:
         idents = name.split('.')
-        _cls = getattr(module, idents[0])
+        cls_ = getattr(module, idents[0])
         for ident in idents[1:]:
-            _cls = getattr(_cls, ident)
+            cls_ = getattr(cls_, ident)
 
-        assert _cls == cls
+        assert cls_ == cls
     except AttributeError as e:
         raise CodegenError(
             "Couldn't find base type, it has to be importable",
diff --git a/tatsu/codegen/python.py b/tatsu/codegen/python.py
index 31e0dea9..f25e1d8c 100755
--- a/tatsu/codegen/python.py
+++ b/tatsu/codegen/python.py
@@ -462,8 +462,8 @@ def render_fields(self, fields):
         left_recursion = self.node.config.left_recursion
         parseinfo = self.node.config.parseinfo
         namechars = repr(self.node.config.namechars or '')
-        comments_re = repr(self.node.config.comments_re)
-        eol_comments_re = repr(self.node.config.eol_comments_re)
+        comments = repr(self.node.config.comments)
+        eol_comments = repr(self.node.config.eol_comments)
 
         rules = '\n'.join(
             [self.get_renderer(rule).render() for rule in self.node.rules],
@@ -488,8 +488,8 @@ def render_fields(self, fields):
             parseinfo=parseinfo,
             keywords=keywords,
             namechars=namechars,
-            comments_re=comments_re,
-            eol_comments_re=eol_comments_re,
+            comments=comments,
+            eol_comments=eol_comments,
         )
 
     abstract_rule_template = """
@@ -535,8 +535,8 @@ def __init__(self, text, /, config: ParserConfig | None = None, **settings):
                             ignorecase={ignorecase},
                             namechars={namechars},
                             parseinfo={parseinfo},
-                            comments_re={comments_re},
-                            eol_comments_re={eol_comments_re},
+                            comments={comments},
+                            eol_comments={eol_comments},
                             keywords=KEYWORDS,
                             start={start!r},
                         )
@@ -554,8 +554,8 @@ def __init__(self, /, config: ParserConfig | None = None, **settings):
                             ignorecase={ignorecase},
                             namechars={namechars},
                             parseinfo={parseinfo},
-                            comments_re={comments_re},
-                            eol_comments_re={eol_comments_re},
+                            comments={comments},
+                            eol_comments={eol_comments},
                             left_recursion={left_recursion},
                             keywords=KEYWORDS,
                             start={start!r},
diff --git a/tatsu/g2e/semantics.py b/tatsu/g2e/semantics.py
index 982ed777..ccf0b497 100644
--- a/tatsu/g2e/semantics.py
+++ b/tatsu/g2e/semantics.py
@@ -9,7 +9,7 @@
 
 def camel2py(name):
     return re.sub(
-        '([a-z0-9])([A-Z])',
+        r'([a-z0-9])([A-Z])',
         lambda m: m.group(1) + '_' + m.group(2).lower(),
         name,
     )
diff --git a/tatsu/grammars.py b/tatsu/grammars.py
index 65def8b9..66f2173b 100644
--- a/tatsu/grammars.py
+++ b/tatsu/grammars.py
@@ -519,7 +519,7 @@ def _to_str(self, lean=False):
 
         if multi:
             return '\n|\n'.join(indent(o) for o in options)
-        elif len(options) and len(single) > PEP8_LLEN:
+        elif options and len(single) > PEP8_LLEN:
             return '| ' + '\n| '.join(o for o in options)
         else:
             return single
diff --git a/tatsu/infos.py b/tatsu/infos.py
index 6ec898ad..3bba14f2 100644
--- a/tatsu/infos.py
+++ b/tatsu/infos.py
@@ -3,7 +3,7 @@
 import copy
 import dataclasses
 import re
-from collections.abc import Callable, Mapping
+from collections.abc import Callable, MutableMapping
 from itertools import starmap
 from typing import Any, NamedTuple
 
@@ -30,8 +30,8 @@ class ParserConfig:
     start_rule: str | None = None  # FIXME
     rule_name: str | None = None  # Backward compatibility
 
-    comments_re: re.Pattern | None = None
-    eol_comments_re: re.Pattern | None = None
+    _comments_re: re.Pattern | None = dataclasses.field(default=None, init=False, repr=False)
+    _eol_comments_re: re.Pattern | None = dataclasses.field(default=None, init=False, repr=False)
 
     tokenizercls: type[Tokenizer] | None = None  # FIXME
     semantics: type | None = None
@@ -64,9 +64,17 @@ def __post_init__(self):  # pylint: disable=W0235
         if self.ignorecase:
             self.keywords = [k.upper() for k in self.keywords]
         if self.comments:
-            self.comments_re = re.compile(self.comments)
+            self._comments_re = re.compile(self.comments)
         if self.eol_comments:
-            self.eol_comments_re = re.compile(self.eol_comments)
+            self._eol_comments_re = re.compile(self.eol_comments)
+
+    @property
+    def comments_re(self) -> re.Pattern | None:
+        return self._comments_re
+
+    @property
+    def eol_comments_re(self) -> re.Pattern | None:
+        return self._eol_comments_re
 
     @classmethod
     def new(
@@ -84,7 +92,7 @@ def effective_rule_name(self):
         # note: there are legacy reasons for this mess
         return self.start_rule or self.rule_name or self.start
 
-    def _find_common(self, **settings: Any) -> Mapping[str, Any]:
+    def _find_common(self, **settings: Any) -> MutableMapping[str, Any]:
         return {
             name: value
             for name, value in settings.items()
@@ -101,8 +109,20 @@ def replace_config(
         else:
             return self.replace(**vars(other))
 
+    # non-init fields cannot be used as arguments in `replace`, however
+    # they are values returned by `vars` and `dataclasses.asdict` so they
+    # must be filtered out.
+    # If the `ParserConfig` dataclass drops these fields, then this filter can be removed
+    def _filter_non_init_fields(self, settings: MutableMapping[str, Any]) -> MutableMapping[str, Any]:
+        for field in [
+            field.name for field in dataclasses.fields(self) if not field.init
+        ]:
+            if field in settings:
+                del settings[field]
+        return settings
+
     def replace(self, **settings: Any) -> ParserConfig:
-        overrides = self._find_common(**settings)
+        overrides = self._filter_non_init_fields(self._find_common(**settings))
         result = dataclasses.replace(self, **overrides)
         if 'grammar' in overrides:
             result.name = result.grammar
diff --git a/tatsu/ngcodegen/python.py b/tatsu/ngcodegen/python.py
index 6a83e5c5..76583377 100644
--- a/tatsu/ngcodegen/python.py
+++ b/tatsu/ngcodegen/python.py
@@ -323,8 +323,8 @@ def _gen_init(self, grammar: grammars.Grammar):
                     ignorecase={grammar.config.ignorecase},
                     namechars={grammar.config.namechars!r},
                     parseinfo={grammar.config.parseinfo},
-                    comments_re={grammar.config.comments_re!r},
-                    eol_comments_re={grammar.config.eol_comments_re!r},
+                    comments={grammar.config.comments!r},
+                    eol_comments={grammar.config.eol_comments!r},
                     keywords=KEYWORDS,
                     start={start!r},
                 )
diff --git a/tatsu/util/_common.py b/tatsu/util/_common.py
index c0819064..1123e9fb 100644
--- a/tatsu/util/_common.py
+++ b/tatsu/util/_common.py
@@ -27,7 +27,7 @@
 logger.addHandler(ch)
 
 
-RETYPE = type(re.compile('.'))
+RETYPE = re.Pattern
 
 
 ESCAPE_SEQUENCE_RE = re.compile(
diff --git a/tatsu/walkers.py b/tatsu/walkers.py
index 3de070ea..7762a4d7 100644
--- a/tatsu/walkers.py
+++ b/tatsu/walkers.py
@@ -74,7 +74,7 @@ def pythonize_match(m):
 
             # walk__pythonic_name with double underscore after walk
             pythonic_name = re.sub(
-                '[A-Z]+', pythonize_match, node_cls.__name__,
+                r'[A-Z]+', pythonize_match, node_cls.__name__,
             )
             if pythonic_name != cammelcase_name:
                 walker = getattr(cls, prefix + pythonic_name, None)
diff --git a/test/grammar/pattern_test.py b/test/grammar/pattern_test.py
index 91094fad..c651baf3 100644
--- a/test/grammar/pattern_test.py
+++ b/test/grammar/pattern_test.py
@@ -22,7 +22,7 @@ def test_patterns_with_newlines(self):
 
             blankline
                 =
-                /^[^\\n]*\\n$/
+                /(?m)^[^\\n]*\\n$/
                 ;
         """
 
diff --git a/test/grammar/syntax_test.py b/test/grammar/syntax_test.py
index b59b7bdf..f111a92b 100644
--- a/test/grammar/syntax_test.py
+++ b/test/grammar/syntax_test.py
@@ -352,7 +352,7 @@ def test_parse_hash():
         start = '#' ;
     """
 
-    parser = compile(grammar, eol_comments_re='')
+    parser = compile(grammar, eol_comments='')
     parser.parse('#', trace=True)
 
 
diff --git a/test/parser_equivalence_test.py b/test/parser_equivalence_test.py
index 02b4367f..62c6eaa1 100644
--- a/test/parser_equivalence_test.py
+++ b/test/parser_equivalence_test.py
@@ -171,6 +171,7 @@ def test_none_whitespace():
     output = parser.parse(input, parseinfo=False)
     assert output == ('This is a', ' test')
 
+
 def test_sep_join():
     grammar = """
     @@grammar::numbers
@@ -183,9 +184,7 @@ def test_sep_join():
         = ~ ( "," )%{ digit }+
         ;
 
-    digit = /\d+/ ;
+    digit = /\\d+/ ;
     """
     parser = generate_and_load_parser('W', grammar)
-    ast = parser.parse('1,2,3,4', nameguard=False)
-
-
+    parser.parse('1,2,3,4', nameguard=False)