From 02406677997eaaffbaf708a3b4005604b2839c19 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juancarlo=20A=C3=B1ez?= <apalala@gmail.com>
Date: Fri, 3 Jan 2025 08:34:33 -0400
Subject: [PATCH] =?UTF-8?q?Make=20{eol=5F}comments=5Fre=20read-only=20and?=
 =?UTF-8?q?=20non-init=20arguments=20in=20`ParserCon=E2=80=A6=20(#353)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Deprecate `{eol_}comments_re` in `ParserConfig` (#352)
* [buffering] drop forced multiline match for string patterns

Previously, when scanning for matches to a regex, if the type of the
pattern was `str`, the pattern was always compiled with `re.MULTILINE`.

Recent changes to `ParserConfig` [0] changed the type used for regex
matches in generated code from `str` to `re.Pattern` which could lead to
a difference in behavior from previous versions where a defined comments
or eol_comments may have been implicitly relying on the `re.MULTILINE`
flag.

After discussion [1], it has been determined that usage of `re` flags
within TatSu should be deprecated in favor of users specifying the
necessary flags within patterns.

As such, drop the `re.MULTILINE` flag for strings compiled on the fly.

---------

Co-authored-by: Vincent Fazio <vfazio@gmail.com>
Co-authored-by: Vincent Fazio <vfazio@xes-inc.com>
---
 docs/directives.rst          |  4 ++++
 docs/syntax.rst              |  8 +++----
 grammar/tatsu.ebnf           |  4 ++--
 tatsu/bootstrap.py           | 10 ++++----
 tatsu/buffering.py           |  6 ++---
 tatsu/codegen/python.py      | 16 ++++++-------
 tatsu/infos.py               | 34 +++++++++++++++++++--------
 tatsu/ngcodegen/python.py    |  4 ++--
 test/grammar/pattern_test.py |  2 +-
 test/grammar/syntax_test.py  | 45 +++++++++++++++++++++++++++++++++++-
 10 files changed, 98 insertions(+), 35 deletions(-)
diff --git a/docs/directives.rst b/docs/directives.rst
index 82852984..4a765dfe 100644
--- a/docs/directives.rst
+++ b/docs/directives.rst
@@ -29,6 +29,8 @@ Specifies a regular expression to identify and exclude inline (bracketed) commen
 
     @@comments :: /\(\*((?:.|\n)*?)\*\)/
 
+.. note::
+   Prior to 5.12.1, comments implicitly had the `(?m) <https://docs.python.org/3/library/re.html#re.MULTILINE>`_ option defined. This is no longer the case.
 
 ``@@eol_comments :: <regexp>``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -39,6 +41,8 @@ Specifies a regular expression to identify and exclude end-of-line comments befo
 
     @@eol_comments :: /#([^\n]*?)$/
 
+.. note::
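+   Prior to 5.12.1, eol_comments implicitly had the `(?m) <https://docs.python.org/3/library/re.html#re.MULTILINE>`_ option defined. This is no longer the case: patterns that rely on ``$`` matching at the end of every line must now include ``(?m)`` explicitly, e.g. ``/(?m)#([^\n]*?)$/``.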
 
 ``@@ignorecase :: <bool>``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/docs/syntax.rst b/docs/syntax.rst
index 1a5ad10c..b1dcc603 100644
--- a/docs/syntax.rst
+++ b/docs/syntax.rst
@@ -735,11 +735,11 @@ Comments
 ~~~~~~~~
 
 Parsers will skip over comments specified as a regular expression using
-the ``comments_re`` parameter:
+the ``comments`` parameter:
 
 .. code:: python
 
-   parser = MyParser(text, comments_re="\(\*.*?\*\)")
+   parser = MyParser(text, comments=r"\(\*.*?\*\)")
 
 For more complex comment handling, you can override the
 ``Buffer.eat_comments()`` method.
@@ -751,8 +751,8 @@ comments separately:
 
    parser = MyParser(
        text,
-       comments_re="\(\*.*?\*\)",
-       eol_comments_re="#.*?$"
+       comments=r"\(\*.*?\*\)",
+       eol_comments=r"(?m)#.*?$"
    )
 
 Both patterns may also be specified within a grammar using the
diff --git a/grammar/tatsu.ebnf b/grammar/tatsu.ebnf
index 870caae7..3cb48d51 100644
--- a/grammar/tatsu.ebnf
+++ b/grammar/tatsu.ebnf
@@ -1,7 +1,7 @@
 @@grammar :: TatSu
-@@whitespace :: /\s+/
+@@whitespace :: /(?m)\s+/
 @@comments :: ?"(?sm)[(][*](?:.|\n)*?[*][)]"
-@@eol_comments :: ?"#[^\n]*$"
+@@eol_comments :: ?"(?m)#[^\n]*$"
 @@parseinfo :: True
 @@left_recursion :: False
 
diff --git a/tatsu/bootstrap.py b/tatsu/bootstrap.py
index 4f656b2a..84e5205c 100644
--- a/tatsu/bootstrap.py
+++ b/tatsu/bootstrap.py
@@ -30,13 +30,13 @@ def __init__(self, text, /, config: ParserConfig | None = None, **settings):
         config = ParserConfig.new(
             config,
             owner=self,
-            whitespace='\\s+',
+            whitespace='(?m)\\s+',
             nameguard=None,
             ignorecase=False,
             namechars='',
             parseinfo=True,
-            comments_re='(?sm)[(][*](?:.|\\n)*?[*][)]',
-            eol_comments_re='#[^\\n]*$',
+            comments='(?sm)[(][*](?:.|\\n)*?[*][)]',
+            eol_comments='(?m)#[^\\n]*$',
             keywords=KEYWORDS,
             start='start',
         )
@@ -55,8 +55,8 @@ def __init__(self, /, config: ParserConfig | None = None, **settings):
             ignorecase=False,
             namechars='',
             parseinfo=True,
-            comments_re='(?sm)[(][*](?:.|\\n)*?[*][)]',
-            eol_comments_re='#[^\\n]*$',
+            comments='(?sm)[(][*](?:.|\\n)*?[*][)]',
+            eol_comments='(?m)#[^\\n]*$',
             keywords=KEYWORDS,
             start='start',
         )
diff --git a/tatsu/buffering.py b/tatsu/buffering.py
index bf3a58a7..81f12ba2 100644
--- a/tatsu/buffering.py
+++ b/tatsu/buffering.py
@@ -268,11 +268,11 @@ def eat_whitespace(self):
         return self._eat_regex(self.whitespace_re)
 
     def eat_comments(self):
-        comments = self._eat_regex_list(self.config.comments_re)
+        comments = self._eat_regex_list(self.config.comments)
         self._index_comments(comments, lambda x: x.inline)
 
     def eat_eol_comments(self):
-        comments = self._eat_regex_list(self.config.eol_comments_re)
+        comments = self._eat_regex_list(self.config.eol_comments)
         self._index_comments(comments, lambda x: x.eol)
 
     def next_token(self):
@@ -356,7 +356,7 @@ def _scanre(self, pattern):
         if isinstance(pattern, re.Pattern):
             cre = pattern
         else:
-            cre = re.compile(pattern, re.MULTILINE)
+            cre = re.compile(pattern)
         return cre.match(self.text, self.pos)
 
     @property
diff --git a/tatsu/codegen/python.py b/tatsu/codegen/python.py
index d1d2fdcd..ecd0c390 100755
--- a/tatsu/codegen/python.py
+++ b/tatsu/codegen/python.py
@@ -463,8 +463,8 @@ def render_fields(self, fields):
         left_recursion = self.node.config.left_recursion
         parseinfo = self.node.config.parseinfo
         namechars = repr(self.node.config.namechars or '')
-        comments_re = repr(self.node.config.comments_re)
-        eol_comments_re = repr(self.node.config.eol_comments_re)
+        comments = repr(self.node.config.comments)
+        eol_comments = repr(self.node.config.eol_comments)
 
         rules = '\n'.join(
             [self.get_renderer(rule).render() for rule in self.node.rules],
@@ -489,8 +489,8 @@ def render_fields(self, fields):
             parseinfo=parseinfo,
             keywords=keywords,
             namechars=namechars,
-            comments_re=comments_re,
-            eol_comments_re=eol_comments_re,
+            comments=comments,
+            eol_comments=eol_comments,
         )
 
     abstract_rule_template = """
@@ -536,8 +536,8 @@ def __init__(self, text, /, config: ParserConfig | None = None, **settings):
                             ignorecase={ignorecase},
                             namechars={namechars},
                             parseinfo={parseinfo},
-                            comments_re={comments_re},
-                            eol_comments_re={eol_comments_re},
+                            comments={comments},
+                            eol_comments={eol_comments},
                             keywords=KEYWORDS,
                             start={start!r},
                         )
@@ -555,8 +555,8 @@ def __init__(self, /, config: ParserConfig | None = None, **settings):
                             ignorecase={ignorecase},
                             namechars={namechars},
                             parseinfo={parseinfo},
-                            comments_re={comments_re},
-                            eol_comments_re={eol_comments_re},
+                            comments={comments},
+                            eol_comments={eol_comments},
                             left_recursion={left_recursion},
                             keywords=KEYWORDS,
                             start={start!r},
diff --git a/tatsu/infos.py b/tatsu/infos.py
index 0efb982f..c201dd25 100644
--- a/tatsu/infos.py
+++ b/tatsu/infos.py
@@ -3,7 +3,7 @@
 import copy
 import dataclasses
 import re
-from collections.abc import Callable, Mapping
+from collections.abc import Callable, MutableMapping
 from itertools import starmap
 from typing import Any, NamedTuple
 
@@ -30,8 +30,8 @@ class ParserConfig:
     start_rule: str | None = None  # FIXME
     rule_name: str | None = None  # Backward compatibility
 
-    comments_re: re.Pattern | None = None
-    eol_comments_re: re.Pattern | None = None
+    comments_re: re.Pattern | str | None = None
+    eol_comments_re: re.Pattern | str | None = None
 
     tokenizercls: type[Tokenizer] | None = None  # FIXME
     semantics: type | None = None
@@ -63,10 +63,14 @@ class ParserConfig:
     def __post_init__(self):  # pylint: disable=W0235
         if self.ignorecase:
             self.keywords = [k.upper() for k in self.keywords]
-        if self.comments:
-            self.comments_re = re.compile(self.comments)
-        if self.eol_comments:
-            self.eol_comments_re = re.compile(self.eol_comments)
+
+        if self.comments_re or self.eol_comments_re:
+            raise AttributeError("""\
+                Both `comments_re` and `eol_comments_re` have been removed from parser configuration.
+                Please use `comments` and/or `eol_comments` instead.
+            """)
+        del self.comments_re
+        del self.eol_comments_re
 
     @classmethod
     def new(
@@ -84,7 +88,7 @@ def effective_rule_name(self):
         # note: there are legacy reasons for this mess
         return self.start_rule or self.rule_name or self.start
 
-    def _find_common(self, **settings: Any) -> Mapping[str, Any]:
+    def _find_common(self, **settings: Any) -> MutableMapping[str, Any]:
         return {
             name: value
             for name, value in settings.items()
@@ -101,8 +105,20 @@ def replace_config(
         else:
             return self.replace(**vars(other))
 
+    def _filter_non_init_fields(self, settings: MutableMapping[str, Any]) -> MutableMapping[str, Any]:
+        """Remove non-init dataclass fields from ``settings`` in place and return it.
+
+        Non-init fields cannot be passed to `dataclasses.replace`, yet `vars` and
+        `dataclasses.asdict` return them, so they must be filtered out first.
+        If `ParserConfig` drops these fields, this filter can be removed.
+        """
+        non_init = {f.name for f in dataclasses.fields(self) if not f.init}
+        for name in non_init.intersection(settings):
+            del settings[name]
+        return settings
+
     def replace(self, **settings: Any) -> ParserConfig:
-        overrides = self._find_common(**settings)
+        overrides = self._filter_non_init_fields(self._find_common(**settings))
         result = dataclasses.replace(self, **overrides)
         if 'grammar' in overrides:
             result.name = result.grammar
diff --git a/tatsu/ngcodegen/python.py b/tatsu/ngcodegen/python.py
index 6a83e5c5..76583377 100644
--- a/tatsu/ngcodegen/python.py
+++ b/tatsu/ngcodegen/python.py
@@ -323,8 +323,8 @@ def _gen_init(self, grammar: grammars.Grammar):
                     ignorecase={grammar.config.ignorecase},
                     namechars={grammar.config.namechars!r},
                     parseinfo={grammar.config.parseinfo},
-                    comments_re={grammar.config.comments_re!r},
-                    eol_comments_re={grammar.config.eol_comments_re!r},
+                    comments={grammar.config.comments!r},
+                    eol_comments={grammar.config.eol_comments!r},
                     keywords=KEYWORDS,
                     start={start!r},
                 )
diff --git a/test/grammar/pattern_test.py b/test/grammar/pattern_test.py
index 91094fad..c651baf3 100644
--- a/test/grammar/pattern_test.py
+++ b/test/grammar/pattern_test.py
@@ -22,7 +22,7 @@ def test_patterns_with_newlines(self):
 
             blankline
                 =
-                /^[^\\n]*\\n$/
+                /(?m)^[^\\n]*\\n$/
                 ;
         """
 
diff --git a/test/grammar/syntax_test.py b/test/grammar/syntax_test.py
index b59b7bdf..e63c511d 100644
--- a/test/grammar/syntax_test.py
+++ b/test/grammar/syntax_test.py
@@ -352,7 +352,7 @@ def test_parse_hash():
         start = '#' ;
     """
 
-    parser = compile(grammar, eol_comments_re='')
+    parser = compile(grammar, eol_comments='')
     parser.parse('#', trace=True)
 
 
@@ -377,3 +377,46 @@ def test_no_default_comments():
     """
     with pytest.raises(FailedToken):
         tool.parse(grammar, text)
+
+
+import re
+
+
+@pytest.mark.parametrize(
+    "comment,option",
+    [
+        pytest.param(
+            "# This comment should be stripped",
+            {
+                "eol_comments_re": re.compile(r"(?m)#.*?$"),
+                "eol_comments": r"(?m)#.*?$",
+            },
+            id="eol_comments override",
+        ),
+        pytest.param(
+            "(* This comment should be stripped *)",
+            {
+                "comments_re": re.compile(r"(?sm)[(][*](?:.|\n)*?[*][)]"),
+                "comments": r"(?sm)[(][*](?:.|\n)*?[*][)]",
+            },
+            id="comments override",
+        ),
+    ],
+)
+def test_deprecated_comments_override_failures(comment, option):
+    """Passing the removed {eol_}comments_re settings must raise AttributeError.
+    # TODO: remove this test after {eol_}comments_re are no longer supported
+    """
+    grammar = """
+        @@comments :: /@@@@@@/
+        @@eol_comments :: /@@@@@@/
+
+        start = 'a' $;
+    """
+
+    text = f"""
+        {comment}
+        a
+    """
+    with pytest.raises(AttributeError, match=r"have been removed from parser configuration"):
+        tool.parse(grammar, text, **option)