From bd1c0a42eab54fa295d4817cad74c89078425116 Mon Sep 17 00:00:00 2001 From: Vincent Fazio Date: Fri, 27 Dec 2024 09:54:16 -0600 Subject: [PATCH 01/11] [buffering] drop forced multiline match for string patterns Previously, when scanning for matches to a regex, if the type of the pattern was `str`, the pattern was always compiled with `re.MULTILINE`. Recent changes to `ParserConfig` [0] changed the type used for regex matches in generated code from `str` to `re.Pattern` which could lead to a difference in behavior from previous versions where a defined comments or eol_comments may have been implicitly relying on the `re.MULTILINE` flag. After discussion [1], it has been determined that usage of `re` flags within TatSu should be deprecated in favor of users specifying the necessary flags within patterns. As such, drop the `re.MULTILINE` flag for strings compiled on the fly. [0]: https://github.com/neogeny/TatSu/pull/338 [1]: https://github.com/neogeny/TatSu/issues/351#issuecomment-2563635784 --- tatsu/buffering.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tatsu/buffering.py b/tatsu/buffering.py index 87358d99..5a2a91fd 100644 --- a/tatsu/buffering.py +++ b/tatsu/buffering.py @@ -357,7 +357,7 @@ def _scanre(self, pattern): if isinstance(pattern, RETYPE): cre = pattern else: - cre = re.compile(pattern, re.MULTILINE) + cre = re.compile(pattern) return cre.match(self.text, self.pos) @property From 9ba28d5fd1977ab00beb004f9133f805492261f7 Mon Sep 17 00:00:00 2001 From: Vincent Fazio Date: Fri, 27 Dec 2024 09:53:02 -0600 Subject: [PATCH 02/11] [grammar] make eol_comments multiline match Make the default eol_comments regex use multiline matching. Recent changes to `ParserConfig` [0] now use a precompiled regex (an `re.Pattern`) instead of compiling the `str` regex on the fly. The `Tokenizer` previously assumed `str` type regexes should all be `re.MULTILINE` regardless of options defined in the regex itself when compiling the pattern. 
This behavior has since changed to no longer automatically apply and thus requires configurations to specify the option in the pattern. [0]: https://github.com/neogeny/TatSu/pull/338 --- grammar/tatsu.ebnf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grammar/tatsu.ebnf b/grammar/tatsu.ebnf index 870caae7..b955d6a2 100644 --- a/grammar/tatsu.ebnf +++ b/grammar/tatsu.ebnf @@ -1,7 +1,7 @@ @@grammar :: TatSu @@whitespace :: /\s+/ @@comments :: ?"(?sm)[(][*](?:.|\n)*?[*][)]" -@@eol_comments :: ?"#[^\n]*$" +@@eol_comments :: ?"(?m)#[^\n]*$" @@parseinfo :: True @@left_recursion :: False From adbc2f269c8eeae7bcd8291a633ff6a16e59543b Mon Sep 17 00:00:00 2001 From: Vincent Fazio Date: Fri, 27 Dec 2024 13:49:30 -0600 Subject: [PATCH 03/11] [infos] make {eol_}comments_re read-only attributes Previously, the `eol_comments_re` and `comments_re` attributes were public init arguments, were modifiable, and could thus become out of sync with the `eol_comments` and `comments` attributes. Also, with recent changes to `ParserConfig` [0], there were two ways to initialize the regex values for comments and eol_comments directives; either via the constructor using the *_re variables or by using the sister string arguments and relying on `__post_init__` to compile the values which trumped the explicit *_re argument values. Now, the constructor interface has been simplified to not take either `eol_comments_re` or `comments_re` as arguments. Callers may only use `eol_comments` and `comments`. The `eol_comments_re` and `comments_re` attributes are still public, but are read-only so they are always a reflection of their sister string values passed into the constructor. 
[0]: https://github.com/neogeny/TatSu/pull/200 --- tatsu/infos.py | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/tatsu/infos.py b/tatsu/infos.py index 6ec898ad..3bba14f2 100644 --- a/tatsu/infos.py +++ b/tatsu/infos.py @@ -3,7 +3,7 @@ import copy import dataclasses import re -from collections.abc import Callable, Mapping +from collections.abc import Callable, MutableMapping from itertools import starmap from typing import Any, NamedTuple @@ -30,8 +30,8 @@ class ParserConfig: start_rule: str | None = None # FIXME rule_name: str | None = None # Backward compatibility - comments_re: re.Pattern | None = None - eol_comments_re: re.Pattern | None = None + _comments_re: re.Pattern | None = dataclasses.field(default=None, init=False, repr=False) + _eol_comments_re: re.Pattern | None = dataclasses.field(default=None, init=False, repr=False) tokenizercls: type[Tokenizer] | None = None # FIXME semantics: type | None = None @@ -64,9 +64,17 @@ def __post_init__(self): # pylint: disable=W0235 if self.ignorecase: self.keywords = [k.upper() for k in self.keywords] if self.comments: - self.comments_re = re.compile(self.comments) + self._comments_re = re.compile(self.comments) if self.eol_comments: - self.eol_comments_re = re.compile(self.eol_comments) + self._eol_comments_re = re.compile(self.eol_comments) + + @property + def comments_re(self) -> re.Pattern | None: + return self._comments_re + + @property + def eol_comments_re(self) -> re.Pattern | None: + return self._eol_comments_re @classmethod def new( @@ -84,7 +92,7 @@ def effective_rule_name(self): # note: there are legacy reasons for this mess return self.start_rule or self.rule_name or self.start - def _find_common(self, **settings: Any) -> Mapping[str, Any]: + def _find_common(self, **settings: Any) -> MutableMapping[str, Any]: return { name: value for name, value in settings.items() @@ -101,8 +109,20 @@ def replace_config( else: return self.replace(**vars(other)) + # 
non-init fields cannot be used as arguments in `replace`, however + # they are values returned by `vars` and `dataclass.asdict` so they + # must be filtered out. + # If the `ParserConfig` dataclass drops these fields, then this filter can be removed + def _filter_non_init_fields(self, settings: MutableMapping[str, Any]) -> MutableMapping[str, Any]: + for field in [ + field.name for field in dataclasses.fields(self) if not field.init + ]: + if field in settings: + del settings[field] + return settings + def replace(self, **settings: Any) -> ParserConfig: - overrides = self._find_common(**settings) + overrides = self._filter_non_init_fields(self._find_common(**settings)) result = dataclasses.replace(self, **overrides) if 'grammar' in overrides: result.name = result.grammar From 4bdd4a54124de6164ad4164658856180473b796d Mon Sep 17 00:00:00 2001 From: Vincent Fazio Date: Fri, 27 Dec 2024 14:39:14 -0600 Subject: [PATCH 04/11] [codegen] migrate to {eol_}comments --- tatsu/codegen/python.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tatsu/codegen/python.py b/tatsu/codegen/python.py index 31e0dea9..f25e1d8c 100755 --- a/tatsu/codegen/python.py +++ b/tatsu/codegen/python.py @@ -462,8 +462,8 @@ def render_fields(self, fields): left_recursion = self.node.config.left_recursion parseinfo = self.node.config.parseinfo namechars = repr(self.node.config.namechars or '') - comments_re = repr(self.node.config.comments_re) - eol_comments_re = repr(self.node.config.eol_comments_re) + comments = repr(self.node.config.comments) + eol_comments = repr(self.node.config.eol_comments) rules = '\n'.join( [self.get_renderer(rule).render() for rule in self.node.rules], @@ -488,8 +488,8 @@ def render_fields(self, fields): parseinfo=parseinfo, keywords=keywords, namechars=namechars, - comments_re=comments_re, - eol_comments_re=eol_comments_re, + comments=comments, + eol_comments=eol_comments, ) abstract_rule_template = """ @@ -535,8 +535,8 @@ def 
__init__(self, text, /, config: ParserConfig | None = None, **settings): ignorecase={ignorecase}, namechars={namechars}, parseinfo={parseinfo}, - comments_re={comments_re}, - eol_comments_re={eol_comments_re}, + comments={comments}, + eol_comments={eol_comments}, keywords=KEYWORDS, start={start!r}, ) @@ -554,8 +554,8 @@ def __init__(self, /, config: ParserConfig | None = None, **settings): ignorecase={ignorecase}, namechars={namechars}, parseinfo={parseinfo}, - comments_re={comments_re}, - eol_comments_re={eol_comments_re}, + comments={comments}, + eol_comments={eol_comments}, left_recursion={left_recursion}, keywords=KEYWORDS, start={start!r}, From 42cb810440e7f1fe062d942d50f9871cb48cb4bf Mon Sep 17 00:00:00 2001 From: Vincent Fazio Date: Sat, 28 Dec 2024 18:04:39 -0600 Subject: [PATCH 05/11] [ngcodegen] migrate to {eol_}comments --- tatsu/ngcodegen/python.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tatsu/ngcodegen/python.py b/tatsu/ngcodegen/python.py index 6a83e5c5..76583377 100644 --- a/tatsu/ngcodegen/python.py +++ b/tatsu/ngcodegen/python.py @@ -323,8 +323,8 @@ def _gen_init(self, grammar: grammars.Grammar): ignorecase={grammar.config.ignorecase}, namechars={grammar.config.namechars!r}, parseinfo={grammar.config.parseinfo}, - comments_re={grammar.config.comments_re!r}, - eol_comments_re={grammar.config.eol_comments_re!r}, + comments={grammar.config.comments!r}, + eol_comments={grammar.config.eol_comments!r}, keywords=KEYWORDS, start={start!r}, ) From 9160c08638dea336783106fcdca6b232459afcf2 Mon Sep 17 00:00:00 2001 From: Vincent Fazio Date: Fri, 27 Dec 2024 09:53:19 -0600 Subject: [PATCH 06/11] [bootstrap] migrate to {eol_}comments --- tatsu/bootstrap.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tatsu/bootstrap.py b/tatsu/bootstrap.py index 4f656b2a..87c925c7 100644 --- a/tatsu/bootstrap.py +++ b/tatsu/bootstrap.py @@ -35,8 +35,8 @@ def __init__(self, text, /, config: ParserConfig | None = None, 
**settings): ignorecase=False, namechars='', parseinfo=True, - comments_re='(?sm)[(][*](?:.|\\n)*?[*][)]', - eol_comments_re='#[^\\n]*$', + comments='(?sm)[(][*](?:.|\\n)*?[*][)]', + eol_comments='(?m)#[^\\n]*$', keywords=KEYWORDS, start='start', ) @@ -55,8 +55,8 @@ def __init__(self, /, config: ParserConfig | None = None, **settings): ignorecase=False, namechars='', parseinfo=True, - comments_re='(?sm)[(][*](?:.|\\n)*?[*][)]', - eol_comments_re='#[^\\n]*$', + comments='(?sm)[(][*](?:.|\\n)*?[*][)]', + eol_comments='(?m)#[^\\n]*$', keywords=KEYWORDS, start='start', ) From 03d4b7ff778272000b869d3fdef31b17d3903796 Mon Sep 17 00:00:00 2001 From: Vincent Fazio Date: Sat, 28 Dec 2024 15:16:10 -0600 Subject: [PATCH 07/11] [lint] resolve errors --- tatsu/codegen/objectmodel.py | 6 +++--- tatsu/g2e/semantics.py | 2 +- tatsu/grammars.py | 2 +- tatsu/util/_common.py | 2 +- tatsu/walkers.py | 2 +- test/parser_equivalence_test.py | 7 +++---- 6 files changed, 10 insertions(+), 11 deletions(-) diff --git a/tatsu/codegen/objectmodel.py b/tatsu/codegen/objectmodel.py index d52ea9df..bc787f59 100644 --- a/tatsu/codegen/objectmodel.py +++ b/tatsu/codegen/objectmodel.py @@ -67,11 +67,11 @@ def _get_full_name(cls): # Try to reference the class try: idents = name.split('.') - _cls = getattr(module, idents[0]) + cls_ = getattr(module, idents[0]) for ident in idents[1:]: - _cls = getattr(_cls, ident) + cls_ = getattr(cls_, ident) - assert _cls == cls + assert cls_ == cls except AttributeError as e: raise CodegenError( "Couldn't find base type, it has to be importable", diff --git a/tatsu/g2e/semantics.py b/tatsu/g2e/semantics.py index 982ed777..ccf0b497 100644 --- a/tatsu/g2e/semantics.py +++ b/tatsu/g2e/semantics.py @@ -9,7 +9,7 @@ def camel2py(name): return re.sub( - '([a-z0-9])([A-Z])', + r'([a-z0-9])([A-Z])', lambda m: m.group(1) + '_' + m.group(2).lower(), name, ) diff --git a/tatsu/grammars.py b/tatsu/grammars.py index 65def8b9..66f2173b 100644 --- a/tatsu/grammars.py +++ 
b/tatsu/grammars.py @@ -519,7 +519,7 @@ def _to_str(self, lean=False): if multi: return '\n|\n'.join(indent(o) for o in options) - elif len(options) and len(single) > PEP8_LLEN: + elif options and len(single) > PEP8_LLEN: return '| ' + '\n| '.join(o for o in options) else: return single diff --git a/tatsu/util/_common.py b/tatsu/util/_common.py index c0819064..1123e9fb 100644 --- a/tatsu/util/_common.py +++ b/tatsu/util/_common.py @@ -27,7 +27,7 @@ logger.addHandler(ch) -RETYPE = type(re.compile('.')) +RETYPE = re.Pattern ESCAPE_SEQUENCE_RE = re.compile( diff --git a/tatsu/walkers.py b/tatsu/walkers.py index 3de070ea..7762a4d7 100644 --- a/tatsu/walkers.py +++ b/tatsu/walkers.py @@ -74,7 +74,7 @@ def pythonize_match(m): # walk__pythonic_name with double underscore after walk pythonic_name = re.sub( - '[A-Z]+', pythonize_match, node_cls.__name__, + r'[A-Z]+', pythonize_match, node_cls.__name__, ) if pythonic_name != cammelcase_name: walker = getattr(cls, prefix + pythonic_name, None) diff --git a/test/parser_equivalence_test.py b/test/parser_equivalence_test.py index 02b4367f..62c6eaa1 100644 --- a/test/parser_equivalence_test.py +++ b/test/parser_equivalence_test.py @@ -171,6 +171,7 @@ def test_none_whitespace(): output = parser.parse(input, parseinfo=False) assert output == ('This is a', ' test') + def test_sep_join(): grammar = """ @@grammar::numbers @@ -183,9 +184,7 @@ def test_sep_join(): = ~ ( "," )%{ digit }+ ; - digit = /\d+/ ; + digit = /\\d+/ ; """ parser = generate_and_load_parser('W', grammar) - ast = parser.parse('1,2,3,4', nameguard=False) - - + parser.parse('1,2,3,4', nameguard=False) From 923a67833b546a71fc967877515b134b69a8dd1e Mon Sep 17 00:00:00 2001 From: Vincent Fazio Date: Sat, 28 Dec 2024 17:54:23 -0600 Subject: [PATCH 08/11] [docs] note {eol_}comments directive behavior changes --- docs/directives.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/directives.rst b/docs/directives.rst index 82852984..4a765dfe 100644 --- 
a/docs/directives.rst +++ b/docs/directives.rst @@ -29,6 +29,8 @@ Specifies a regular expression to identify and exclude inline (bracketed) commen @@comments :: /\(\*((?:.|\n)*?)\*\)/ +.. note:: + Prior to 5.12.1, comments implicitly had the `(?m) <https://docs.python.org/3/library/re.html#re.MULTILINE>`_ option defined. This is no longer the case. ``@@eol_comments :: <regexp>`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -39,6 +41,8 @@ Specifies a regular expression to identify and exclude end-of-line comments befo @@eol_comments :: /#([^\n]*?)$/ +.. note:: + Prior to 5.12.1, eol_comments implicitly had the `(?m) <https://docs.python.org/3/library/re.html#re.MULTILINE>`_ option defined. This is no longer the case. ``@@ignorecase :: <bool>`` ~~~~~~~~~~~~~~~~~~~~~~~~~~ From ddbe27f46c9364f2ebe363ad06ada964cfdf6ea3 Mon Sep 17 00:00:00 2001 From: Vincent Fazio Date: Sat, 28 Dec 2024 17:54:57 -0600 Subject: [PATCH 09/11] [docs] update syntax to reflect {eol_}comments arguments --- docs/syntax.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/syntax.rst b/docs/syntax.rst index 1a5ad10c..b1dcc603 100644 --- a/docs/syntax.rst +++ b/docs/syntax.rst @@ -735,11 +735,11 @@ Comments ~~~~~~~~ Parsers will skip over comments specified as a regular expression using -the ``comments_re`` parameter: +the ``comments`` parameter: .. code:: python - parser = MyParser(text, comments_re="\(\*.*?\*\)") + parser = MyParser(text, comments="\(\*.*?\*\)") For more complex comment handling, you can override the ``Buffer.eat_comments()`` method. 
@@ -751,8 +751,8 @@ comments separately: parser = MyParser( text, - comments_re="\(\*.*?\*\)", - eol_comments_re="#.*?$" + comments="\(\*.*?\*\)", + eol_comments="#.*?$" ) Both patterns may also be specified within a grammar using the From dacb978a25d2563a763dc0dcd10e591823fbe191 Mon Sep 17 00:00:00 2001 From: Vincent Fazio Date: Sat, 28 Dec 2024 18:12:56 -0600 Subject: [PATCH 10/11] [test] fix test_parse_hash to use eol_comments --- test/grammar/syntax_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/grammar/syntax_test.py b/test/grammar/syntax_test.py index b59b7bdf..f111a92b 100644 --- a/test/grammar/syntax_test.py +++ b/test/grammar/syntax_test.py @@ -352,7 +352,7 @@ def test_parse_hash(): start = '#' ; """ - parser = compile(grammar, eol_comments_re='') + parser = compile(grammar, eol_comments='') parser.parse('#', trace=True) From fdad7932744dd91502d871870ac390ba74272026 Mon Sep 17 00:00:00 2001 From: Vincent Fazio Date: Sat, 28 Dec 2024 18:16:12 -0600 Subject: [PATCH 11/11] [test] explicitly use multiline match in test_patterns_with_newlines --- test/grammar/pattern_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/grammar/pattern_test.py b/test/grammar/pattern_test.py index 91094fad..c651baf3 100644 --- a/test/grammar/pattern_test.py +++ b/test/grammar/pattern_test.py @@ -22,7 +22,7 @@ def test_patterns_with_newlines(self): blankline = - /^[^\\n]*\\n$/ + /(?m)^[^\\n]*\\n$/ ; """