From 7e4f8f4b8dafbfbb90ca3f9638fa31024c4eb2db Mon Sep 17 00:00:00 2001 From: Vincent Fazio Date: Sun, 29 Dec 2024 09:01:20 -0600 Subject: [PATCH] Make {eol_}comments_re read-only and non-init arguments in `ParserConfig` (#352) * [buffering] drop forced multiline match for string patterns Previously, when scanning for matches to a regex, if the type of the pattern was `str`, the pattern was always compiled with `re.MULTILINE`. Recent changes to `ParserConfig` [0] changed the type used for regex matches in generated code from `str` to `re.Pattern` which could lead to a difference in behavior from previous versions where a defined comments or eol_comments may have been implicitly relying on the `re.MULTILINE` flag. After discussion [1], it has been determined that usage of `re` flags within TatSu should be deprecated in favor of users specifying the necessary flags within patterns. As such, drop the `re.MULTILINE` flag for strings compiled on the fly. [0]: https://github.com/neogeny/TatSu/pull/338 [1]: https://github.com/neogeny/TatSu/issues/351#issuecomment-2563635784 * [grammar] make eol_comments multiline match Make the default eol_comments regex use multiline matching. Recent changes to `ParserConfig` [0] now use a precompiled regex (an `re.Pattern`) instead of compiling the `str` regex on the fly. The `Tokenizer` previously assumed `str` type regexes should all be `re.MULTILINE` regardless of options defined in the regex itself when compiling the pattern. This behavior has since changed to no longer automatically apply and thus requires configurations to specify the option in the pattern. [0]: https://github.com/neogeny/TatSu/pull/338 * [infos] make {eol_}comments_re read-only attributes Previously, the `eol_comments_re` and `comments_re` attributes were public init arguments, were modifiable, and could thus become out of sync with the `eol_comments` and `comments` attributes. 
Also, with recent changes to `ParserConfig` [0], there were two ways to initialize the regex values for comments and eol_comments directives; either via the constructor using the *_re variables or by using the sister string arguments and relying on `__post_init__` to compile the values which trumped the explicit *_re argument values. Now, the constructor interface has been simplified to not take either `eol_comments_re` or `comments_re` as arguments. Callers may only use `eol_comments` and `comments`. The `eol_comments_re` and `comments_re` attributes are still public, but are read-only so they are always a reflection of their sister string values passed into the constructor. [0]: https://github.com/neogeny/TatSu/pull/200 * [codegen] migrate to {eol_}comments * [ngcodegen] migrate to {eol_}comments * [bootstrap] migrate to {eol_}comments * [lint] resolve errors * [docs] note {eol_}comments directive behavior changes * [docs] update syntax to reflect {eol_}comments arguments * [test] fix test_parse_hash to use eol_comments * [test] explicitly use multiline match in test_patterns_with_newlines --- docs/directives.rst | 4 ++++ docs/syntax.rst | 8 ++++---- grammar/tatsu.ebnf | 2 +- tatsu/bootstrap.py | 8 ++++---- tatsu/buffering.py | 2 +- tatsu/codegen/objectmodel.py | 6 +++--- tatsu/codegen/python.py | 16 ++++++++-------- tatsu/g2e/semantics.py | 2 +- tatsu/grammars.py | 2 +- tatsu/infos.py | 34 ++++++++++++++++++++++++++------- tatsu/ngcodegen/python.py | 4 ++-- tatsu/util/_common.py | 2 +- tatsu/walkers.py | 2 +- test/grammar/pattern_test.py | 2 +- test/grammar/syntax_test.py | 2 +- test/parser_equivalence_test.py | 7 +++---- 16 files changed, 63 insertions(+), 40 deletions(-) diff --git a/docs/directives.rst b/docs/directives.rst index 82852984..4a765dfe 100644 --- a/docs/directives.rst +++ b/docs/directives.rst @@ -29,6 +29,8 @@ Specifies a regular expression to identify and exclude inline (bracketed) commen @@comments :: /\(\*((?:.|\n)*?)\*\)/ +.. 
note:: + Prior to 5.12.1, comments implicitly had the `(?m) <https://docs.python.org/3/library/re.html#re.MULTILINE>`_ option defined. This is no longer the case. ``@@eol_comments :: <regexp>`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -39,6 +41,8 @@ Specifies a regular expression to identify and exclude end-of-line comments befo @@eol_comments :: /#([^\n]*?)$/ +.. note:: + Prior to 5.12.1, eol_comments implicitly had the `(?m) <https://docs.python.org/3/library/re.html#re.MULTILINE>`_ option defined. This is no longer the case. ``@@ignorecase :: <bool>`` ~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/syntax.rst b/docs/syntax.rst index 1a5ad10c..b1dcc603 100644 --- a/docs/syntax.rst +++ b/docs/syntax.rst @@ -735,11 +735,11 @@ Comments ~~~~~~~~ Parsers will skip over comments specified as a regular expression using -the ``comments_re`` parameter: +the ``comments`` parameter: .. code:: python - parser = MyParser(text, comments_re="\(\*.*?\*\)") + parser = MyParser(text, comments="\(\*.*?\*\)") For more complex comment handling, you can override the ``Buffer.eat_comments()`` method. @@ -751,8 +751,8 @@ comments separately: parser = MyParser( text, - comments_re="\(\*.*?\*\)", - eol_comments_re="#.*?$" + comments="\(\*.*?\*\)", + eol_comments="#.*?$" ) Both patterns may also be specified within a grammar using the diff --git a/grammar/tatsu.ebnf b/grammar/tatsu.ebnf index 870caae7..b955d6a2 100644 --- a/grammar/tatsu.ebnf +++ b/grammar/tatsu.ebnf @@ -1,7 +1,7 @@ @@grammar :: TatSu @@whitespace :: /\s+/ @@comments :: ?"(?sm)[(][*](?:.|\n)*?[*][)]" -@@eol_comments :: ?"#[^\n]*$" +@@eol_comments :: ?"(?m)#[^\n]*$" @@parseinfo :: True @@left_recursion :: False diff --git a/tatsu/bootstrap.py b/tatsu/bootstrap.py index 4f656b2a..87c925c7 100644 --- a/tatsu/bootstrap.py +++ b/tatsu/bootstrap.py @@ -35,8 +35,8 @@ def __init__(self, text, /, config: ParserConfig | None = None, **settings): ignorecase=False, namechars='', parseinfo=True, - comments_re='(?sm)[(][*](?:.|\\n)*?[*][)]', - eol_comments_re='#[^\\n]*$', + comments='(?sm)[(][*](?:.|\\n)*?[*][)]', + eol_comments='(?m)#[^\\n]*$', keywords=KEYWORDS, 
start='start', ) @@ -55,8 +55,8 @@ def __init__(self, /, config: ParserConfig | None = None, **settings): ignorecase=False, namechars='', parseinfo=True, - comments_re='(?sm)[(][*](?:.|\\n)*?[*][)]', - eol_comments_re='#[^\\n]*$', + comments='(?sm)[(][*](?:.|\\n)*?[*][)]', + eol_comments='(?m)#[^\\n]*$', keywords=KEYWORDS, start='start', ) diff --git a/tatsu/buffering.py b/tatsu/buffering.py index 87358d99..5a2a91fd 100644 --- a/tatsu/buffering.py +++ b/tatsu/buffering.py @@ -357,7 +357,7 @@ def _scanre(self, pattern): if isinstance(pattern, RETYPE): cre = pattern else: - cre = re.compile(pattern, re.MULTILINE) + cre = re.compile(pattern) return cre.match(self.text, self.pos) @property diff --git a/tatsu/codegen/objectmodel.py b/tatsu/codegen/objectmodel.py index d52ea9df..bc787f59 100644 --- a/tatsu/codegen/objectmodel.py +++ b/tatsu/codegen/objectmodel.py @@ -67,11 +67,11 @@ def _get_full_name(cls): # Try to reference the class try: idents = name.split('.') - _cls = getattr(module, idents[0]) + cls_ = getattr(module, idents[0]) for ident in idents[1:]: - _cls = getattr(_cls, ident) + cls_ = getattr(cls_, ident) - assert _cls == cls + assert cls_ == cls except AttributeError as e: raise CodegenError( "Couldn't find base type, it has to be importable", diff --git a/tatsu/codegen/python.py b/tatsu/codegen/python.py index 31e0dea9..f25e1d8c 100755 --- a/tatsu/codegen/python.py +++ b/tatsu/codegen/python.py @@ -462,8 +462,8 @@ def render_fields(self, fields): left_recursion = self.node.config.left_recursion parseinfo = self.node.config.parseinfo namechars = repr(self.node.config.namechars or '') - comments_re = repr(self.node.config.comments_re) - eol_comments_re = repr(self.node.config.eol_comments_re) + comments = repr(self.node.config.comments) + eol_comments = repr(self.node.config.eol_comments) rules = '\n'.join( [self.get_renderer(rule).render() for rule in self.node.rules], @@ -488,8 +488,8 @@ def render_fields(self, fields): parseinfo=parseinfo, 
keywords=keywords, namechars=namechars, - comments_re=comments_re, - eol_comments_re=eol_comments_re, + comments=comments, + eol_comments=eol_comments, ) abstract_rule_template = """ @@ -535,8 +535,8 @@ def __init__(self, text, /, config: ParserConfig | None = None, **settings): ignorecase={ignorecase}, namechars={namechars}, parseinfo={parseinfo}, - comments_re={comments_re}, - eol_comments_re={eol_comments_re}, + comments={comments}, + eol_comments={eol_comments}, keywords=KEYWORDS, start={start!r}, ) @@ -554,8 +554,8 @@ def __init__(self, /, config: ParserConfig | None = None, **settings): ignorecase={ignorecase}, namechars={namechars}, parseinfo={parseinfo}, - comments_re={comments_re}, - eol_comments_re={eol_comments_re}, + comments={comments}, + eol_comments={eol_comments}, left_recursion={left_recursion}, keywords=KEYWORDS, start={start!r}, diff --git a/tatsu/g2e/semantics.py b/tatsu/g2e/semantics.py index 982ed777..ccf0b497 100644 --- a/tatsu/g2e/semantics.py +++ b/tatsu/g2e/semantics.py @@ -9,7 +9,7 @@ def camel2py(name): return re.sub( - '([a-z0-9])([A-Z])', + r'([a-z0-9])([A-Z])', lambda m: m.group(1) + '_' + m.group(2).lower(), name, ) diff --git a/tatsu/grammars.py b/tatsu/grammars.py index 65def8b9..66f2173b 100644 --- a/tatsu/grammars.py +++ b/tatsu/grammars.py @@ -519,7 +519,7 @@ def _to_str(self, lean=False): if multi: return '\n|\n'.join(indent(o) for o in options) - elif len(options) and len(single) > PEP8_LLEN: + elif options and len(single) > PEP8_LLEN: return '| ' + '\n| '.join(o for o in options) else: return single diff --git a/tatsu/infos.py b/tatsu/infos.py index 6ec898ad..3bba14f2 100644 --- a/tatsu/infos.py +++ b/tatsu/infos.py @@ -3,7 +3,7 @@ import copy import dataclasses import re -from collections.abc import Callable, Mapping +from collections.abc import Callable, MutableMapping from itertools import starmap from typing import Any, NamedTuple @@ -30,8 +30,8 @@ class ParserConfig: start_rule: str | None = None # FIXME rule_name: str | 
None = None # Backward compatibility - comments_re: re.Pattern | None = None - eol_comments_re: re.Pattern | None = None + _comments_re: re.Pattern | None = dataclasses.field(default=None, init=False, repr=False) + _eol_comments_re: re.Pattern | None = dataclasses.field(default=None, init=False, repr=False) tokenizercls: type[Tokenizer] | None = None # FIXME semantics: type | None = None @@ -64,9 +64,17 @@ def __post_init__(self): # pylint: disable=W0235 if self.ignorecase: self.keywords = [k.upper() for k in self.keywords] if self.comments: - self.comments_re = re.compile(self.comments) + self._comments_re = re.compile(self.comments) if self.eol_comments: - self.eol_comments_re = re.compile(self.eol_comments) + self._eol_comments_re = re.compile(self.eol_comments) + + @property + def comments_re(self) -> re.Pattern | None: + return self._comments_re + + @property + def eol_comments_re(self) -> re.Pattern | None: + return self._eol_comments_re @classmethod def new( @@ -84,7 +92,7 @@ def effective_rule_name(self): # note: there are legacy reasons for this mess return self.start_rule or self.rule_name or self.start - def _find_common(self, **settings: Any) -> Mapping[str, Any]: + def _find_common(self, **settings: Any) -> MutableMapping[str, Any]: return { name: value for name, value in settings.items() @@ -101,8 +109,20 @@ def replace_config( else: return self.replace(**vars(other)) + # non-init fields cannot be used as arguments in `replace`, however + # they are values returned by `vars` and `dataclass.asdict` so they + # must be filtered out. 
+ # If the `ParserConfig` dataclass drops these fields, then this filter can be removed + def _filter_non_init_fields(self, settings: MutableMapping[str, Any]) -> MutableMapping[str, Any]: + for field in [ + field.name for field in dataclasses.fields(self) if not field.init + ]: + if field in settings: + del settings[field] + return settings + def replace(self, **settings: Any) -> ParserConfig: - overrides = self._find_common(**settings) + overrides = self._filter_non_init_fields(self._find_common(**settings)) result = dataclasses.replace(self, **overrides) if 'grammar' in overrides: result.name = result.grammar diff --git a/tatsu/ngcodegen/python.py b/tatsu/ngcodegen/python.py index 6a83e5c5..76583377 100644 --- a/tatsu/ngcodegen/python.py +++ b/tatsu/ngcodegen/python.py @@ -323,8 +323,8 @@ def _gen_init(self, grammar: grammars.Grammar): ignorecase={grammar.config.ignorecase}, namechars={grammar.config.namechars!r}, parseinfo={grammar.config.parseinfo}, - comments_re={grammar.config.comments_re!r}, - eol_comments_re={grammar.config.eol_comments_re!r}, + comments={grammar.config.comments!r}, + eol_comments={grammar.config.eol_comments!r}, keywords=KEYWORDS, start={start!r}, ) diff --git a/tatsu/util/_common.py b/tatsu/util/_common.py index c0819064..1123e9fb 100644 --- a/tatsu/util/_common.py +++ b/tatsu/util/_common.py @@ -27,7 +27,7 @@ logger.addHandler(ch) -RETYPE = type(re.compile('.')) +RETYPE = re.Pattern ESCAPE_SEQUENCE_RE = re.compile( diff --git a/tatsu/walkers.py b/tatsu/walkers.py index 3de070ea..7762a4d7 100644 --- a/tatsu/walkers.py +++ b/tatsu/walkers.py @@ -74,7 +74,7 @@ def pythonize_match(m): # walk__pythonic_name with double underscore after walk pythonic_name = re.sub( - '[A-Z]+', pythonize_match, node_cls.__name__, + r'[A-Z]+', pythonize_match, node_cls.__name__, ) if pythonic_name != cammelcase_name: walker = getattr(cls, prefix + pythonic_name, None) diff --git a/test/grammar/pattern_test.py b/test/grammar/pattern_test.py index 
91094fad..c651baf3 100644 --- a/test/grammar/pattern_test.py +++ b/test/grammar/pattern_test.py @@ -22,7 +22,7 @@ def test_patterns_with_newlines(self): blankline = - /^[^\\n]*\\n$/ + /(?m)^[^\\n]*\\n$/ ; """ diff --git a/test/grammar/syntax_test.py b/test/grammar/syntax_test.py index b59b7bdf..f111a92b 100644 --- a/test/grammar/syntax_test.py +++ b/test/grammar/syntax_test.py @@ -352,7 +352,7 @@ def test_parse_hash(): start = '#' ; """ - parser = compile(grammar, eol_comments_re='') + parser = compile(grammar, eol_comments='') parser.parse('#', trace=True) diff --git a/test/parser_equivalence_test.py b/test/parser_equivalence_test.py index 02b4367f..62c6eaa1 100644 --- a/test/parser_equivalence_test.py +++ b/test/parser_equivalence_test.py @@ -171,6 +171,7 @@ def test_none_whitespace(): output = parser.parse(input, parseinfo=False) assert output == ('This is a', ' test') + def test_sep_join(): grammar = """ @@grammar::numbers @@ -183,9 +184,7 @@ def test_sep_join(): = ~ ( "," )%{ digit }+ ; - digit = /\d+/ ; + digit = /\\d+/ ; """ parser = generate_and_load_parser('W', grammar) - ast = parser.parse('1,2,3,4', nameguard=False) - - + parser.parse('1,2,3,4', nameguard=False)