From 02406677997eaaffbaf708a3b4005604b2839c19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juancarlo=20A=C3=B1ez?= Date: Fri, 3 Jan 2025 08:34:33 -0400 Subject: [PATCH] =?UTF-8?q?Make=20{eol=5F}comments=5Fre=20read-only=20and?= =?UTF-8?q?=20non-init=20arguments=20in=20`ParserCon=E2=80=A6=20(#353)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Deprecate ` {eol_}comments_re` in `ParserConfig` (#352) * [buffering] drop forced multiline match for string patterns Previously, when scanning for matches to a regex, if the type of the pattern was `str`, the pattern was always compiled with `re.MULTILINE`. Recent changes to `ParserConfig` [0] changed the type used for regex matches in generated code from `str` to `re.Pattern` which could lead to a difference in behavior from previous versions where a defined comments or eol_comments may have been implicitly relying on the `re.MULTILINE` flag. After discussion [1], it has been determined that usage of `re` flags within TatSu should be deprecated in favor of users specifying the necessary flags within patterns. As such, drop the `re.MULTILINE` flag for strings compiled on the fly. --------- Co-authored-by: Vincent Fazio Co-authored-by: Vincent Fazio --- docs/directives.rst | 4 ++++ docs/syntax.rst | 8 +++---- grammar/tatsu.ebnf | 4 ++-- tatsu/bootstrap.py | 10 ++++---- tatsu/buffering.py | 6 ++--- tatsu/codegen/python.py | 16 ++++++------- tatsu/infos.py | 34 +++++++++++++++++++-------- tatsu/ngcodegen/python.py | 4 ++-- test/grammar/pattern_test.py | 2 +- test/grammar/syntax_test.py | 45 +++++++++++++++++++++++++++++++++++- 10 files changed, 98 insertions(+), 35 deletions(-) diff --git a/docs/directives.rst b/docs/directives.rst index 82852984..4a765dfe 100644 --- a/docs/directives.rst +++ b/docs/directives.rst @@ -29,6 +29,8 @@ Specifies a regular expression to identify and exclude inline (bracketed) commen @@comments :: /\(\*((?:.|\n)*?)\*\)/ +.. 
note:: + Prior to 5.12.1, comments implicitly had the `(?m) <https://docs.python.org/3/library/re.html#re.MULTILINE>`_ option defined. This is no longer the case. ``@@eol_comments :: `` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -39,6 +41,8 @@ Specifies a regular expression to identify and exclude end-of-line comments befo @@eol_comments :: /#([^\n]*?)$/ +.. note:: + Prior to 5.12.1, eol_comments implicitly had the `(?m) <https://docs.python.org/3/library/re.html#re.MULTILINE>`_ option defined. This is no longer the case. ``@@ignorecase :: `` ~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/syntax.rst b/docs/syntax.rst index 1a5ad10c..b1dcc603 100644 --- a/docs/syntax.rst +++ b/docs/syntax.rst @@ -735,11 +735,11 @@ Comments ~~~~~~~~ Parsers will skip over comments specified as a regular expression using -the ``comments_re`` parameter: +the ``comments`` parameter: .. code:: python - parser = MyParser(text, comments_re="\(\*.*?\*\)") + parser = MyParser(text, comments="\(\*.*?\*\)") For more complex comment handling, you can override the ``Buffer.eat_comments()`` method. @@ -751,8 +751,8 @@ comments separately: parser = MyParser( text, - comments_re="\(\*.*?\*\)", - eol_comments_re="#.*?$" + comments="\(\*.*?\*\)", + eol_comments="#.*?$" ) Both patterns may also be specified within a grammar using the diff --git a/grammar/tatsu.ebnf b/grammar/tatsu.ebnf index 870caae7..3cb48d51 100644 --- a/grammar/tatsu.ebnf +++ b/grammar/tatsu.ebnf @@ -1,7 +1,7 @@ @@grammar :: TatSu -@@whitespace :: /\s+/ +@@whitespace :: /(?m)\s+/ @@comments :: ?"(?sm)[(][*](?:.|\n)*?[*][)]" -@@eol_comments :: ?"#[^\n]*$" +@@eol_comments :: ?"(?m)#[^\n]*$" @@parseinfo :: True @@left_recursion :: False diff --git a/tatsu/bootstrap.py b/tatsu/bootstrap.py index 4f656b2a..84e5205c 100644 --- a/tatsu/bootstrap.py +++ b/tatsu/bootstrap.py @@ -30,13 +30,13 @@ def __init__(self, text, /, config: ParserConfig | None = None, **settings): config = ParserConfig.new( config, owner=self, - whitespace='\\s+', + whitespace='(?m)\\s+', nameguard=None, ignorecase=False, namechars='', parseinfo=True, - 
comments_re='(?sm)[(][*](?:.|\\n)*?[*][)]', - eol_comments_re='#[^\\n]*$', + comments='(?sm)[(][*](?:.|\\n)*?[*][)]', + eol_comments='(?m)#[^\\n]*$', keywords=KEYWORDS, start='start', ) @@ -55,8 +55,8 @@ def __init__(self, /, config: ParserConfig | None = None, **settings): ignorecase=False, namechars='', parseinfo=True, - comments_re='(?sm)[(][*](?:.|\\n)*?[*][)]', - eol_comments_re='#[^\\n]*$', + comments='(?sm)[(][*](?:.|\\n)*?[*][)]', + eol_comments='(?m)#[^\\n]*$', keywords=KEYWORDS, start='start', ) diff --git a/tatsu/buffering.py b/tatsu/buffering.py index bf3a58a7..81f12ba2 100644 --- a/tatsu/buffering.py +++ b/tatsu/buffering.py @@ -268,11 +268,11 @@ def eat_whitespace(self): return self._eat_regex(self.whitespace_re) def eat_comments(self): - comments = self._eat_regex_list(self.config.comments_re) + comments = self._eat_regex_list(self.config.comments) self._index_comments(comments, lambda x: x.inline) def eat_eol_comments(self): - comments = self._eat_regex_list(self.config.eol_comments_re) + comments = self._eat_regex_list(self.config.eol_comments) self._index_comments(comments, lambda x: x.eol) def next_token(self): @@ -356,7 +356,7 @@ def _scanre(self, pattern): if isinstance(pattern, re.Pattern): cre = pattern else: - cre = re.compile(pattern, re.MULTILINE) + cre = re.compile(pattern) return cre.match(self.text, self.pos) @property diff --git a/tatsu/codegen/python.py b/tatsu/codegen/python.py index d1d2fdcd..ecd0c390 100755 --- a/tatsu/codegen/python.py +++ b/tatsu/codegen/python.py @@ -463,8 +463,8 @@ def render_fields(self, fields): left_recursion = self.node.config.left_recursion parseinfo = self.node.config.parseinfo namechars = repr(self.node.config.namechars or '') - comments_re = repr(self.node.config.comments_re) - eol_comments_re = repr(self.node.config.eol_comments_re) + comments = repr(self.node.config.comments) + eol_comments = repr(self.node.config.eol_comments) rules = '\n'.join( [self.get_renderer(rule).render() for rule in 
self.node.rules], @@ -489,8 +489,8 @@ def render_fields(self, fields): parseinfo=parseinfo, keywords=keywords, namechars=namechars, - comments_re=comments_re, - eol_comments_re=eol_comments_re, + comments=comments, + eol_comments=eol_comments, ) abstract_rule_template = """ @@ -536,8 +536,8 @@ def __init__(self, text, /, config: ParserConfig | None = None, **settings): ignorecase={ignorecase}, namechars={namechars}, parseinfo={parseinfo}, - comments_re={comments_re}, - eol_comments_re={eol_comments_re}, + comments={comments}, + eol_comments={eol_comments}, keywords=KEYWORDS, start={start!r}, ) @@ -555,8 +555,8 @@ def __init__(self, /, config: ParserConfig | None = None, **settings): ignorecase={ignorecase}, namechars={namechars}, parseinfo={parseinfo}, - comments_re={comments_re}, - eol_comments_re={eol_comments_re}, + comments={comments}, + eol_comments={eol_comments}, left_recursion={left_recursion}, keywords=KEYWORDS, start={start!r}, diff --git a/tatsu/infos.py b/tatsu/infos.py index 0efb982f..c201dd25 100644 --- a/tatsu/infos.py +++ b/tatsu/infos.py @@ -3,7 +3,7 @@ import copy import dataclasses import re -from collections.abc import Callable, Mapping +from collections.abc import Callable, MutableMapping from itertools import starmap from typing import Any, NamedTuple @@ -30,8 +30,8 @@ class ParserConfig: start_rule: str | None = None # FIXME rule_name: str | None = None # Backward compatibility - comments_re: re.Pattern | None = None - eol_comments_re: re.Pattern | None = None + comments_re: re.Pattern | str | None = None + eol_comments_re: re.Pattern | str | None = None tokenizercls: type[Tokenizer] | None = None # FIXME semantics: type | None = None @@ -63,10 +63,14 @@ class ParserConfig: def __post_init__(self): # pylint: disable=W0235 if self.ignorecase: self.keywords = [k.upper() for k in self.keywords] - if self.comments: - self.comments_re = re.compile(self.comments) - if self.eol_comments: - self.eol_comments_re = re.compile(self.eol_comments) + + if 
self.comments_re or self.eol_comments_re: + raise AttributeError("""\ + Both `comments_re` and `eol_comments_re` have been removed from parser configuration. + Please use `comments` and/or `eol_comments` instead. + """) + del self.comments_re + del self.eol_comments_re @classmethod def new( @@ -84,7 +88,7 @@ def effective_rule_name(self): # note: there are legacy reasons for this mess return self.start_rule or self.rule_name or self.start - def _find_common(self, **settings: Any) -> Mapping[str, Any]: + def _find_common(self, **settings: Any) -> MutableMapping[str, Any]: return { name: value for name, value in settings.items() @@ -101,8 +105,20 @@ def replace_config( else: return self.replace(**vars(other)) + # non-init fields cannot be used as arguments in `replace`, however + # they are values returned by `vars` and `dataclasses.asdict` so they + # must be filtered out. + # If the `ParserConfig` dataclass drops these fields, then this filter can be removed + def _filter_non_init_fields(self, settings: MutableMapping[str, Any]) -> MutableMapping[str, Any]: + for field in [ + field.name for field in dataclasses.fields(self) if not field.init + ]: + if field in settings: + del settings[field] + return settings + def replace(self, **settings: Any) -> ParserConfig: - overrides = self._find_common(**settings) + overrides = self._filter_non_init_fields(self._find_common(**settings)) result = dataclasses.replace(self, **overrides) if 'grammar' in overrides: result.name = result.grammar diff --git a/tatsu/ngcodegen/python.py b/tatsu/ngcodegen/python.py index 6a83e5c5..76583377 100644 --- a/tatsu/ngcodegen/python.py +++ b/tatsu/ngcodegen/python.py @@ -323,8 +323,8 @@ def _gen_init(self, grammar: grammars.Grammar): ignorecase={grammar.config.ignorecase}, namechars={grammar.config.namechars!r}, parseinfo={grammar.config.parseinfo}, - comments_re={grammar.config.comments_re!r}, - eol_comments_re={grammar.config.eol_comments_re!r}, + comments={grammar.config.comments!r}, + 
eol_comments={grammar.config.eol_comments!r}, keywords=KEYWORDS, start={start!r}, ) diff --git a/test/grammar/pattern_test.py b/test/grammar/pattern_test.py index 91094fad..c651baf3 100644 --- a/test/grammar/pattern_test.py +++ b/test/grammar/pattern_test.py @@ -22,7 +22,7 @@ def test_patterns_with_newlines(self): blankline = - /^[^\\n]*\\n$/ + /(?m)^[^\\n]*\\n$/ ; """ diff --git a/test/grammar/syntax_test.py b/test/grammar/syntax_test.py index b59b7bdf..e63c511d 100644 --- a/test/grammar/syntax_test.py +++ b/test/grammar/syntax_test.py @@ -352,7 +352,7 @@ def test_parse_hash(): start = '#' ; """ - parser = compile(grammar, eol_comments_re='') + parser = compile(grammar, eol_comments='') parser.parse('#', trace=True) @@ -377,3 +377,46 @@ def test_no_default_comments(): """ with pytest.raises(FailedToken): tool.parse(grammar, text) + + +import re + + +@pytest.mark.parametrize( + "comment,option", + [ + pytest.param( + "# This comment should be stripped", + { + "eol_comments_re": re.compile(r"(?m)#.*?$"), + "eol_comments": r"(?m)#.*?$", + }, + id="eol_comments override", + ), + pytest.param( + "(* This comment should be stripped *)", + { + "comments_re": re.compile(r"(?sm)[(][*](?:.|\n)*?[*][)]"), + "comments": r"(?sm)[(][*](?:.|\n)*?[*][)]", + }, + id="comments override", + ), + ], +) +def test_deprecated_comments_override_failures(comment, option): + """ + # TODO: remove this test after {eol_}comments_re are no longer supported + """ + grammar = """ + @@comments :: /@@@@@@/ + @@eol_comments :: /@@@@@@/ + + start = 'a' $; + """ + + text = f""" + {comment} + a + """ + with pytest.raises(AttributeError, match=""): + tool.parse(grammar, text, **option)