Make {eol_}comments_re read-only and non-init arguments in `ParserConfig` (#353)

* Deprecate `{eol_}comments_re` in `ParserConfig` (#352)
* [buffering] drop forced multiline match for string patterns

Previously, when scanning for matches to a regex, if the type of the
pattern was `str`, the pattern was always compiled with `re.MULTILINE`.

Recent changes to `ParserConfig` [0] changed the type used for regex
matches in generated code from `str` to `re.Pattern`, which could lead
to a difference in behavior from previous versions, where a defined
`comments` or `eol_comments` pattern may have been implicitly relying
on the `re.MULTILINE` flag.

After discussion [1], it has been determined that usage of `re` flags
within TatSu should be deprecated in favor of users specifying the
necessary flags within patterns.

As such, drop the `re.MULTILINE` flag for strings compiled on the fly.
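
For grammars and callers that relied on the implicit flag, the migration is to
move the flag into the pattern itself. A minimal sketch using the public
`tatsu.compile()` entry point (the tiny grammar and the comment pattern are
only illustrative):

    import tatsu

    grammar = """
        @@grammar :: Demo
        start = 'a' $ ;
    """

    # Before 5.12.1 the setting was `eol_comments_re="#[^\n]*$"` and
    # re.MULTILINE was added implicitly; now the argument is renamed and
    # the flag is written inline in the pattern.
    parser = tatsu.compile(grammar, eol_comments=r"(?m)#[^\n]*$")
    print(parser.parse("a  # stripped as an end-of-line comment"))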

---------

Co-authored-by: Vincent Fazio <[email protected]>
Co-authored-by: Vincent Fazio <[email protected]>
3 people authored Jan 3, 2025
1 parent 1d994c5 commit 0240667
Showing 10 changed files with 98 additions and 35 deletions.
4 changes: 4 additions & 0 deletions docs/directives.rst
@@ -29,6 +29,8 @@ Specifies a regular expression to identify and exclude inline (bracketed) commen
@@comments :: /\(\*((?:.|\n)*?)\*\)/
.. note::
Prior to 5.12.1, comments implicitly had the `(?m) <https://docs.python.org/3/library/re.html#re.MULTILINE>`_ option defined. This is no longer the case.

``@@eol_comments :: <regexp>``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -39,6 +41,8 @@ Specifies a regular expression to identify and exclude end-of-line comments befo
@@eol_comments :: /#([^\n]*?)$/
.. note::
Prior to 5.12.1, eol_comments implicitly had the `(?m) <https://docs.python.org/3/library/re.html#re.MULTILINE>`_ option defined. This is no longer the case.

``@@ignorecase :: <bool>``
~~~~~~~~~~~~~~~~~~~~~~~~~~
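
The notes added above amount to this: any flag a comment pattern needs, such as
`(?m)` so that `$` anchors at each end of line, must now appear in the directive
itself. A hedged sketch of a grammar carrying the flag inline (the grammar is
only illustrative):

    import tatsu

    grammar = r"""
        @@grammar :: Demo
        @@eol_comments :: /(?m)#[^\n]*$/
        start = 'a' $ ;
    """
    parser = tatsu.compile(grammar)
    print(parser.parse("a  # consumed as an eol comment"))
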
8 changes: 4 additions & 4 deletions docs/syntax.rst
@@ -735,11 +735,11 @@ Comments
~~~~~~~~

Parsers will skip over comments specified as a regular expression using
-the ``comments_re`` parameter:
+the ``comments`` parameter:

.. code:: python
-parser = MyParser(text, comments_re="\(\*.*?\*\)")
+parser = MyParser(text, comments="\(\*.*?\*\)")
For more complex comment handling, you can override the
``Buffer.eat_comments()`` method.
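
For such overrides, a hedged sketch of a ``Buffer`` subclass (only the override
itself is shown; how the custom buffer is wired into a parser depends on the
application):

    from tatsu.buffering import Buffer

    class LoggingBuffer(Buffer):
        def eat_comments(self):
            # Let the configured `comments` pattern do the actual stripping,
            # then add any extra handling (here: just report the span skipped).
            before = self.pos
            super().eat_comments()
            if self.pos != before:
                print(f"skipped inline comment(s) at {before}..{self.pos}")
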
@@ -751,8 +751,8 @@ comments separately:
parser = MyParser(
text,
-comments_re="\(\*.*?\*\)",
-eol_comments_re="#.*?$"
+comments="\(\*.*?\*\)",
+eol_comments="#.*?$"
)
Both patterns may also be specified within a grammar using the
4 changes: 2 additions & 2 deletions grammar/tatsu.ebnf
@@ -1,7 +1,7 @@
@@grammar :: TatSu
-@@whitespace :: /\s+/
+@@whitespace :: /(?m)\s+/
@@comments :: ?"(?sm)[(][*](?:.|\n)*?[*][)]"
-@@eol_comments :: ?"#[^\n]*$"
+@@eol_comments :: ?"(?m)#[^\n]*$"
@@parseinfo :: True
@@left_recursion :: False
10 changes: 5 additions & 5 deletions tatsu/bootstrap.py
@@ -30,13 +30,13 @@ def __init__(self, text, /, config: ParserConfig | None = None, **settings):
config = ParserConfig.new(
config,
owner=self,
-whitespace='\\s+',
+whitespace='(?m)\\s+',
nameguard=None,
ignorecase=False,
namechars='',
parseinfo=True,
-comments_re='(?sm)[(][*](?:.|\\n)*?[*][)]',
-eol_comments_re='#[^\\n]*$',
+comments='(?sm)[(][*](?:.|\\n)*?[*][)]',
+eol_comments='(?m)#[^\\n]*$',
keywords=KEYWORDS,
start='start',
)
@@ -55,8 +55,8 @@ def __init__(self, /, config: ParserConfig | None = None, **settings):
ignorecase=False,
namechars='',
parseinfo=True,
-comments_re='(?sm)[(][*](?:.|\\n)*?[*][)]',
-eol_comments_re='#[^\\n]*$',
+comments='(?sm)[(][*](?:.|\\n)*?[*][)]',
+eol_comments='(?m)#[^\\n]*$',
keywords=KEYWORDS,
start='start',
)
6 changes: 3 additions & 3 deletions tatsu/buffering.py
@@ -268,11 +268,11 @@ def eat_whitespace(self):
return self._eat_regex(self.whitespace_re)

def eat_comments(self):
-comments = self._eat_regex_list(self.config.comments_re)
+comments = self._eat_regex_list(self.config.comments)
self._index_comments(comments, lambda x: x.inline)

def eat_eol_comments(self):
-comments = self._eat_regex_list(self.config.eol_comments_re)
+comments = self._eat_regex_list(self.config.eol_comments)
self._index_comments(comments, lambda x: x.eol)

def next_token(self):
@@ -356,7 +356,7 @@ def _scanre(self, pattern):
if isinstance(pattern, re.Pattern):
cre = pattern
else:
-cre = re.compile(pattern, re.MULTILINE)
+cre = re.compile(pattern)
return cre.match(self.text, self.pos)

@property
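
The `_scanre()` change above is the behavioral core of this commit: a pattern
given as a plain string is now compiled exactly as written. A stdlib-only
illustration of what that changes for `$`:

    import re

    text = "# first comment\nrest of input\n"

    # Old behavior: str patterns were compiled with re.MULTILINE, so '$'
    # matched at each end of line and the eol-comment pattern matched here.
    print(re.compile(r"#[^\n]*$", re.MULTILINE).match(text, 0))  # matches '# first comment'

    # New behavior: no implicit flag, so '$' only matches at the end of the
    # input and the same string pattern no longer matches at position 0.
    print(re.compile(r"#[^\n]*$").match(text, 0))                # None

    # Writing the flag inline in the pattern restores the old result.
    print(re.compile(r"(?m)#[^\n]*$").match(text, 0))            # matches again
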
16 changes: 8 additions & 8 deletions tatsu/codegen/python.py
@@ -463,8 +463,8 @@ def render_fields(self, fields):
left_recursion = self.node.config.left_recursion
parseinfo = self.node.config.parseinfo
namechars = repr(self.node.config.namechars or '')
-comments_re = repr(self.node.config.comments_re)
-eol_comments_re = repr(self.node.config.eol_comments_re)
+comments = repr(self.node.config.comments)
+eol_comments = repr(self.node.config.eol_comments)

rules = '\n'.join(
[self.get_renderer(rule).render() for rule in self.node.rules],
@@ -489,8 +489,8 @@ def render_fields(self, fields):
parseinfo=parseinfo,
keywords=keywords,
namechars=namechars,
-comments_re=comments_re,
-eol_comments_re=eol_comments_re,
+comments=comments,
+eol_comments=eol_comments,
)

abstract_rule_template = """
@@ -536,8 +536,8 @@ def __init__(self, text, /, config: ParserConfig | None = None, **settings):
ignorecase={ignorecase},
namechars={namechars},
parseinfo={parseinfo},
-comments_re={comments_re},
-eol_comments_re={eol_comments_re},
+comments={comments},
+eol_comments={eol_comments},
keywords=KEYWORDS,
start={start!r},
)
@@ -555,8 +555,8 @@ def __init__(self, /, config: ParserConfig | None = None, **settings):
ignorecase={ignorecase},
namechars={namechars},
parseinfo={parseinfo},
-comments_re={comments_re},
-eol_comments_re={eol_comments_re},
+comments={comments},
+eol_comments={eol_comments},
left_recursion={left_recursion},
keywords=KEYWORDS,
start={start!r},
34 changes: 25 additions & 9 deletions tatsu/infos.py
@@ -3,7 +3,7 @@
import copy
import dataclasses
import re
-from collections.abc import Callable, Mapping
+from collections.abc import Callable, MutableMapping
from itertools import starmap
from typing import Any, NamedTuple

@@ -30,8 +30,8 @@ class ParserConfig:
start_rule: str | None = None # FIXME
rule_name: str | None = None # Backward compatibility

-comments_re: re.Pattern | None = None
-eol_comments_re: re.Pattern | None = None
+comments_re: re.Pattern | str | None = None
+eol_comments_re: re.Pattern | str | None = None

tokenizercls: type[Tokenizer] | None = None # FIXME
semantics: type | None = None
@@ -63,10 +63,14 @@ class ParserConfig:
def __post_init__(self): # pylint: disable=W0235
if self.ignorecase:
self.keywords = [k.upper() for k in self.keywords]
-if self.comments:
-    self.comments_re = re.compile(self.comments)
-if self.eol_comments:
-    self.eol_comments_re = re.compile(self.eol_comments)
+
+if self.comments_re or self.eol_comments_re:
+    raise AttributeError("""\
+Both `comments_re` and `eol_comments_re` have been removed from parser configuration.
+Please use `comments` and/or `eol_comments` instead`.
+""")
+del self.comments_re
+del self.eol_comments_re

@classmethod
def new(
@@ -84,7 +88,7 @@ def effective_rule_name(self):
# note: there are legacy reasons for this mess
return self.start_rule or self.rule_name or self.start

-def _find_common(self, **settings: Any) -> Mapping[str, Any]:
+def _find_common(self, **settings: Any) -> MutableMapping[str, Any]:
return {
name: value
for name, value in settings.items()
@@ -101,8 +105,20 @@ def replace_config(
else:
return self.replace(**vars(other))

# non-init fields cannot be used as arguments in `replace`, however
# they are values returned by `vars` and `dataclass.asdict` so they
# must be filtered out.
# If the `ParserConfig` dataclass drops these fields, then this filter can be removed
def _filter_non_init_fields(self, settings: MutableMapping[str, Any]) -> MutableMapping[str, Any]:
for field in [
field.name for field in dataclasses.fields(self) if not field.init
]:
if field in settings:
del settings[field]
return settings

def replace(self, **settings: Any) -> ParserConfig:
-overrides = self._find_common(**settings)
+overrides = self._filter_non_init_fields(self._find_common(**settings))
result = dataclasses.replace(self, **overrides)
if 'grammar' in overrides:
result.name = result.grammar
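
The new `_filter_non_init_fields()` step exists because `dataclasses.replace()`
refuses values for `init=False` fields, even though (as the comment above notes)
they still show up in `vars()`/`dataclasses.asdict()` output. A stdlib-only
sketch of that constraint (this toy `Config` is not TatSu's `ParserConfig`):

    import dataclasses
    import re

    @dataclasses.dataclass
    class Config:
        comments: str | None = None
        comments_re: re.Pattern | None = dataclasses.field(default=None, init=False)

    cfg = Config(comments=r"(?sm)\(\*.*?\*\)")
    settings = dataclasses.asdict(cfg)   # still contains the non-init 'comments_re' key

    try:
        dataclasses.replace(cfg, **settings)
    except ValueError as err:
        print(err)  # field comments_re is declared with init=False ...

    settings.pop("comments_re")          # what _filter_non_init_fields() does
    print(dataclasses.replace(cfg, **settings))
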
4 changes: 2 additions & 2 deletions tatsu/ngcodegen/python.py
@@ -323,8 +323,8 @@ def _gen_init(self, grammar: grammars.Grammar):
ignorecase={grammar.config.ignorecase},
namechars={grammar.config.namechars!r},
parseinfo={grammar.config.parseinfo},
-comments_re={grammar.config.comments_re!r},
-eol_comments_re={grammar.config.eol_comments_re!r},
+comments={grammar.config.comments!r},
+eol_comments={grammar.config.eol_comments!r},
keywords=KEYWORDS,
start={start!r},
)
2 changes: 1 addition & 1 deletion test/grammar/pattern_test.py
@@ -22,7 +22,7 @@ def test_patterns_with_newlines(self):
blankline
=
-/^[^\\n]*\\n$/
+/(?m)^[^\\n]*\\n$/
;
"""

45 changes: 44 additions & 1 deletion test/grammar/syntax_test.py
@@ -352,7 +352,7 @@ def test_parse_hash():
start = '#' ;
"""

-parser = compile(grammar, eol_comments_re='')
+parser = compile(grammar, eol_comments='')
parser.parse('#', trace=True)


@@ -377,3 +377,46 @@ def test_no_default_comments():
"""
with pytest.raises(FailedToken):
tool.parse(grammar, text)


import re


@pytest.mark.parametrize(
"comment,option",
[
pytest.param(
"# This comment should be stripped",
{
"eol_comments_re": re.compile(r"(?m)#.*?$"),
"eol_comments": r"(?m)#.*?$",
},
id="eol_comments override",
),
pytest.param(
"(* This comment should be stripped *)",
{
"comments_re": re.compile(r"(?sm)[(][*](?:.|\n)*?[*][)]"),
"comments": r"(?sm)[(][*](?:.|\n)*?[*][)]",
},
id="comments override",
),
],
)
def test_deprecated_comments_override_failures(comment, option):
"""
# TODO: remove this test after {eol_}comments_re are no longer supported
"""
grammar = """
@@comments :: /@@@@@@/
@@eol_comments :: /@@@@@@/
start = 'a' $;
"""

text = f"""
{comment}
a
"""
with pytest.raises(AttributeError, match=""):
tool.parse(grammar, text, **option)
