From 8194ea8991c19232b682334d8c6c9fa68df5f432 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Tue, 20 May 2025 13:38:53 -0700 Subject: [PATCH 1/2] Change delphin.tdl from module to package It should be backwards-compatible and imports can work as before. --- CHANGELOG.md | 5 + delphin/tdl/__init__.py | 71 +++ delphin/tdl/_exceptions.py | 17 + delphin/tdl/_format.py | 298 +++++++++++ delphin/{tdl.py => tdl/_model.py} | 788 +----------------------------- delphin/tdl/_parse.py | 519 ++++++++++++++++++++ 6 files changed, 912 insertions(+), 786 deletions(-) create mode 100644 delphin/tdl/__init__.py create mode 100644 delphin/tdl/_exceptions.py create mode 100644 delphin/tdl/_format.py rename delphin/{tdl.py => tdl/_model.py} (51%) create mode 100644 delphin/tdl/_parse.py diff --git a/CHANGELOG.md b/CHANGELOG.md index d780c818..db06f597 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -49,6 +49,11 @@ * `tdl.AVM` initialization uses `AVM.aggregate()` instead of assignment of values on features ([#395]) +### Maintenance + +* Changed `delphin.tdl` from a simple module to a backwards-compatible + package + ## [v1.9.1] diff --git a/delphin/tdl/__init__.py b/delphin/tdl/__init__.py new file mode 100644 index 00000000..903bed42 --- /dev/null +++ b/delphin/tdl/__init__.py @@ -0,0 +1,71 @@ +__all__ = [ + 'AVM', + 'DIFF_LIST_LAST', + 'DIFF_LIST_LIST', + 'EMPTY_LIST_TYPE', + 'LIST_HEAD', + 'LIST_TAIL', + 'LIST_TYPE', + 'BlockComment', + 'Conjunction', + 'ConsList', + 'Coreference', + 'DiffList', + 'FileInclude', + 'InstanceEnvironment', + 'LetterSet', + 'LexicalRuleDefinition', + 'LineComment', + 'Regex', + 'String', + 'TDLError', + 'TDLSyntaxError', + 'TDLWarning', + 'Term', + 'TypeAddendum', + 'TypeDefinition', + 'TypeEnvironment', + 'TypeIdentifier', + 'WildCard', + # '_Environment', + # '_MorphSet', + 'format', + 'iterparse', +] + +from delphin.tdl._exceptions import ( + TDLError, + TDLSyntaxError, + TDLWarning, +) +from delphin.tdl._format import format +from delphin.tdl._model import ( + AVM, + DIFF_LIST_LAST, + DIFF_LIST_LIST, + EMPTY_LIST_TYPE, + LIST_HEAD, + LIST_TAIL, + LIST_TYPE, + BlockComment, + Conjunction, + ConsList, + Coreference, + DiffList, + FileInclude, + InstanceEnvironment, + LetterSet, + LexicalRuleDefinition, + LineComment, + Regex, + String, + Term, + TypeAddendum, + TypeDefinition, + TypeEnvironment, + TypeIdentifier, + WildCard, + # _Environment, + # _MorphSet, +) +from delphin.tdl._parse import iterparse # parse, decode diff --git a/delphin/tdl/_exceptions.py b/delphin/tdl/_exceptions.py new file mode 100644 index 00000000..d8e23646 --- /dev/null +++ b/delphin/tdl/_exceptions.py @@ -0,0 +1,17 @@ +from delphin.exceptions import ( + PyDelphinException, + PyDelphinSyntaxError, + PyDelphinWarning, +) + + +class TDLError(PyDelphinException): + """Raised when there is an error in processing TDL.""" + + +class TDLSyntaxError(PyDelphinSyntaxError): + """Raised when parsing TDL text fails.""" + + +class TDLWarning(PyDelphinWarning): + """Raised when parsing unsupported TDL features.""" diff --git a/delphin/tdl/_format.py b/delphin/tdl/_format.py new file mode 100644 index 00000000..3b8679ca --- /dev/null +++ b/delphin/tdl/_format.py @@ -0,0 +1,298 @@ +import textwrap + +from delphin.tdl._exceptions import TDLError +from delphin.tdl._model import ( + AVM, + BlockComment, + Conjunction, + ConsList, + Coreference, + DiffList, + FileInclude, + InstanceEnvironment, + LetterSet, + LineComment, + Regex, + String, + Term, + TypeDefinition, + TypeEnvironment, + TypeIdentifier, + WildCard, + _Environment, + _ImplicitAVM, + _MorphSet, +) + +# Values for serialization +_base_indent = 2 # indent when an AVM starts on the next line +_max_inline_list_items = 3 # number of list items that may appear inline +_line_width = 79 # try not to go beyond this number of characters + +# Serialization helpers + +def format(obj, indent=0): + """ + Serialize TDL objects to strings. + + Args: + obj: instance of :class:`Term`, :class:`Conjunction`, or + :class:`TypeDefinition` classes or subclasses + indent (int): number of spaces to indent the formatted object + Returns: + str: serialized form of *obj* + Example: + >>> conj = tdl.Conjunction([ + ... tdl.TypeIdentifier('lex-item'), + ... tdl.AVM([('SYNSEM.LOCAL.CAT.HEAD.MOD', + ... tdl.ConsList(end=tdl.EMPTY_LIST_TYPE))]) + ... ]) + >>> t = tdl.TypeDefinition('non-mod-lex-item', conj) + >>> print(format(t)) + non-mod-lex-item := lex-item & + [ SYNSEM.LOCAL.CAT.HEAD.MOD < > ]. + """ + if isinstance(obj, TypeDefinition): + return _format_typedef(obj, indent) + elif isinstance(obj, Conjunction): + return _format_conjunction(obj, indent) + elif isinstance(obj, Term): + return _format_term(obj, indent) + elif isinstance(obj, _MorphSet): + return _format_morphset(obj, indent) + elif isinstance(obj, _Environment): + return _format_environment(obj, indent) + elif isinstance(obj, FileInclude): + return _format_include(obj, indent) + elif isinstance(obj, LineComment): + return _format_linecomment(obj, indent) + elif isinstance(obj, BlockComment): + return _format_blockcomment(obj, indent) + else: + raise ValueError(f'cannot format object as TDL: {obj!r}') + + +def _format_term(term, indent): + fmt = { + TypeIdentifier: _format_id, + String: _format_string, + Regex: _format_regex, + Coreference: _format_coref, + AVM: _format_avm, + _ImplicitAVM: _format_avm, + ConsList: _format_conslist, + DiffList: _format_difflist, + }.get(term.__class__, None) + + if fmt is None: + raise TDLError('not a valid term: {}' + .format(type(term).__name__)) + + if term.docstring is not None: + return '{}\n{}{}'.format( + _format_docstring(term.docstring, indent), + ' ' * indent, + fmt(term, indent)) + else: + return fmt(term, indent) + + +def _format_id(term, indent): + return str(term) + + +def _format_string(term, indent): + return f'"{term!s}"' + + +def _format_regex(term, indent): + return f'^{term!s}$' + + +def _format_coref(term, indent): + return f'#{term!s}' + + +def _format_avm(avm, indent): + lines = [] + for feat, val in avm.features(): + val = _format_conjunction(val, indent + len(feat) + 3) + if not val.startswith('\n'): + feat += ' ' + lines.append(feat + val) + if not lines: + return '[ ]' + else: + return '[ {} ]'.format((',\n' + ' ' * (indent + 2)).join(lines)) + + +def _format_conslist(cl, indent): + values = [_format_conjunction(val, indent + 2) # 2 = len('< ') + for val in cl.values()] + end = '' + if not cl.terminated: + if values: + end = ', ...' + else: + values = ['...'] + elif cl._avm is not None and cl[cl._last_path] is not None: + end = ' . ' + values[-1] + values = values[:-1] + + if not values: # only if no values and terminated + return '< >' + elif (len(values) <= _max_inline_list_items + and sum(len(v) + 2 for v in values) + 2 + indent <= _line_width): + return '< {} >'.format(', '.join(values) + end) + else: + i = ' ' * (indent + 2) # 2 = len('< ') + lines = [f'< {values[0]}'] + lines.extend(i + val for val in values[1:]) + return ',\n'.join(lines) + end + ' >' + + +def _format_difflist(dl, indent): + values = [_format_conjunction(val, indent + 3) # 3 == len('' + elif (len(values) <= _max_inline_list_items + and sum(len(v) + 2 for v in values) + 4 + indent <= _line_width): + return ''.format(', '.join(values)) + else: + # i = ' ' * (indent + 3) # 3 == len(''.format( + (',\n' + ' ' * (indent + 3)).join(values)) + # values[0])] + # lines.extend(i + val for val in values[1:]) + # return ',\n'.join(lines) + ' !>' + + +def _format_conjunction(conj, indent): + if isinstance(conj, Term): + return _format_term(conj, indent) + elif len(conj._terms) == 0: + return '' + else: + tokens = [] + width = indent + for term in conj._terms: + tok = _format_term(term, width) + flen = max(len(s) for s in tok.splitlines()) + width += flen + 3 # 3 == len(' & ') + tokens.append(tok) + lines = [tokens] # all terms joined without newlines (for now) + return (' &\n' + ' ' * indent).join( + ' & '.join(line) for line in lines if line) + + +def _format_typedef(td, indent): + i = ' ' * indent + if hasattr(td, 'affix_type'): + patterns = ' '.join(f'({a} {b})' for a, b in td.patterns) + body = _format_typedef_body(td, indent, indent + 2) + return '{}{} {}\n%{} {}\n {}.'.format( + i, td.identifier, td._operator, td.affix_type, patterns, body) + else: + body = _format_typedef_body( + td, indent, indent + len(td.identifier) + 4) + return '{}{} {} {}.'.format(i, td.identifier, td._operator, body) + + +def _format_typedef_body(td, indent, offset): + parts = [[]] + for term in td.conjunction.terms: + if isinstance(term, AVM) and len(parts) == 1: + parts.append([]) + parts[-1].append(term) + + if parts[0] == []: + parts = [parts[1]] + assert len(parts) <= 2 + if len(parts) == 1: + formatted_conj = _format_conjunction(td.conjunction, offset) + else: + formatted_conj = '{} &\n{}{}'.format( + _format_conjunction(Conjunction(parts[0]), offset), + ' ' * (_base_indent + indent), + _format_conjunction(Conjunction(parts[1]), _base_indent + indent)) + + if td.docstring is not None: + docstring = '\n ' + _format_docstring(td.docstring, 2) + else: + docstring = '' + + return formatted_conj + docstring + + +def _format_docstring(doc, indent): + if doc is None: + return '' + lines = textwrap.dedent(doc).splitlines() + if lines: + if lines[0].strip() == '': + lines = lines[1:] + if lines[-1].strip() == '': + lines = lines[:-1] + ind = ' ' * indent + contents = _escape_docstring( + '\n{0}{1}\n{0}'.format(ind, ('\n' + ind).join(lines))) + return f'"""{contents}"""' + + +def _escape_docstring(s): + cs = [] + cnt = 0 + lastindex = len(s) - 1 + for i, c in enumerate(s): + if cnt == -1 or c not in '"\\': + cnt = 0 + elif c == '"': + cnt += 1 + if cnt == 3 or i == lastindex: + cs.append('\\') + cnt = 0 + elif c == '\\': + cnt = -1 + cs.append(c) + return ''.join(cs) + + +def _format_morphset(obj, indent): + if isinstance(obj, LetterSet): + mstype = 'letter-set' + elif isinstance(obj, WildCard): + mstype = 'wild-card' + else: + raise TypeError(f'not a valid morph-set class: {type(obj).__name__}') + return '{}%({} ({} {}))'.format( + ' ' * indent, mstype, obj.var, obj.characters + ) + + +def _format_environment(env, indent): + status = '' + if isinstance(env, TypeEnvironment): + envtype = ':type' + elif isinstance(env, InstanceEnvironment): + envtype = ':instance' + if env.status: + status = ' :status ' + env.status + + contents = '\n'.join(format(obj, indent + 2) for obj in env.entries) + if contents: + contents += '\n' + return '{0}:begin {1}{2}.\n{3}{0}:end {1}.'.format( + ' ' * indent, envtype, status, contents) + + +def _format_include(fi, indent): + return '{}:include "{}".'.format(' ' * indent, fi.value) + + +def _format_linecomment(obj, indent): + return '{};{}'.format(' ' * indent, str(obj)) + + +def _format_blockcomment(obj, indent): + return '{}#|{}|#'.format(' ' * indent, str(obj)) diff --git a/delphin/tdl.py b/delphin/tdl/_model.py similarity index 51% rename from delphin/tdl.py rename to delphin/tdl/_model.py index 12099e8f..0fc4f4bd 100644 --- a/delphin/tdl.py +++ b/delphin/tdl/_model.py @@ -1,23 +1,9 @@ -""" -Classes and functions for parsing and inspecting TDL. -""" - -import re -import textwrap -import warnings from collections.abc import Mapping, Sequence from pathlib import Path -from typing import Generator, Optional, Tuple, Union +from typing import Optional, Union from delphin import util - -# Default modules need to import the PyDelphin version -from delphin.__about__ import __version__ # noqa: F401 -from delphin.exceptions import ( - PyDelphinException, - PyDelphinSyntaxError, - PyDelphinWarning, -) +from delphin.tdl._exceptions import TDLError from delphin.tfs import FeatureStructure # Values for list expansion @@ -28,28 +14,9 @@ DIFF_LIST_LIST = 'LIST' #: feature for diff-list lists DIFF_LIST_LAST = 'LAST' #: feature for the last path in a diff-list -# Values for serialization -_base_indent = 2 # indent when an AVM starts on the next line -_max_inline_list_items = 3 # number of list items that may appear inline -_line_width = 79 # try not to go beyond this number of characters - - AttrSeq = Sequence[tuple[str, Union['Conjunction', 'Term']]] AttrMap = Mapping[str, Union['Conjunction', 'Term']] -# Exceptions - -class TDLError(PyDelphinException): - """Raised when there is an error in processing TDL.""" - - -class TDLSyntaxError(PyDelphinSyntaxError): - """Raised when parsing TDL text fails.""" - - -class TDLWarning(PyDelphinWarning): - """Raised when parsing unsupported TDL features.""" - # Classes for TDL entities @@ -923,754 +890,3 @@ class LineComment(str): class BlockComment(str): """Multi-line comments in TDL.""" - - -# NOTE: be careful rearranging subpatterns in _tdl_lex_re; some must -# appear before others, e.g., """ before ", [\]^|]+''' -_tdl_lex_re = re.compile( - r'''# regex-pattern gid description - (""") # 1 start of multiline docstring - |(\#\|) # 2 start of multiline comment - |;([^\n]*) # 3 single-line comment - |"([^"\\]*(?:\\.[^"\\]*)*)" # 4 double-quoted "strings" - |'({identifier}) # 5 single-quoted 'symbols - |\^([^$\\]*(?:\\.|[^$\\]*)*)\$ # 6 regular expression - |(:[=<]) # 7 type def operator - |(:\+) # 8 type addendum operator - |(\.\.\.) # 9 list ellipsis - |(\.) # 10 dot operator - |(&) # 11 conjunction operator - |(,) # 12 list delimiter - |(\[) # 13 AVM open - |() # 17 diff list close - |(>) # 18 cons list close - |\#({identifier}) # 19 coreference - |%\s*\((.*)\) # 20 letter-set or wild-card - |%(prefix|suffix) # 21 start of affixing pattern - |\(([^ ]+\s+(?:[^ )\\]|\\.)+)\) # 22 affix subpattern - |(\/) # 23 defaults (currently unused) - |({identifier}) # 24 identifiers and symbols - |(:begin) # 25 start a :type or :instance block - |(:end) # 26 end a :type or :instance block - |(:type|:instance) # 27 environment type - |(:status) # 28 instance status - |(:include) # 29 file inclusion - |([^\s]) # 30 unexpected - '''.format(identifier=_identifier_pattern), - flags=re.VERBOSE | re.UNICODE) - - -# Parsing helper functions - -def _is_comment(data): - """helper function for filtering out comments""" - return 2 <= data[0] <= 3 - - -def _peek(tokens, n=0): - """peek and drop comments""" - return tokens.peek(n=n, skip=_is_comment, drop=True) - - -def _next(tokens): - """pop the next token, dropping comments""" - return tokens.next(skip=_is_comment) - - -def _shift(tokens): - """pop the next token, then peek the gid of the following""" - after = tokens.peek(n=1, skip=_is_comment, drop=True) - tok = tokens._buffer.popleft() - return tok[0], tok[1], tok[2], after[0] - - -def _lex(stream): - """ - Lex the input stream according to _tdl_lex_re. - - Yields - (gid, token, line_number) - """ - lines = enumerate(stream, 1) - line_no = pos = 0 - try: - while True: - if pos == 0: - line_no, line = next(lines) - matches = _tdl_lex_re.finditer(line, pos) - pos = 0 # reset; only used for multiline patterns - for m in matches: - gid = m.lastindex - if gid <= 2: # potentially multiline patterns - if gid == 1: # docstring - s, start_line_no, line_no, line, pos = _bounded( - '"""', '"""', line, m.end(), line_no, lines) - elif gid == 2: # comment - s, start_line_no, line_no, line, pos = _bounded( - '#|', '|#', line, m.end(), line_no, lines) - yield (gid, s, line_no) - break - elif gid == 30: - raise TDLSyntaxError( - lineno=line_no, - offset=m.start(), - text=line) - else: - # token = None - # if not (6 < gid < 20): - # token = m.group(gid) - token = m.group(gid) - yield (gid, token, line_no) - except StopIteration: - pass - - -def _bounded(p1, p2, line, pos, line_no, lines): - """Collect the contents of a bounded multiline string""" - substrings = [] - start_line_no = line_no - end = pos - while not line.startswith(p2, end): - if line[end] == '\\': - end += 2 - else: - end += 1 - if end >= len(line): - substrings.append(line[pos:]) - try: - line_no, line = next(lines) - except StopIteration: - pattern = 'docstring' if p1 == '"""' else 'block comment' - raise TDLSyntaxError( - f'unterminated {pattern}', - lineno=start_line_no - ) from None - pos = end = 0 - substrings.append(line[pos:end]) - end += len(p2) - return ''.join(substrings), start_line_no, line_no, line, end - - -# Parsing functions - -ParseEvent = Tuple[ - str, - Union[str, TypeDefinition, _MorphSet, _Environment, FileInclude], - int -] - - -def iterparse(path: util.PathLike, - encoding: str = 'utf-8') -> Generator[ParseEvent, None, None]: - """ - Parse the TDL file at *path* and iteratively yield parse events. - - Parse events are `(event, object, lineno)` tuples, where `event` - is a string (`"TypeDefinition"`, `"TypeAddendum"`, - `"LexicalRuleDefinition"`, `"LetterSet"`, `"WildCard"`, - `"BeginEnvironment"`, `"EndEnvironment"`, `"FileInclude"`, - `"LineComment"`, or `"BlockComment"`), `object` is the interpreted - TDL object, and `lineno` is the line number where the entity began - in *path*. - - Args: - path: path to a TDL file - encoding (str): the encoding of the file (default: `"utf-8"`) - Yields: - `(event, object, lineno)` tuples - Example: - >>> lex = {} - >>> for event, obj, lineno in tdl.iterparse('erg/lexicon.tdl'): - ... if event == 'TypeDefinition': - ... lex[obj.identifier] = obj - ... - >>> lex['eucalyptus_n1']['SYNSEM.LKEYS.KEYREL.PRED'] - - """ - path = Path(path).expanduser() - with path.open(encoding=encoding) as fh: - yield from _parse(fh, path) - - -def _parse(f, path): - tokens = util.LookaheadIterator(_lex(f)) - try: - yield from _parse_tdl(tokens, path) - except TDLSyntaxError as ex: - ex.filename = str(path) - raise - except RecursionError as exc: - raise TDLError( - "excessively recursive TDL structure (perhaps there's " - "a very long list); try increasing Python's recursion " - "limit with sys.setrecursionlimit(n)" - ) from exc - - -def _parse_tdl(tokens, path): - environment = None - envstack = [] - try: - line_no = 1 - while True: - obj = None - try: - gid, token, line_no = tokens.next() - except StopIteration: # normal EOF - break - if gid == 2: - yield ('BlockComment', BlockComment(token), line_no) - elif gid == 3: - yield ('LineComment', LineComment(token), line_no) - elif gid == 20: - obj = _parse_letterset(token, line_no) - yield (obj.__class__.__name__, obj, line_no) - elif gid == 24: - obj = _parse_tdl_definition(token, tokens) - yield (obj.__class__.__name__, obj, line_no) - elif gid == 25: - envstack.append(environment) - _environment = _parse_tdl_begin_environment(tokens) - if environment is not None: - environment.entries.append(_environment) - environment = _environment - yield ('BeginEnvironment', environment, line_no) - elif gid == 26: - _parse_tdl_end_environment(tokens, environment) - yield ('EndEnvironment', environment, line_no) - environment = envstack.pop() - elif gid == 29: - obj = _parse_tdl_include(tokens, path.parent) - yield ('FileInclude', obj, line_no) - else: - raise TDLSyntaxError( - f'unexpected token: {token}', - lineno=line_no) - if environment is not None and obj is not None: - environment.entries.append(obj) - except StopIteration: - raise TDLSyntaxError('unexpected end of input.') from None - - -def _parse_tdl_definition(identifier, tokens): - gid, token, line_no, nextgid = _shift(tokens) - - if gid == 7 and nextgid == 21: # lex rule with affixes - atype, pats = _parse_tdl_affixes(tokens) - conjunction, nextgid = _parse_tdl_conjunction(tokens) - obj = LexicalRuleDefinition( - identifier, atype, pats, conjunction) - - elif gid == 7: - if token == ':<': - warnings.warn( - 'Subtype operator :< encountered at line {} for ' - '{}; Continuing as if it were the := operator.' - .format(line_no, identifier), - TDLWarning, - stacklevel=2, - ) - conjunction, nextgid = _parse_tdl_conjunction(tokens) - if isinstance(conjunction, Term): - conjunction = Conjunction([conjunction]) - if len(conjunction.types()) == 0: - raise TDLSyntaxError( - f'no supertypes defined on {identifier}', - lineno=line_no) - obj = TypeDefinition(identifier, conjunction) - - elif gid == 8: - if nextgid == 1 and _peek(tokens, n=1)[0] == 10: - # docstring will be handled after the if-block - conjunction = Conjunction() - else: - conjunction, nextgid = _parse_tdl_conjunction(tokens) - obj = TypeAddendum(identifier, conjunction) - - else: - raise TDLSyntaxError("expected: := or :+", - lineno=line_no) - - if nextgid == 1: # pre-dot docstring - _, token, _, nextgid = _shift(tokens) - obj.docstring = token - if nextgid != 10: # . dot - raise TDLSyntaxError('expected: .', lineno=line_no) - tokens.next() - - return obj - - -def _parse_letterset(token, line_no): - end = r'\s+((?:[^) \\]|\\.)+)\)' - m = re.match(r'\s*letter-set\s*\((!.)' + end, token) - if m is not None: - chars = re.sub(r'\\(.)', r'\1', m.group(2)) - return LetterSet(m.group(1), chars) - else: - m = re.match(r'\s*wild-card\s*\((\?.)' + end, token) - if m is not None: - chars = re.sub(r'\\(.)', r'\1', m.group(2)) - return WildCard(m.group(1), chars) - # if execution reached here there was a problems - raise TDLSyntaxError( - f'invalid letter-set or wild-card: {token}', - lineno=line_no) - - -def _parse_tdl_affixes(tokens): - gid, token, line_no, nextgid = _shift(tokens) - assert gid == 21 - affixtype = token - affixes = [] - while nextgid == 22: - gid, token, line_no, nextgid = _shift(tokens) - match, replacement = token.split(None, 1) - affixes.append((match, replacement)) - return affixtype, affixes - - -def _parse_tdl_conjunction(tokens): - terms = [] - while True: - term, nextgid = _parse_tdl_term(tokens) - terms.append(term) - if nextgid == 11: # & operator - tokens.next() - else: - break - if len(terms) == 1: - return terms[0], nextgid - else: - return Conjunction(terms), nextgid - - -def _parse_tdl_term(tokens): - doc = None - - gid, token, line_no, nextgid = _shift(tokens) - - # docstrings are not part of the conjunction so check separately - if gid == 1: # docstring - doc = token - gid, token, line_no, nextgid = _shift(tokens) - - if gid == 4: # string - term = String(token, docstring=doc) - elif gid == 5: # quoted symbol - warnings.warn( - f'Single-quoted symbol encountered at line {line_no}; ' - 'Continuing as if it were a regular symbol.', - TDLWarning, - stacklevel=2, - ) - term = TypeIdentifier(token, docstring=doc) - elif gid == 6: # regex - term = Regex(token, docstring=doc) - elif gid == 13: # AVM open - featvals, nextgid = _parse_tdl_feature_structure(tokens) - term = AVM(featvals, docstring=doc) - elif gid == 14: # diff list open - values, _, nextgid = _parse_tdl_list(tokens, break_gid=17) - term = DiffList(values, docstring=doc) - elif gid == 15: # cons list open - values, end, nextgid = _parse_tdl_list(tokens, break_gid=18) - term = ConsList(values, end=end, docstring=doc) - elif gid == 19: # coreference - term = Coreference(token, docstring=doc) - elif gid == 24: # identifier - term = TypeIdentifier(token, docstring=doc) - else: - raise TDLSyntaxError('expected a TDL conjunction term.', - lineno=line_no, text=token) - return term, nextgid - - -def _parse_tdl_feature_structure(tokens): - feats = [] - gid, token, line_no, nextgid = _shift(tokens) - if gid != 16: # ] feature structure terminator - while True: - if gid != 24: # identifier (attribute name) - raise TDLSyntaxError('Expected a feature name', - lineno=line_no, text=token) - path = [token] - while nextgid == 10: # . dot - tokens.next() - gid, token, line_no, nextgid = _shift(tokens) - assert gid == 24 - path.append(token) - attr = '.'.join(path) - - conjunction, nextgid = _parse_tdl_conjunction(tokens) - feats.append((attr, conjunction)) - - if nextgid == 12: # , list delimiter - tokens.next() - gid, token, line_no, nextgid = _shift(tokens) - elif nextgid == 16: - gid, _, _, nextgid = _shift(tokens) - break - else: - raise TDLSyntaxError('expected: , or ]', - lineno=line_no) - - assert gid == 16 - - return feats, nextgid - - -def _parse_tdl_list(tokens, break_gid): - values = [] - end = None - nextgid = _peek(tokens)[0] - if nextgid == break_gid: - _, _, _, nextgid = _shift(tokens) - else: - while True: - if nextgid == 9: # ... ellipsis - _, _, _, nextgid = _shift(tokens) - end = LIST_TYPE - break - else: - term, nextgid = _parse_tdl_conjunction(tokens) - values.append(term) - - if nextgid == 10: # . dot - tokens.next() - end, nextgid = _parse_tdl_conjunction(tokens) - break - elif nextgid == break_gid: - break - elif nextgid == 12: # , comma delimiter - _, _, _, nextgid = _shift(tokens) - else: - raise TDLSyntaxError('expected: comma or end of list') - - gid, _, line_no, nextgid = _shift(tokens) - if gid != break_gid: - raise TDLSyntaxError('expected: end of list', - lineno=line_no) - - if len(values) == 0 and end is None: - end = EMPTY_LIST_TYPE - - return values, end, nextgid - - -def _parse_tdl_begin_environment(tokens): - gid, envtype, lineno = tokens.next() - if gid != 27: - raise TDLSyntaxError('expected: :type or :instance', - lineno=lineno, text=envtype) - gid, token, lineno = tokens.next() - if envtype == ':instance': - status = envtype[1:] - if token == ':status': - status = tokens.next()[1] - gid, token, lineno = tokens.next() - elif gid != 10: - raise TDLSyntaxError('expected: :status or .', - lineno=lineno) - env = InstanceEnvironment(status) - else: - env = TypeEnvironment() - if gid != 10: - raise TDLSyntaxError('expected: .', lineno=lineno, text=token) - return env - - -def _parse_tdl_end_environment(tokens, env): - _, envtype, lineno = tokens.next() - if envtype == ':type' and not isinstance(env, TypeEnvironment): - raise TDLSyntaxError('expected: :type', lineno=lineno, text=envtype) - elif envtype == ':instance' and not isinstance(env, InstanceEnvironment): - raise TDLSyntaxError('expected: :instance', - lineno=lineno, text=envtype) - gid, _, lineno = tokens.next() - if gid != 10: - raise TDLSyntaxError('expected: .', lineno=lineno) - return envtype - - -def _parse_tdl_include(tokens, basedir): - gid, value, lineno = tokens.next() - if gid != 4: - raise TDLSyntaxError('expected: a quoted filename', - lineno=lineno, text=value) - gid, _, lineno = tokens.next() - if gid != 10: - raise TDLSyntaxError('expected: .', lineno=lineno) - return FileInclude(value, basedir=basedir) - - -# Serialization helpers - -def format(obj, indent=0): - """ - Serialize TDL objects to strings. - - Args: - obj: instance of :class:`Term`, :class:`Conjunction`, or - :class:`TypeDefinition` classes or subclasses - indent (int): number of spaces to indent the formatted object - Returns: - str: serialized form of *obj* - Example: - >>> conj = tdl.Conjunction([ - ... tdl.TypeIdentifier('lex-item'), - ... tdl.AVM([('SYNSEM.LOCAL.CAT.HEAD.MOD', - ... tdl.ConsList(end=tdl.EMPTY_LIST_TYPE))]) - ... ]) - >>> t = tdl.TypeDefinition('non-mod-lex-item', conj) - >>> print(format(t)) - non-mod-lex-item := lex-item & - [ SYNSEM.LOCAL.CAT.HEAD.MOD < > ]. - """ - if isinstance(obj, TypeDefinition): - return _format_typedef(obj, indent) - elif isinstance(obj, Conjunction): - return _format_conjunction(obj, indent) - elif isinstance(obj, Term): - return _format_term(obj, indent) - elif isinstance(obj, _MorphSet): - return _format_morphset(obj, indent) - elif isinstance(obj, _Environment): - return _format_environment(obj, indent) - elif isinstance(obj, FileInclude): - return _format_include(obj, indent) - elif isinstance(obj, LineComment): - return _format_linecomment(obj, indent) - elif isinstance(obj, BlockComment): - return _format_blockcomment(obj, indent) - else: - raise ValueError(f'cannot format object as TDL: {obj!r}') - - -def _format_term(term, indent): - fmt = { - TypeIdentifier: _format_id, - String: _format_string, - Regex: _format_regex, - Coreference: _format_coref, - AVM: _format_avm, - _ImplicitAVM: _format_avm, - ConsList: _format_conslist, - DiffList: _format_difflist, - }.get(term.__class__, None) - - if fmt is None: - raise TDLError('not a valid term: {}' - .format(type(term).__name__)) - - if term.docstring is not None: - return '{}\n{}{}'.format( - _format_docstring(term.docstring, indent), - ' ' * indent, - fmt(term, indent)) - else: - return fmt(term, indent) - - -def _format_id(term, indent): - return str(term) - - -def _format_string(term, indent): - return f'"{term!s}"' - - -def _format_regex(term, indent): - return f'^{term!s}$' - - -def _format_coref(term, indent): - return f'#{term!s}' - - -def _format_avm(avm, indent): - lines = [] - for feat, val in avm.features(): - val = _format_conjunction(val, indent + len(feat) + 3) - if not val.startswith('\n'): - feat += ' ' - lines.append(feat + val) - if not lines: - return '[ ]' - else: - return '[ {} ]'.format((',\n' + ' ' * (indent + 2)).join(lines)) - - -def _format_conslist(cl, indent): - values = [_format_conjunction(val, indent + 2) # 2 = len('< ') - for val in cl.values()] - end = '' - if not cl.terminated: - if values: - end = ', ...' - else: - values = ['...'] - elif cl._avm is not None and cl[cl._last_path] is not None: - end = ' . ' + values[-1] - values = values[:-1] - - if not values: # only if no values and terminated - return '< >' - elif (len(values) <= _max_inline_list_items - and sum(len(v) + 2 for v in values) + 2 + indent <= _line_width): - return '< {} >'.format(', '.join(values) + end) - else: - i = ' ' * (indent + 2) # 2 = len('< ') - lines = [f'< {values[0]}'] - lines.extend(i + val for val in values[1:]) - return ',\n'.join(lines) + end + ' >' - - -def _format_difflist(dl, indent): - values = [_format_conjunction(val, indent + 3) # 3 == len('' - elif (len(values) <= _max_inline_list_items - and sum(len(v) + 2 for v in values) + 4 + indent <= _line_width): - return ''.format(', '.join(values)) - else: - # i = ' ' * (indent + 3) # 3 == len(''.format( - (',\n' + ' ' * (indent + 3)).join(values)) - # values[0])] - # lines.extend(i + val for val in values[1:]) - # return ',\n'.join(lines) + ' !>' - - -def _format_conjunction(conj, indent): - if isinstance(conj, Term): - return _format_term(conj, indent) - elif len(conj._terms) == 0: - return '' - else: - tokens = [] - width = indent - for term in conj._terms: - tok = _format_term(term, width) - flen = max(len(s) for s in tok.splitlines()) - width += flen + 3 # 3 == len(' & ') - tokens.append(tok) - lines = [tokens] # all terms joined without newlines (for now) - return (' &\n' + ' ' * indent).join( - ' & '.join(line) for line in lines if line) - - -def _format_typedef(td, indent): - i = ' ' * indent - if hasattr(td, 'affix_type'): - patterns = ' '.join(f'({a} {b})' for a, b in td.patterns) - body = _format_typedef_body(td, indent, indent + 2) - return '{}{} {}\n%{} {}\n {}.'.format( - i, td.identifier, td._operator, td.affix_type, patterns, body) - else: - body = _format_typedef_body( - td, indent, indent + len(td.identifier) + 4) - return '{}{} {} {}.'.format(i, td.identifier, td._operator, body) - - -def _format_typedef_body(td, indent, offset): - parts = [[]] - for term in td.conjunction.terms: - if isinstance(term, AVM) and len(parts) == 1: - parts.append([]) - parts[-1].append(term) - - if parts[0] == []: - parts = [parts[1]] - assert len(parts) <= 2 - if len(parts) == 1: - formatted_conj = _format_conjunction(td.conjunction, offset) - else: - formatted_conj = '{} &\n{}{}'.format( - _format_conjunction(Conjunction(parts[0]), offset), - ' ' * (_base_indent + indent), - _format_conjunction(Conjunction(parts[1]), _base_indent + indent)) - - if td.docstring is not None: - docstring = '\n ' + _format_docstring(td.docstring, 2) - else: - docstring = '' - - return formatted_conj + docstring - - -def _format_docstring(doc, indent): - if doc is None: - return '' - lines = textwrap.dedent(doc).splitlines() - if lines: - if lines[0].strip() == '': - lines = lines[1:] - if lines[-1].strip() == '': - lines = lines[:-1] - ind = ' ' * indent - contents = _escape_docstring( - '\n{0}{1}\n{0}'.format(ind, ('\n' + ind).join(lines))) - return f'"""{contents}"""' - - -def _escape_docstring(s): - cs = [] - cnt = 0 - lastindex = len(s) - 1 - for i, c in enumerate(s): - if cnt == -1 or c not in '"\\': - cnt = 0 - elif c == '"': - cnt += 1 - if cnt == 3 or i == lastindex: - cs.append('\\') - cnt = 0 - elif c == '\\': - cnt = -1 - cs.append(c) - return ''.join(cs) - - -def _format_morphset(obj, indent): - mstype = 'letter-set' if isinstance(obj, LetterSet) else 'wild-card' - return '{}%({} ({} {}))'.format( - ' ' * indent, mstype, obj.var, obj.characters) - - -def _format_environment(env, indent): - status = '' - if isinstance(env, TypeEnvironment): - envtype = ':type' - elif isinstance(env, InstanceEnvironment): - envtype = ':instance' - if env.status: - status = ' :status ' + env.status - - contents = '\n'.join(format(obj, indent + 2) for obj in env.entries) - if contents: - contents += '\n' - return '{0}:begin {1}{2}.\n{3}{0}:end {1}.'.format( - ' ' * indent, envtype, status, contents) - - -def _format_include(fi, indent): - return '{}:include "{}".'.format(' ' * indent, fi.value) - - -def _format_linecomment(obj, indent): - return '{};{}'.format(' ' * indent, str(obj)) - - -def _format_blockcomment(obj, indent): - return '{}#|{}|#'.format(' ' * indent, str(obj)) diff --git a/delphin/tdl/_parse.py b/delphin/tdl/_parse.py new file mode 100644 index 00000000..ff9888eb --- /dev/null +++ b/delphin/tdl/_parse.py @@ -0,0 +1,519 @@ +import re +import warnings +from pathlib import Path +from typing import Generator, Union + +from delphin import util +from delphin.tdl._exceptions import TDLError, TDLSyntaxError, TDLWarning +from delphin.tdl._model import ( + AVM, + EMPTY_LIST_TYPE, + LIST_TYPE, + BlockComment, + Conjunction, + ConsList, + Coreference, + DiffList, + FileInclude, + InstanceEnvironment, + LetterSet, + LexicalRuleDefinition, + LineComment, + Regex, + String, + Term, + TypeAddendum, + TypeDefinition, + TypeEnvironment, + TypeIdentifier, + WildCard, + _Environment, + _MorphSet, +) + +# NOTE: be careful rearranging subpatterns in _tdl_lex_re; some must +# appear before others, e.g., """ before ", [\]^|]+''' +_tdl_lex_re = re.compile( + r'''# regex-pattern gid description + (""") # 1 start of multiline docstring + |(\#\|) # 2 start of multiline comment + |;([^\n]*) # 3 single-line comment + |"([^"\\]*(?:\\.[^"\\]*)*)" # 4 double-quoted "strings" + |'({identifier}) # 5 single-quoted 'symbols + |\^([^$\\]*(?:\\.|[^$\\]*)*)\$ # 6 regular expression + |(:[=<]) # 7 type def operator + |(:\+) # 8 type addendum operator + |(\.\.\.) # 9 list ellipsis + |(\.) # 10 dot operator + |(&) # 11 conjunction operator + |(,) # 12 list delimiter + |(\[) # 13 AVM open + |() # 17 diff list close + |(>) # 18 cons list close + |\#({identifier}) # 19 coreference + |%\s*\((.*)\) # 20 letter-set or wild-card + |%(prefix|suffix) # 21 start of affixing pattern + |\(([^ ]+\s+(?:[^ )\\]|\\.)+)\) # 22 affix subpattern + |(\/) # 23 defaults (currently unused) + |({identifier}) # 24 identifiers and symbols + |(:begin) # 25 start a :type or :instance block + |(:end) # 26 end a :type or :instance block + |(:type|:instance) # 27 environment type + |(:status) # 28 instance status + |(:include) # 29 file inclusion + |([^\s]) # 30 unexpected + '''.format(identifier=_identifier_pattern), + flags=re.VERBOSE | re.UNICODE) + + +# Parsing helper functions + +def _is_comment(data): + """helper function for filtering out comments""" + return 2 <= data[0] <= 3 + + +def _peek(tokens, n=0): + """peek and drop comments""" + return tokens.peek(n=n, skip=_is_comment, drop=True) + + +def _next(tokens): + """pop the next token, dropping comments""" + return tokens.next(skip=_is_comment) + + +def _shift(tokens): + """pop the next token, then peek the gid of the following""" + after = tokens.peek(n=1, skip=_is_comment, drop=True) + tok = tokens._buffer.popleft() + return tok[0], tok[1], tok[2], after[0] + + +def _lex(stream): + """ + Lex the input stream according to _tdl_lex_re. + + Yields + (gid, token, line_number) + """ + lines = enumerate(stream, 1) + line_no = pos = 0 + try: + while True: + if pos == 0: + line_no, line = next(lines) + matches = _tdl_lex_re.finditer(line, pos) + pos = 0 # reset; only used for multiline patterns + for m in matches: + gid = m.lastindex + if gid <= 2: # potentially multiline patterns + if gid == 1: # docstring + s, start_line_no, line_no, line, pos = _bounded( + '"""', '"""', line, m.end(), line_no, lines) + elif gid == 2: # comment + s, start_line_no, line_no, line, pos = _bounded( + '#|', '|#', line, m.end(), line_no, lines) + yield (gid, s, line_no) + break + elif gid == 30: + raise TDLSyntaxError( + lineno=line_no, + offset=m.start(), + text=line) + else: + # token = None + # if not (6 < gid < 20): + # token = m.group(gid) + token = m.group(gid) + yield (gid, token, line_no) + except StopIteration: + pass + + +def _bounded(p1, p2, line, pos, line_no, lines): + """Collect the contents of a bounded multiline string""" + substrings = [] + start_line_no = line_no + end = pos + while not line.startswith(p2, end): + if line[end] == '\\': + end += 2 + else: + end += 1 + if end >= len(line): + substrings.append(line[pos:]) + try: + line_no, line = next(lines) + except StopIteration: + pattern = 'docstring' if p1 == '"""' else 'block comment' + raise TDLSyntaxError( + f'unterminated {pattern}', + lineno=start_line_no + ) from None + pos = end = 0 + substrings.append(line[pos:end]) + end += len(p2) + return ''.join(substrings), start_line_no, line_no, line, end + + +# Parsing functions + +ParseEvent = tuple[ + str, + Union[str, TypeDefinition, _MorphSet, _Environment, FileInclude], + int +] + + +def iterparse(path: util.PathLike, + encoding: str = 'utf-8') -> Generator[ParseEvent, None, None]: + """ + Parse the TDL file at *path* and iteratively yield parse events. + + Parse events are `(event, object, lineno)` tuples, where `event` + is a string (`"TypeDefinition"`, `"TypeAddendum"`, + `"LexicalRuleDefinition"`, `"LetterSet"`, `"WildCard"`, + `"BeginEnvironment"`, `"EndEnvironment"`, `"FileInclude"`, + `"LineComment"`, or `"BlockComment"`), `object` is the interpreted + TDL object, and `lineno` is the line number where the entity began + in *path*. + + Args: + path: path to a TDL file + encoding (str): the encoding of the file (default: `"utf-8"`) + Yields: + `(event, object, lineno)` tuples + Example: + >>> lex = {} + >>> for event, obj, lineno in tdl.iterparse('erg/lexicon.tdl'): + ... if event == 'TypeDefinition': + ... lex[obj.identifier] = obj + ... + >>> lex['eucalyptus_n1']['SYNSEM.LKEYS.KEYREL.PRED'] + + """ + path = Path(path).expanduser() + with path.open(encoding=encoding) as fh: + yield from _parse(fh, path) + + +def _parse(f, path): + tokens = util.LookaheadIterator(_lex(f)) + try: + yield from _parse_tdl(tokens, path) + except TDLSyntaxError as ex: + ex.filename = str(path) + raise + except RecursionError as exc: + raise TDLError( + "excessively recursive TDL structure (perhaps there's " + "a very long list); try increasing Python's recursion " + "limit with sys.setrecursionlimit(n)" + ) from exc + + +def _parse_tdl(tokens, path): + environment = None + envstack = [] + try: + line_no = 1 + while True: + obj = None + try: + gid, token, line_no = tokens.next() + except StopIteration: # normal EOF + break + if gid == 2: + yield ('BlockComment', BlockComment(token), line_no) + elif gid == 3: + yield ('LineComment', LineComment(token), line_no) + elif gid == 20: + obj = _parse_letterset(token, line_no) + yield (obj.__class__.__name__, obj, line_no) + elif gid == 24: + obj = _parse_tdl_definition(token, tokens) + yield (obj.__class__.__name__, obj, line_no) + elif gid == 25: + envstack.append(environment) + _environment = _parse_tdl_begin_environment(tokens) + if environment is not None: + environment.entries.append(_environment) + environment = _environment + yield ('BeginEnvironment', environment, line_no) + elif gid == 26: + _parse_tdl_end_environment(tokens, environment) + yield ('EndEnvironment', environment, line_no) + environment = envstack.pop() + elif gid == 29: + obj = _parse_tdl_include(tokens, path.parent) + yield ('FileInclude', obj, line_no) + else: + raise TDLSyntaxError( + f'unexpected token: {token}', + lineno=line_no) + if environment is not None and obj is not None: + environment.entries.append(obj) + except StopIteration: + raise TDLSyntaxError('unexpected end of input.') from None + + +def _parse_tdl_definition(identifier, tokens): + gid, token, line_no, nextgid = _shift(tokens) + + if gid == 7 and nextgid == 21: # lex rule with affixes + atype, pats = _parse_tdl_affixes(tokens) + conjunction, nextgid = _parse_tdl_conjunction(tokens) + obj = LexicalRuleDefinition( + identifier, atype, pats, conjunction) + + elif gid == 7: + if token == ':<': + warnings.warn( + 'Subtype operator :< encountered at line {} for ' + '{}; Continuing as if it were the := operator.' + .format(line_no, identifier), + TDLWarning, + stacklevel=2, + ) + conjunction, nextgid = _parse_tdl_conjunction(tokens) + if isinstance(conjunction, Term): + conjunction = Conjunction([conjunction]) + if len(conjunction.types()) == 0: + raise TDLSyntaxError( + f'no supertypes defined on {identifier}', + lineno=line_no) + obj = TypeDefinition(identifier, conjunction) + + elif gid == 8: + if nextgid == 1 and _peek(tokens, n=1)[0] == 10: + # docstring will be handled after the if-block + conjunction = Conjunction() + else: + conjunction, nextgid = _parse_tdl_conjunction(tokens) + obj = TypeAddendum(identifier, conjunction) + + else: + raise TDLSyntaxError("expected: := or :+", + lineno=line_no) + + if nextgid == 1: # pre-dot docstring + _, token, _, nextgid = _shift(tokens) + obj.docstring = token + if nextgid != 10: # . dot + raise TDLSyntaxError('expected: .', lineno=line_no) + tokens.next() + + return obj + + +def _parse_letterset(token, line_no): + end = r'\s+((?:[^) \\]|\\.)+)\)' + m = re.match(r'\s*letter-set\s*\((!.)' + end, token) + if m is not None: + chars = re.sub(r'\\(.)', r'\1', m.group(2)) + return LetterSet(m.group(1), chars) + else: + m = re.match(r'\s*wild-card\s*\((\?.)' + end, token) + if m is not None: + chars = re.sub(r'\\(.)', r'\1', m.group(2)) + return WildCard(m.group(1), chars) + # if execution reached here there was a problems + raise TDLSyntaxError( + f'invalid letter-set or wild-card: {token}', + lineno=line_no) + + +def _parse_tdl_affixes(tokens): + gid, token, line_no, nextgid = _shift(tokens) + assert gid == 21 + affixtype = token + affixes = [] + while nextgid == 22: + gid, token, line_no, nextgid = _shift(tokens) + match, replacement = token.split(None, 1) + affixes.append((match, replacement)) + return affixtype, affixes + + +def _parse_tdl_conjunction(tokens): + terms = [] + while True: + term, nextgid = _parse_tdl_term(tokens) + terms.append(term) + if nextgid == 11: # & operator + tokens.next() + else: + break + if len(terms) == 1: + return terms[0], nextgid + else: + return Conjunction(terms), nextgid + + +def _parse_tdl_term(tokens): + doc = None + + gid, token, line_no, nextgid = _shift(tokens) + + # docstrings are not part of the conjunction so check separately + if gid == 1: # docstring + doc = token + gid, token, line_no, nextgid = _shift(tokens) + + if gid == 4: # string + term = String(token, docstring=doc) + elif gid == 5: # quoted symbol + warnings.warn( + f'Single-quoted symbol encountered at line {line_no}; ' + 'Continuing as if it were a regular symbol.', + TDLWarning, + stacklevel=2, + ) + term = TypeIdentifier(token, docstring=doc) + elif gid == 6: # regex + term = Regex(token, docstring=doc) + elif gid == 13: # AVM open + featvals, nextgid = _parse_tdl_feature_structure(tokens) + term = AVM(featvals, docstring=doc) + elif gid == 14: # diff list open + values, _, nextgid = _parse_tdl_list(tokens, break_gid=17) + term = DiffList(values, docstring=doc) + elif gid == 15: # cons list open + values, end, nextgid = _parse_tdl_list(tokens, break_gid=18) + term = ConsList(values, end=end, docstring=doc) + elif gid == 19: # coreference + term = Coreference(token, docstring=doc) + elif gid == 24: # identifier + term = TypeIdentifier(token, docstring=doc) + else: + raise TDLSyntaxError('expected a TDL conjunction term.', + lineno=line_no, text=token) + return term, nextgid + + +def _parse_tdl_feature_structure(tokens): + feats = [] + gid, token, line_no, nextgid = _shift(tokens) + if gid != 16: # ] feature structure terminator + while True: + if gid != 24: # identifier (attribute name) + raise TDLSyntaxError('Expected a feature name', + lineno=line_no, text=token) + path = [token] + while nextgid == 10: # . dot + tokens.next() + gid, token, line_no, nextgid = _shift(tokens) + assert gid == 24 + path.append(token) + attr = '.'.join(path) + + conjunction, nextgid = _parse_tdl_conjunction(tokens) + feats.append((attr, conjunction)) + + if nextgid == 12: # , list delimiter + tokens.next() + gid, token, line_no, nextgid = _shift(tokens) + elif nextgid == 16: + gid, _, _, nextgid = _shift(tokens) + break + else: + raise TDLSyntaxError('expected: , or ]', + lineno=line_no) + + assert gid == 16 + + return feats, nextgid + + +def _parse_tdl_list(tokens, break_gid): + values = [] + end = None + nextgid = _peek(tokens)[0] + if nextgid == break_gid: + _, _, _, nextgid = _shift(tokens) + else: + while True: + if nextgid == 9: # ... ellipsis + _, _, _, nextgid = _shift(tokens) + end = LIST_TYPE + break + else: + term, nextgid = _parse_tdl_conjunction(tokens) + values.append(term) + + if nextgid == 10: # . dot + tokens.next() + end, nextgid = _parse_tdl_conjunction(tokens) + break + elif nextgid == break_gid: + break + elif nextgid == 12: # , comma delimiter + _, _, _, nextgid = _shift(tokens) + else: + raise TDLSyntaxError('expected: comma or end of list') + + gid, _, line_no, nextgid = _shift(tokens) + if gid != break_gid: + raise TDLSyntaxError('expected: end of list', + lineno=line_no) + + if len(values) == 0 and end is None: + end = EMPTY_LIST_TYPE + + return values, end, nextgid + + +def _parse_tdl_begin_environment(tokens): + gid, envtype, lineno = tokens.next() + if gid != 27: + raise TDLSyntaxError('expected: :type or :instance', + lineno=lineno, text=envtype) + gid, token, lineno = tokens.next() + if envtype == ':instance': + status = envtype[1:] + if token == ':status': + status = tokens.next()[1] + gid, token, lineno = tokens.next() + elif gid != 10: + raise TDLSyntaxError('expected: :status or .', + lineno=lineno) + env = InstanceEnvironment(status) + else: + env = TypeEnvironment() + if gid != 10: + raise TDLSyntaxError('expected: .', lineno=lineno, text=token) + return env + + +def _parse_tdl_end_environment(tokens, env): + _, envtype, lineno = tokens.next() + if envtype == ':type' and not isinstance(env, TypeEnvironment): + raise TDLSyntaxError('expected: :type', lineno=lineno, text=envtype) + elif envtype == ':instance' and not isinstance(env, InstanceEnvironment): + raise TDLSyntaxError('expected: :instance', + lineno=lineno, text=envtype) + gid, _, lineno = tokens.next() + if gid != 10: + raise TDLSyntaxError('expected: .', lineno=lineno) + return envtype + + +def _parse_tdl_include(tokens, basedir): + gid, value, lineno = tokens.next() + if gid != 4: + raise TDLSyntaxError('expected: a quoted filename', + lineno=lineno, text=value) + gid, _, lineno = tokens.next() + if gid != 10: + raise TDLSyntaxError('expected: .', lineno=lineno) + return FileInclude(value, basedir=basedir) From 2a2bcdda0bd5069d4bd7f26a192ef5bab9188189 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman <1428419+goodmami@users.noreply.github.com> Date: Mon, 7 Jul 2025 15:08:55 -0700 Subject: [PATCH 2/2] Add basic TDL config file parsing support Fixes #391 --- CHANGELOG.md | 6 ++++ delphin/tdl/__init__.py | 4 +++ delphin/tdl/_format.py | 26 +++++++++++++++-- delphin/tdl/_model.py | 33 +++++++++++++++++++-- delphin/tdl/_parse.py | 46 +++++++++++++++++++++++++---- tests/tdl_test.py | 64 ++++++++++++++++++++++++++++++++++++++++- 6 files changed, 168 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index db06f597..1c8d92a0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,11 @@ ### Unreleased +### Added + +* `delphin.tdl.ConfigEntry` ([#391]) +* `delphin.tdl.ConfigEnvironment` ([#391]) + ### Maintenance * Removed `requirements.txt`; it was unnecessary and out of date @@ -1701,6 +1706,7 @@ information about changes, except for [#379]: https://github.com/delph-in/pydelphin/issues/379 [#383]: https://github.com/delph-in/pydelphin/issues/383 [#386]: https://github.com/delph-in/pydelphin/issues/386 +[#391]: https://github.com/delph-in/pydelphin/issues/391 [#395]: https://github.com/delph-in/pydelphin/issues/395 [#396]: https://github.com/delph-in/pydelphin/issues/396 [#402]: https://github.com/delph-in/pydelphin/issues/402 diff --git a/delphin/tdl/__init__.py b/delphin/tdl/__init__.py index 903bed42..eb13fa60 100644 --- a/delphin/tdl/__init__.py +++ b/delphin/tdl/__init__.py @@ -7,6 +7,8 @@ 'LIST_TAIL', 'LIST_TYPE', 'BlockComment', + 'ConfigEntry', + 'ConfigEnvironment', 'Conjunction', 'ConsList', 'Coreference', @@ -48,6 +50,8 @@ LIST_TAIL, LIST_TYPE, BlockComment, + ConfigEntry, + ConfigEnvironment, Conjunction, ConsList, Coreference, diff --git a/delphin/tdl/_format.py b/delphin/tdl/_format.py index 3b8679ca..1af2f36e 100644 --- a/delphin/tdl/_format.py +++ b/delphin/tdl/_format.py @@ -1,9 +1,12 @@ +import re import textwrap from delphin.tdl._exceptions import TDLError from delphin.tdl._model import ( AVM, BlockComment, + ConfigEntry, + ConfigEnvironment, Conjunction, ConsList, Coreference, @@ -68,6 +71,8 @@ def format(obj, indent=0): return _format_linecomment(obj, indent) elif isinstance(obj, BlockComment): return _format_blockcomment(obj, indent) + elif isinstance(obj, ConfigEntry): + return _format_configentry(obj, indent) else: raise ValueError(f'cannot format object as TDL: {obj!r}') @@ -271,19 +276,34 @@ def _format_morphset(obj, indent): def _format_environment(env, indent): - status = '' + post = '' if isinstance(env, TypeEnvironment): envtype = ':type' elif isinstance(env, InstanceEnvironment): envtype = ':instance' if env.status: - status = ' :status ' + env.status + post = ' :status ' + env.status + elif isinstance(env, ConfigEnvironment): + envtype = ':config' + if env.label: + post = f' {env.label}' + else: + raise TDLError(f"invalid environment type: {type(env).__name__}") contents = '\n'.join(format(obj, indent + 2) for obj in env.entries) if contents: contents += '\n' return '{0}:begin {1}{2}.\n{3}{0}:end {1}.'.format( - ' ' * indent, envtype, status, contents) + ' ' * indent, envtype, post, contents) + + +def _format_configentry(obj: ConfigEntry, indent: int) -> str: + values: list[str] = [] + for value in obj.values: + if not re.fullmatch(r'''[^\s!"#$%&'(),.\/:;<=>[\]^|]+''', value): + value = '"' + value.replace('\\', '\\\\').replace('"', '\\"') + '"' + values.append(value) + return '{}{} := {}.'.format(' ' * indent, obj.key, " ".join(values)) def _format_include(fi, indent): diff --git a/delphin/tdl/_model.py b/delphin/tdl/_model.py index 0fc4f4bd..2d5258b0 100644 --- a/delphin/tdl/_model.py +++ b/delphin/tdl/_model.py @@ -1,6 +1,6 @@ from collections.abc import Mapping, Sequence from pathlib import Path -from typing import Optional, Union +from typing import NamedTuple, Optional, Union from delphin import util from delphin.tdl._exceptions import TDLError @@ -863,10 +863,39 @@ class InstanceEnvironment(_Environment): entries (list): TDL entries """ def __init__(self, status, entries=None): - super(InstanceEnvironment, self).__init__(entries) + super().__init__(entries) self.status = status +class ConfigEnvironment(_Environment): + """ + TDL configuration environment. + + Args: + entries (list): config entries + """ + def __init__(self, label: str = '', entries=None): + super().__init__(entries) + self.label = label + + +class ConfigEntry(NamedTuple): + """Key-value pair from a TDL config file. + + Since the type of the value (e.g., an atomic string or a list) + depends on the key, values are always read as a list. For + convenience, the :prop:`value` property returns these values + as a single string joined with space characters. + """ + + key: str + values: list[str] + + @property + def value(self) -> str: + return " ".join(self.values) + + class FileInclude: """ Include other TDL files in the current environment. diff --git a/delphin/tdl/_parse.py b/delphin/tdl/_parse.py index ff9888eb..412860f6 100644 --- a/delphin/tdl/_parse.py +++ b/delphin/tdl/_parse.py @@ -10,6 +10,8 @@ EMPTY_LIST_TYPE, LIST_TYPE, BlockComment, + ConfigEntry, + ConfigEnvironment, Conjunction, ConsList, Coreference, @@ -65,9 +67,9 @@ |\(([^ ]+\s+(?:[^ )\\]|\\.)+)\) # 22 affix subpattern |(\/) # 23 defaults (currently unused) |({identifier}) # 24 identifiers and symbols - |(:begin) # 25 start a :type or :instance block - |(:end) # 26 end a :type or :instance block - |(:type|:instance) # 27 environment type + |(:begin) # 25 start an environment block + |(:end) # 26 end an environment block + |(:type|:instance|:config) # 27 environment type |(:status) # 28 instance status |(:include) # 29 file inclusion |([^\s]) # 30 unexpected @@ -241,7 +243,11 @@ def _parse_tdl(tokens, path): obj = _parse_letterset(token, line_no) yield (obj.__class__.__name__, obj, line_no) elif gid == 24: - obj = _parse_tdl_definition(token, tokens) + # special handling for configs + if isinstance(environment, ConfigEnvironment): + obj = _parse_config_key_val(token, tokens) + else: + obj = _parse_tdl_definition(token, tokens) yield (obj.__class__.__name__, obj, line_no) elif gid == 25: envstack.append(environment) @@ -267,6 +273,26 @@ def _parse_tdl(tokens, path): raise TDLSyntaxError('unexpected end of input.') from None +def _parse_config_key_val(identifier, tokens) -> tuple[str, list[str]]: + _, token, line_no, _ = _shift(tokens) + + if token != ':=': + raise TDLSyntaxError('expected: :=', lineno=line_no) + gid, token, line_no, _ = _shift(tokens) + + values: list[str] = [] + while gid != 10: + if gid not in (4, 24): + raise TDLSyntaxError( + 'expected: a string or a symbol', + lineno=line_no, + ) + values.append(token) + gid, token, line_no, _ = _shift(tokens) + + return ConfigEntry(identifier, values) + + def _parse_tdl_definition(identifier, tokens): gid, token, line_no, nextgid = _shift(tokens) @@ -488,8 +514,16 @@ def _parse_tdl_begin_environment(tokens): raise TDLSyntaxError('expected: :status or .', lineno=lineno) env = InstanceEnvironment(status) - else: + elif envtype == ':type': env = TypeEnvironment() + elif envtype == ':config': + label = "" + if gid == 24: + label = token + gid, token, lineno = tokens.next() + env = ConfigEnvironment(label=label) + else: + raise TDLSyntaxError(f'unexpected environment type: {envtype}') if gid != 10: raise TDLSyntaxError('expected: .', lineno=lineno, text=token) return env @@ -502,6 +536,8 @@ def _parse_tdl_end_environment(tokens, env): elif envtype == ':instance' and not isinstance(env, InstanceEnvironment): raise TDLSyntaxError('expected: :instance', lineno=lineno, text=envtype) + elif envtype == ':config' and not isinstance(env, ConfigEnvironment): + raise TDLSyntaxError('expected: :config', lineno=lineno, text=envtype) gid, _, lineno = tokens.next() if gid != 10: raise TDLSyntaxError('expected: .', lineno=lineno) diff --git a/tests/tdl_test.py b/tests/tdl_test.py index 8af39796..432485b4 100644 --- a/tests/tdl_test.py +++ b/tests/tdl_test.py @@ -8,6 +8,8 @@ from delphin.tdl import ( AVM, BlockComment, + ConfigEntry, + ConfigEnvironment, Conjunction, ConsList, Coreference, @@ -743,7 +745,7 @@ def test_parse_blockcomment(): assert isinstance(bc, BlockComment) -def test_parse_environments(): +def test_parse_empty_environments(): g = _iterparse(':begin :type.\n' ':end :type.') event, e, _ = next(g) @@ -767,6 +769,8 @@ def test_parse_environments(): assert isinstance(e, InstanceEnvironment) assert e.entries == [] + +def test_parse_environments_with_contents(): g = _iterparse(':begin :type.\n' 'a := b & [ ATTR val ].\n' ':end :type.') @@ -796,6 +800,8 @@ def test_parse_environments(): assert e.entries[1].path.name == 'file2.tdl' assert e.entries[1].path.parent.name == 'subdir' + +def test_parse_nested_environment(): g = _iterparse(':begin :type.\n' ' :include "file1.tdl".\n' ' :begin :instance :status lex-rule.\n' @@ -818,6 +824,44 @@ def test_parse_environments(): assert isinstance(e2.entries[0], FileInclude) +def test_config_environment(): + g = _iterparse( + ':begin :config.\n' + ':end :config.' + ) + event, e, _ = next(g) + assert event == 'BeginEnvironment' + assert isinstance(e, ConfigEnvironment) + assert e.label == '' + assert e.entries == [] + assert next(g)[0] == 'EndEnvironment' + + g = _iterparse( + ':begin :config ace.\n' + ':end :config.' + ) + assert next(g)[1].label == 'ace' + + +def test_config_environment_contents(): + def keyval(s: str) -> ConfigEntry: + g = _iterparse(f':begin :config.\n{s}\n:end :config.') + event, e, _ = next(g) + assert event == 'BeginEnvironment' + while event != 'EndEnvironment': + event, _, _ = next(g) + assert len(e.entries) == 1 + return e.entries[0] + + assert keyval('symbol := abc.') == ('symbol', ['abc']) + assert keyval('string := "abc".') == ('string', ['abc']) + assert keyval('list := abc def.') == ('list', ['abc', 'def']) + assert keyval('list2 := "a b c" "d e f".') == ('list2', ['a b c', 'd e f']) + assert keyval('multiline :=\n abc\n def.') == ('multiline', ['abc', 'def']) + # assert keyval(';a comment') == ... + # assert keyval(':begin :type.\n:include "a.tdl".\n:end :type.') == ... + + def test_format_TypeTerms(): assert tdl.format(TypeIdentifier('a-type')) == 'a-type' assert tdl.format(String('a string')) == '"a string"' @@ -977,6 +1021,24 @@ def test_format_environments(): ':end :type.') +def test_format_config_environment(): + e = ConfigEnvironment( + label="test", + entries=[ + LineComment(' a comment'), + ConfigEntry('key', ['value']), + ConfigEntry('key-two', ['a space', 'nospace']), + ] + ) + assert tdl.format(e) == ( + ':begin :config test.\n' + ' ; a comment\n' + ' key := value.\n' + ' key-two := "a space" nospace.\n' + ':end :config.' + ) + + def test_format_fileinclude(): assert tdl.format(FileInclude('foo.tdl')) == ':include "foo.tdl".'