diff --git a/.github/workflows/mathics.yml b/.github/workflows/mathics.yml
index 899075c6..17c13e6e 100644
--- a/.github/workflows/mathics.yml
+++ b/.github/workflows/mathics.yml
@@ -11,7 +11,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ['3.11']
+        python-version: ['3.12']
     steps:
     - uses: actions/checkout@v4
     - name: Set up Python ${{ matrix.python-version }}
@@ -33,7 +33,7 @@ jobs:
         git clone --depth 1 https://github.com/Mathics3/mathics-scanner.git
         (cd mathics-scanner && pip install -e .)
         # Until next Mathics3/mathics-core release is out...
-        git clone --depth 1 https://github.com/Mathics3/mathics-core.git
+        git clone --depth 1 --branch revise-escape-sequence-scanning https://github.com/Mathics3/mathics-core.git
         cd mathics-core/
         make PIP_INSTALL_OPTS='[full]' # pip install Mathics3[full]
diff --git a/docs/source/api.rst b/docs/source/api.rst
index 7368a54b..cb02ec89 100644
--- a/docs/source/api.rst
+++ b/docs/source/api.rst
@@ -10,7 +10,7 @@ Tokenization

 Tokenization is performed by the ``Tokeniser`` class. The ``next`` method
 consumes characters from a feeder and returns a token if the tokenization
-succeeds. If the tokenization fails an instance of ``TranslateError`` is
+succeeds. If the tokenization fails, an instance of ``SyntaxError`` is
 raised.

 .. autoclass:: Tokeniser(object)
diff --git a/mathics_scanner/__init__.py b/mathics_scanner/__init__.py
index 5d9def81..ecfec276 100644
--- a/mathics_scanner/__init__.py
+++ b/mathics_scanner/__init__.py
@@ -15,9 +15,7 @@
 from mathics_scanner.errors import (
     IncompleteSyntaxError,
     InvalidSyntaxError,
-    ScanError,
-    TranslateError,
-    TranslateErrorNew,
+    SyntaxError,
 )
 from mathics_scanner.feed import (
     FileLineFeeder,
@@ -36,12 +34,10 @@
     "InvalidSyntaxError",
     "LineFeeder",
     "MultiLineFeeder",
-    "ScanError",
+    "SyntaxError",
     "SingleLineFeeder",
     # "Token",
     # "Tokeniser",
-    "TranslateError",
-    "TranslateErrorNew",
     "__version__",
     "aliased_characters",
     # "is_symbol_name",
diff --git a/mathics_scanner/data/named-characters.yml b/mathics_scanner/data/named-characters.yml
index bcca13c2..e94d6c02 100644
--- a/mathics_scanner/data/named-characters.yml
+++ b/mathics_scanner/data/named-characters.yml
@@ -68,7 +68,7 @@
 # the named character. If it is the same as unicode-equivalent
 # it should be omitted
 #
-# wl-unicode-name: The name of the character corresponding to `wl-unicode`, if it exists. If it is the same as unicode-equivalent-name it can be omitted.
+# wl-unicode-name: The name of the character corresponding to `wl-unicode`, if it exists.
 # It will mentioned in Wolfram Language docs if it exists.
 #
 # Sources:
@@ -6628,6 +6628,16 @@ LightBulb:
   wl-reference: https://reference.wolfram.com/language/ref/character/LightBulb.html
   wl-unicode: "\uF723"

+LineSeparator:
+  has-unicode-inverse: false
+  is-letter-like: false
+  unicode-equivalent: "\u2028"
+  unicode-equivalent-name: LINE SEPARATOR
+  unicode-reference: https://www.compart.com/en/unicode/U+2028
+  wl-reference: https://reference.wolfram.com/language/ref/character/LineSeparator.html
+  wl-unicode: "\u2028"
+  wl-unicode-name: LINE SEPARATOR
+
 LongDash:
   esc-alias: --
   has-unicode-inverse: false
diff --git a/mathics_scanner/errors.py b/mathics_scanner/errors.py
index 212a82e3..98b9c169 100644
--- a/mathics_scanner/errors.py
+++ b/mathics_scanner/errors.py
@@ -1,7 +1,13 @@
 # -*- coding: utf-8 -*-


-class TranslateErrorNew(Exception):
+class SyntaxError(Exception):
+    """Some sort of error in the scanning or tokenization phase of parsing Mathics3 input.
+
+    There are more specific kinds of exceptions subclassed from this
+    exception class.
+    """
+
     def __init__(self, tag: str, *args):
         super().__init__()
         self.name = "Syntax"
         self.args = args
@@ -9,38 +15,25 @@ def __init__(self, tag: str, *args):
         self.args = args


-class TranslateError(Exception):
-    """
-    A generic class of tokenization errors. This exception is subclassed by other
-    tokenization errors
-    """
-
-
-class EscapeSyntaxError(TranslateErrorNew):
+class EscapeSyntaxError(SyntaxError):
     """Escape sequence syntax error"""

     pass


-class IncompleteSyntaxError(TranslateErrorNew):
+class IncompleteSyntaxError(SyntaxError):
     """More characters were expected to form a valid token"""

     pass


-class InvalidSyntaxError(TranslateErrorNew):
+class InvalidSyntaxError(SyntaxError):
     """Invalid syntax"""

     pass


-class NamedCharacterSyntaxError(TranslateError):
+class NamedCharacterSyntaxError(EscapeSyntaxError):
     """Named character syntax error"""

     pass
-
-
-class ScanError(TranslateErrorNew):
-    """A generic scanning error"""
-
-    pass
diff --git a/mathics_scanner/escape_sequences.py b/mathics_scanner/escape_sequences.py
new file mode 100644
index 00000000..633b9ec9
--- /dev/null
+++ b/mathics_scanner/escape_sequences.py
@@ -0,0 +1,148 @@
+"""
+Helper module for tokenizing character escape sequences.
+"""
+
+from typing import Optional, Tuple
+
+from mathics_scanner.characters import named_characters
+from mathics_scanner.errors import (
+    EscapeSyntaxError,
+    NamedCharacterSyntaxError,
+    SyntaxError,
+)
+
+
+def parse_base(source_text: str, start_shift: int, end_shift: int, base: int) -> str:
+    r"""
+    See if characters start_shift .. end_shift
+    can be converted to an integer in base ``base``.
+
+    If so, chr(integer value converted from base) is returned.
+
+    However, if the conversion fails, SyntaxError is raised.
+    """
+    last = end_shift - start_shift
+    if last == 2:
+        tag = "sntoct2"
+    elif last == 3:
+        assert base == 8, "Only octal requires 3 digits"
+        tag = "sntoct1"
+    elif last in (4, 6):
+        tag = "snthex"
+    else:
+        raise ValueError()
+
+    if end_shift > len(source_text):
+        raise SyntaxError(tag)
+
+    assert start_shift <= end_shift
+    text = source_text[start_shift:end_shift]
+    try:
+        result = int(text, base)
+    except ValueError:
+        raise SyntaxError(tag, source_text[start_shift:].rstrip("\n"))
+
+    return chr(result)
+
+
+def parse_named_character(source_text: str, start: int, finish: int) -> Optional[str]:
+    r"""
+    Find the unicode-equivalent symbol for a string named character.
+
+    Before calling we have matched the text between "\[" and "]" of the input.
+
+    The named character is thus in source_text[start:finish].
+
+    Match this string with the known named characters,
+    e.g. "Theta". If we can match this, then we return the unicode equivalent from the
+    `named_characters` map (which is read in from JSON but stored in a YAML file).
+
+    If we can't find the named character, raise NamedCharacterSyntaxError.
+    """
+    named_character = source_text[start:finish]
+    if named_character.isalpha():
+        char = named_characters.get(named_character)
+        if char is None:
+            raise NamedCharacterSyntaxError("sntufn", named_character)
+        else:
+            return char
+
+
+def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]:
+    """Given some source text in `source_text` starting at offset
+    `pos`, return the escape-sequence value for this text and the
+    follow-on offset position.
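+
+    For example (an illustrative call, mirroring test/test_escape_sequences.py;
+    ``pos`` indexes the character just after the backslash):
+    parse_escape_sequence(r"a\[Theta]b", 2) returns ("θ", 9).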
+ """ + result = "" + c = source_text[pos] + if c == "\\": + return "\\", pos + 1 + + # https://www.wolfram.com/language/12/networking-and-system-operations/use-the-full-range-of-unicode-characters.html + # describes hex encoding. + if c == ".": + # see if we have a 2-digit hexadecimal number. + # for example, \.42 is "b" + result += parse_base(source_text, pos + 1, pos + 3, 16) + pos += 3 + elif c == ":": + # see if we have a 4-digit hexadecimal number. + # for example, \:03b8" is unicode small leter theta: θ. + result += parse_base(source_text, pos + 1, pos + 5, 16) + pos += 5 + elif c == "|": + # see if we have a 6-digit hexadecimal number. + result += parse_base(source_text, pos + 1, pos + 7, 16) + pos += 7 + elif c == "[": + pos += 1 + i = pos + 1 + while i < len(source_text): + if source_text[i] == "]": + break + i += 1 + if i == len(source_text): + # Note: named characters do not have \n's in them. (Is this right)? + # FIXME: decide what to do here. + raise NamedCharacterSyntaxError("Syntax", "sntufn", source_text[pos:]) + + named_character = parse_named_character(source_text, pos, i) + if named_character is None: + raise NamedCharacterSyntaxError("Syntax", "sntufn", source_text[pos:i]) + + result += named_character + pos = i + 1 + elif c in "01234567": + # See if we have a 3-digit octal number. + # For example \065 = "5" + result += parse_base(source_text, pos, pos + 3, 8) + pos += 3 + + # WMA escape characters \n, \t, \b, \r. + # Note that these are a similer to Python, but are different. + # In particular, Python defines "\a" to be ^G (control G), + # but in WMA, this is invalid. + elif c in "ntbfr $\n": + if c in "n\n": + result += "\n" + elif c == " ": + result += " " + elif c == "t": + result += "\t" + elif c == "b": + result += "\b" + elif c == "f": + result += "\f" + elif c in '$"': + # I don't know why \$ is defined, but it is! + result += rf"\{c}" + else: + assert c == "r" + result += "\r" + pos += 1 + elif c in '!"': + result += c + pos += 1 + else: + raise EscapeSyntaxError("stresc", rf"\{c}") + return result, pos diff --git a/mathics_scanner/feed.py b/mathics_scanner/feed.py index 8ae0717f..9674ae53 100644 --- a/mathics_scanner/feed.py +++ b/mathics_scanner/feed.py @@ -130,14 +130,14 @@ def empty(self) -> bool: class SingleLineFeeder(LineFeeder): "A feeder that feeds all the code as a single line." - def __init__(self, code: str, filename=""): + def __init__(self, source_text: str, filename=""): """ :param code: The source of the feeder (a string). :param filename: A string that describes the source of the feeder, i.e. the filename that is being feed. 
""" super().__init__(filename) - self.code = code + self.source_text = source_text self._empty = False def feed(self) -> str: @@ -145,7 +145,7 @@ def feed(self) -> str: return "" self._empty = True self.lineno += 1 - return self.code + return self.source_text def empty(self) -> bool: return self._empty diff --git a/mathics_scanner/mathics3_tokens.py b/mathics_scanner/mathics3_tokens.py index db6a163c..d6b0467a 100644 --- a/mathics_scanner/mathics3_tokens.py +++ b/mathics_scanner/mathics3_tokens.py @@ -10,7 +10,7 @@ from mathics_scanner.errors import ( EscapeSyntaxError, NamedCharacterSyntaxError, - ScanError, + SyntaxError, ) from mathics_scanner.feed import FileLineFeeder, LineFeeder, SingleLineFeeder from mathics_scanner.tokeniser import Tokeniser @@ -162,25 +162,30 @@ def interactive_eval_loop(shell: TerminalShell, code_tokenize_format: bool): try: source_text = shell.feed() tokens(source_text, code_tokenize_format) - except ScanError: - shell.errmsg( - "Syntax", - "sntxi", - "Expression error", - ) - pass except NamedCharacterSyntaxError: shell.errmsg( "Syntax", "sntufn", "Unknown unicode longname", ) + # This has to come after NamedCharacterSyntaxError + # since that is a subclass EscapeSyntaxError except EscapeSyntaxError: shell.errmsg( "Syntax", "sntufn", "Unknown unicode longname", ) + # This has to come after NamedCharacterSyntaxError and + # EscapeSyntaxError since those are subclasses of + # SyntaxError + except SyntaxError: + shell.errmsg( + "Syntax", + "sntxi", + "Expression error", + ) + pass except KeyboardInterrupt: print("\nKeyboardInterrupt. Type Ctrl-D (EOF) to exit.") except EOFError: @@ -199,7 +204,7 @@ def tokens(code, code_tokenize_format: bool): while True: try: token = tokeniser.next() - except ScanError as scan_error: + except SyntaxError as scan_error: mess = "" if scan_error.tag == "sntoct1": mess = r"3 octal digits are required after \ to construct an 8-bit character" diff --git a/mathics_scanner/prescanner.py b/mathics_scanner/prescanner.py deleted file mode 100644 index 08f56346..00000000 --- a/mathics_scanner/prescanner.py +++ /dev/null @@ -1,206 +0,0 @@ -# -*- coding: utf-8 -*- - -# Note: this module will be remove or rewritten drastically in the near future. -""" -Module for "prescanning". Right now this just means replacing -character escape sequences. -""" - -from typing import List - -from mathics_scanner.characters import named_characters -from mathics_scanner.errors import IncompleteSyntaxError, ScanError -from mathics_scanner.feed import LineFeeder - - -class Prescanner(object): - r""" - A Class for converting escape sequences: - Character codes to characters: - \.7A -> z - \.004a -> J - \:004a -> J - \|01D451 -> \U0001D451 - \041 -> ! - Named Characters to Unicode: - \[Theta] -> \u03B8 - ASCII escape sequence: - \n -> literal \n - - Trailing backslash characters (\) are reported incomplete. - """ - - def __init__(self, feeder: LineFeeder): - # self.feeder is a function that returns the next line of the Mathics input - self.feeder = feeder - - # self.input_line is the result of reading the next Mathics input line - self.input_line: str = feeder.feed() - - # self.pos is current position within self.input_line. - self.pos = 0 - - def feed(self) -> str: - """ - Return the next line of Mathics input - """ - return self.feeder.feed() - - def get_more_input(self): - "Get another source-text line from input and continue." 
-
-        line: str = self.feed()
-        if not line:
-            text = self.input_line[self.pos :].rstrip()
-            self.feeder.message("Syntax", "sntxi", text)
-            raise IncompleteSyntaxError("Syntax", "sntxi", text)
-        self.input_line += line
-
-    def replace_escape_sequences(self) -> str:
-        """
-        Replace escape sequences in ``self.input_line``. The replacement string is returned.
-        Note: ``self.input_line`` is not modified.
-        """
-
-        # Line fragments to be joined before returning from this method.
-        line_fragments: List[str] = []
-
-        # Fragment start position of line fragment under consideration.
-        self.fragment_start = self.pos
-
-        def start_new_fragment(pos: int) -> None:
-            """
-            Update position markers to start a new line fragment at ``pos``.
-            """
-            self.pos = pos
-            self.fragment_start = pos
-
-        def try_parse_base(start_shift: int, end_shift: int, base: int) -> None:
-            r"""
-            See if characters self.pos+start_shift .. self.pos+end shift
-            can be converted to an integer in base ``base``.
-
-            If so, we append the characters before the escape sequence without the
-            escaping characters like ``\.`` or ``\:``.
-
-            We also append the converted integer to ``line_fragments``, and update
-            position cursors for a new line fragment.
-
-            However, if the conversion fails, then error messages are
-            issued and nothing is updated
-            """
-            start, end = self.pos + start_shift, self.pos + end_shift
-            result = None
-            if end <= len(self.input_line):
-                text = self.input_line[start:end]
-                try:
-                    result = int(text, base)
-                except ValueError:
-                    pass  # result remains None
-            if result is None:
-                last = end - start
-                if last == 2:
-                    self.feeder.message("Syntax", "sntoct2")
-                elif last == 3:
-                    self.feeder.message("Syntax", "sntoct1")
-                elif last == 4:
-                    self.feeder.message("Syntax", "snthex")
-                else:
-                    raise ValueError()
-                self.feeder.message(
-                    "Syntax", "sntxb", self.input_line[self.pos :].rstrip("\n")
-                )
-                raise ScanError("Syntax", "sntxb")
-
-            # Add text from prior line fragment as well
-            # as the escape sequence, a character, from the escape sequence
-            # that was just matched.
-            line_fragments.append(self.input_line[start : self.pos])
-            line_fragments.append(chr(result))
-
-            # Set up a new line fragment for the next time we are called.
-            start_new_fragment(end)
-
-        def try_parse_named_character(start_shift: int):
-            r"""Before calling we have matched "\[". Scan to the remaining "]" and
-            try to match what is found in-between with a known named
-            character, e.g. "Theta". If we can match this, we store
-            the unicode character equivalent in ``line_fragments``.
-            If we can't find a named character, error messages are
-            issued and we leave ``line_fragments`` untouched.
-            """
-            i = self.pos + start_shift
-            while True:
-                if i == len(self.input_line):
-                    self.get_more_input()
-                if self.input_line[i] == "]":
-                    break
-                i += 1
-
-            named_character = self.input_line[self.pos + start_shift : i]
-            if named_character.isalpha():
-                char = named_characters.get(named_character)
-                if char is None:
-                    self.feeder.message("Syntax", "sntufn", named_character)
-                    # stay in same line fragment
-                else:
-                    # Add text from prior line fragment as well
-                    # as the escape sequence, a character, from the escape sequence
-                    # just matched.
-                    line_fragments.append(
-                        self.input_line[self.fragment_start : self.pos]
-                    )
-                    line_fragments.append(char)
-                    start_new_fragment(i + 1)
-
-            # Stay in same line fragment, but advance the cursor position.
-            self.pos = i + 1
-
-        # In the following loop, we look for and replace escape
-        # sequences. The current character under consideration is at
-        # self.code[self.pos]. When an escape sequence is found at
-        # that position, the previous line_fragment is extracted and
-        # stored in ``line_fragments``. The start-position marker for the
-        # next line_fragment is started and self.pos is updated.
-
-        while self.pos < len(self.input_line):
-            if self.input_line[self.pos] == "\\":
-                # Look for and handle an escape sequence.
-                if self.pos + 1 == len(self.input_line):
-                    self.get_more_input()
-                c = self.input_line[self.pos + 1]
-                if c == "|":
-                    try_parse_base(2, 8, 16)
-                if c == ".":
-                    # See if we have a two-digit hexadecimal number.
-                    try_parse_base(2, 4, 16)
-                elif c == ":":
-                    # See if we have a four-digit hexadecimal number.
-                    try_parse_base(2, 6, 16)
-                elif c == "[":
-                    try_parse_named_character(2)
-                elif c in "01234567":
-                    # See if we have an octal number.
-                    try_parse_base(1, 4, 8)
-                elif c == "\n":
-                    if self.pos + 2 == len(self.input_line):
-                        self.get_more_input()
-                    line_fragments.append(
-                        self.input_line[self.fragment_start : self.pos]
-                    )
-                    start_new_fragment(self.pos + 2)
-                else:
-                    # Two backslashes in succession indicates a single
-                    # backslash character. Advance the scanning
-                    # cursor (self.pos) over both backslashes. Also,
-                    # Python's backslash escape mechanism turns the
-                    # two backslashes into one in length calculations.
-                    self.pos += 2
-            else:
-                self.pos += 1
-
-        # Add the final line fragment.
-        line_fragments.append(self.input_line[self.fragment_start :])
-
-        # Produce and return the input line with escape-sequences replaced
-        return "".join(line_fragments)
diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py
index 06ade1fc..1698ad4c 100644
--- a/mathics_scanner/tokeniser.py
+++ b/mathics_scanner/tokeniser.py
@@ -5,31 +5,62 @@
 See classes `Token` and `Tokeniser` .
 """

+import itertools
 import os.path as osp
 import re
 import string
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional, Set, Tuple

 from mathics_scanner.characters import _letterlikes, _letters
-from mathics_scanner.errors import IncompleteSyntaxError, ScanError
-from mathics_scanner.prescanner import Prescanner
+from mathics_scanner.errors import (
+    EscapeSyntaxError,
+    IncompleteSyntaxError,
+    InvalidSyntaxError,
+    NamedCharacterSyntaxError,
+    SyntaxError,
+)
+from mathics_scanner.escape_sequences import parse_escape_sequence

 try:
     import ujson
 except ImportError:
     import json as ujson  # type: ignore[no-redef]

-OPERATOR_DATA = {}
+# Where we get operator data...
 ROOT_DIR = osp.dirname(__file__)
 OPERATORS_TABLE_PATH = osp.join(ROOT_DIR, "data", "operators.json")
+
+##############################################
+# The below get initialized by init_module()
+# from operator data
+##############################################
+OPERATOR_DATA = {}
 NO_MEANING_OPERATORS = {}

+# The final characters of "box-operators" values.
+# This is used in t_String for escape-sequence handling.
+# The below is roughly correct, but we overwrite this
+# from operators.json data in init_module().
+BOXING_CONSTRUCT_SUFFIXES: Set[str] = {
+    "%",
+    "/",
+    "@",
+    "+",
+    "_",
+    "&",
+    "!",
+    "^",
+    "`",
+    "*",
+    "(",
+    ")",
+}
+
 FILENAME_TOKENS: List = []
 TOKENS: List[Tuple] = []
 TOKEN_INDICES: Dict = {}
-
+##############################################

 # special patterns
 NUMBER_PATTERN = r"""
 ( (?# Two possible forms depending on whether base is specified)
@@ -56,7 +87,7 @@
 #
 # This could still be done, but it would need to be integrated more
 # properly into the tokenization phase which takes into account
-# differents states or "modes" indicating the interior of comments,
+# different states or "modes" indicating the interior of comments,
 # strings, files, and Box-like constructs.

 # The leading character of a Symbol:
@@ -166,6 +197,17 @@ def init_module():
     with open(osp.join(OPERATORS_TABLE_PATH), "r", encoding="utf8") as operator_f:
         OPERATOR_DATA.update(ujson.load(operator_f))

+    global BOXING_CONSTRUCT_SUFFIXES
+
+    BOXING_CONSTRUCT_SUFFIXES = set(
+        [
+            op_str[-1]
+            for op_str in itertools.chain.from_iterable(
+                OPERATOR_DATA["box-operators"].values()
+            )
+        ]
+    ) | set(["*", ")", "("])
+
     global NO_MEANING_OPERATORS
     NO_MEANING_OPERATORS = (
         set(OPERATOR_DATA["no-meaning-infix-operators"].keys())
@@ -453,7 +495,9 @@ def is_symbol_name(text: str) -> bool:
 class Token:
     """A representation of a Wolfram-Language token.

-    Tokens are parsed by parser uses to build M-expressions.
+    A Token is the next level of parsing abstraction above a raw
+    Mathics3 input string. A sequence of tokens is the input for the
+    Mathics3 parser.

     A token has a `tag`, the class or type of the token. For example:
     a Number, Symbol, String, File, etc.
@@ -461,16 +505,10 @@ class Token:
     The token's `text` is the string contents of the token.

     The token's `pos` is the integer starting offset where
-    `text` can be found inside the input string. The input string
-    is not part of the token though.
+    `text` can be found inside the full input string.
     """

     def __init__(self, tag: str, text: str, pos: int):
-        """
-        :param tag: which type of token this is.
-        :param text: The actual contents of the token.
-        :param pos: The position of the token in the input feed.
-        """
         self.tag = tag
         self.text = text
         self.pos = pos
@@ -517,8 +555,8 @@ def __init__(self, feeder):
         )
         self.pos: int = 0
         self.feeder = feeder
-        self.prescanner = Prescanner(feeder)
-        self.source_text = self.prescanner.replace_escape_sequences()
+        self.source_text = self.feeder.feed()
+
         self.mode: str = "invalid"

         # Set to True when inside box parsing.
@@ -558,19 +596,29 @@ def is_inside_box(self) -> bool:
     def is_inside_box(self, value: bool) -> None:
         self._is_inside_box = value

-    def sntx_message(self, pos: Optional[int] = None) -> Tuple[str, str, str]:
-        """
-        Send a "sntx{b,f} error message to the input-reading feeder.
+    def sntx_message(self, start_pos: Optional[int] = None) -> Tuple[str, int, int]:
+        """Send a "sntx{b,f}" error message to the input-reading
+        feeder.
+
+        The tag ("sntxb" or "sntxf"), the position of the error, and the
+        position of the end of the blank-stripped line are returned.
""" - if pos is None: - pos = self.pos - pre, post = self.source_text[:pos], self.source_text[pos:].rstrip("\n") - if pos == 0: - self.feeder.message("Syntax", "sntxb", post) - return "sntxb", "", post + if start_pos is None: + start_pos = self.pos + trailing_fragment = self.source_text[start_pos:].strip() + end_pos = start_pos + len(trailing_fragment) + if start_pos == 0: + self.feeder.message("Syntax", "sntxb", trailing_fragment) + tag = "sntxb" else: - self.feeder.message("Syntax", "sntxf", pre, post) - return "sntxf", pre, post + self.feeder.message( + "Syntax", + "sntxf", + self.source_text[:start_pos].strip(), + trailing_fragment, + ) + tag = "syntx" + return tag, start_pos, end_pos # TODO: If this is converted this to __next__, then # a tokeniser object is iterable. @@ -578,7 +626,8 @@ def next(self) -> Token: "Returns the next token from self.source_text." self._skip_blank() source_text = self.source_text - if self.pos >= len(self.source_text): + + if self.pos >= len(source_text): return Token("END", "", len(source_text)) # Look for a matching pattern. @@ -600,7 +649,7 @@ def next(self) -> Token: # No matching pattern found. if pattern_match is None: tag, pre_str, post_str = self.sntx_message() - raise ScanError(tag, pre_str, post_str) + raise SyntaxError(tag, pre_str, post_str) # Look for custom tokenization rules; those are defined with t_tag. override = getattr(self, "t_" + tag, None) @@ -611,6 +660,60 @@ def next(self) -> Token: # pattern match. text = pattern_match.group(0) self.pos = pattern_match.end(0) + + # The below similar to what we do in t_RawBackslash, but is is + # different. First, we need to look for a closing quote + # ("). Also, after parsing escape sequences, we can + # unconditionallhy add them on to the string. That is, we + # don't have to check whether the returned string can be valid + # in a Symbol name. + + if tag == "Symbol": + # We have to keep searching for the end of the Symbol if + # the next symbol is a backslash, "\", because it might be a + # named-letterlike character such as \[Mu] or a escape representation of number or + # character. + # abc\[Mu] is a valid 4-character Symbol. And we can have things like + # abc\[Mu]\[Mu]def\[Mu]1 + while True: + if self.pos >= len(source_text): + break + + # Try to extend symbol with non-escaped alphanumeric + # (and letterlike) symbols. + + # TODO: Do we need to add context breaks? And if so, + # do we need to check for consecutive ``'s? + alphanumeric_match = re.match( + f"[0-9${symbol_first_letter}]+", self.source_text[self.pos :] + ) + if alphanumeric_match is not None: + extension_str = alphanumeric_match.group(0) + text += extension_str + self.pos += len(extension_str) + + if source_text[self.pos] != "\\": + break + + try: + escape_str, next_pos = parse_escape_sequence( + self.source_text, self.pos + 1 + ) + except (EscapeSyntaxError, NamedCharacterSyntaxError) as escape_error: + if self.is_inside_box: + # Follow-on symbol may be a escape character that can + # appear only in box constructs, e.g. \%. 
+                        break
+                    self.feeder.message(
+                        escape_error.name, escape_error.tag, *escape_error.args
+                    )
+                    raise
+                if escape_str in _letterlikes:
+                    text += escape_str
+                    self.pos = next_pos
+                else:
+                    break
+
         return Token(tag, text, pattern_match.start(0))

     def _skip_blank(self):
@@ -619,23 +722,7 @@ def _skip_blank(self):
         while True:
             if self.pos >= len(self.source_text):
                 if comment:
-                    try:
-                        self.get_more_input()
-                    except ValueError:
-                        # `get_more_input` tries to parse substrings like `\|AAAAA`
-                        # that can be interpreted as a character reference.
-                        # To do that, it tries to get the
-                        # new line using the method
-                        # `Prescanner.replace_escape_sequences()`
-                        # Inside a comment, the special meaning of escape sequences
-                        # like `\|` should not be taken into account.
-                        #
-                        # In case of error, just let's pick the code
-                        # from the `input_line` attribute of
-                        # prescanner:
-                        self.source_text = self.prescanner.input_line
-                        # TODO: handle the corner case where the rest of the line
-                        # include escaped sequences, out of the comment.
+                    self.get_more_input()
                 else:
                     break
             if comment:
@@ -695,39 +782,187 @@ def t_PutAppend(self, pattern_match: re.Match) -> Token:
         "Scan for a ``PutAppend`` token and return that"
         return self._token_mode(pattern_match, "PutAppend", "filename")

-    def t_String(self, _: re.Match) -> Token:
+    def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token:
+        r"""Break out from ``pattern_match`` a token which starts with a backslash, '\'."""
+        source_text = self.source_text
+        start_pos = self.pos + 1
+        named_character = ""
+        if start_pos == len(source_text):
+            # We have reached the end of the input line before seeing the
+            # termination of the backslash escape. Fetch another line.
+            self.get_more_input()
+            self.pos += 1
+            source_text += self.source_text
+
+        try:
+            escape_str, self.pos = parse_escape_sequence(source_text, start_pos)
+            if source_text[start_pos] == "[" and source_text[self.pos - 1] == "]":
+                named_character = source_text[start_pos + 1 : self.pos - 1]
+        except (EscapeSyntaxError, NamedCharacterSyntaxError) as escape_error:
+            self.feeder.message(escape_error.name, escape_error.tag, *escape_error.args)
+            raise
+
+        # Is there a way to DRY with next()?
+        if named_character != "":
+            if named_character in NO_MEANING_OPERATORS:
+                return Token(named_character, escape_str, start_pos - 1)
+
+        # Look for a pattern matching the replaced escape sequence.
+
+        indices = self.token_indices.get(escape_str[0], ())
+        pattern_match = None
+        tag = "??invalid"
+        if indices:
+            for index in indices:
+                tag, pattern = self.tokens[index]
+                pattern_match = pattern.match(escape_str, 0)
+                if pattern_match is not None:
+                    break
+        else:
+            for tag, pattern in self.tokens:
+                pattern_match = pattern.match(escape_str, 0)
+                if pattern_match is not None:
+                    break
+
+        # No matching pattern found.
+        if pattern_match is None:
+            tag, pre, post = self.sntx_message()
+            raise SyntaxError(tag, pre, post)
+
+        text = pattern_match.group(0)
+        start_pos = pattern_match.start(0)
+
+        # Is there a way to DRY with t_String?
+        # See t_String for differences.
+
+        if tag == "Symbol":
+            # FIXME: DRY with code in next()
+            # We have to keep searching for the end of the Symbol
+            # after an escaped letterlike symbol. For example, \[Mu]
+            # is a valid Symbol. But we can also have symbols like
+            # \[Mu]\[Theta], \[Mu]1, \[Mu]1a, \[Mu]\.42, \[Mu]\061, or \[Mu]\061abc
+            while True:
+                if self.pos >= len(source_text):
+                    break
+
+                # Try to extend symbol with non-escaped alphanumeric
+                # (and letterlike) symbols.
+
+                # TODO: Do we need to add context breaks? And if so,
+                # do we need to check for consecutive ``'s?
+                alphanumeric_match = re.match(
+                    f"[0-9${symbol_first_letter}]+", source_text[self.pos :]
+                )
+                if alphanumeric_match is not None:
+                    extension_str = alphanumeric_match.group(0)
+                    text += extension_str
+                    self.pos += len(extension_str)
+
+                if self.pos >= len(source_text) or source_text[self.pos] != "\\":
+                    break
+
+                try:
+                    escape_str, next_pos = parse_escape_sequence(
+                        self.source_text, self.pos + 1
+                    )
+                except (EscapeSyntaxError, NamedCharacterSyntaxError) as escape_error:
+                    if self.is_inside_box:
+                        # The follow-on character may be an escape character
+                        # that can appear only in box constructs, e.g. \%.
+                        break
+                    self.feeder.message(
+                        escape_error.name, escape_error.tag, *escape_error.args
+                    )
+                    raise
+                if re.match(interior_symbol_pattern, escape_str):
+                    text += escape_str
+                    self.pos = next_pos
+                else:
+                    break
+
+        elif tag == "String":
+            self.feeder.message("Syntax", "sntxi", text)
+            raise InvalidSyntaxError("Syntax", "sntxi", text)
+
+        return Token(tag, text, start_pos)
+
+    def t_String(self, _: Optional[re.Match]) -> Token:
         """Break out from self.source_text the next token which is
         expected to be a String. The string value of the returned
         token will have double quote (") in the first and last
-        postions of the returned string.
+        positions of the returned string.
         """
         start, end = self.pos, None
         self.pos += 1  # skip opening '"'
         newlines = []
+        source_text = self.source_text
+        result = ""
+
+        # The below is similar to what we do in t_RawBackslash, but it
+        # is different. First, we need to look for a closing quote
+        # ("). Also, after parsing escape sequences, we can
+        # unconditionally add them on to the string. That is, we
+        # don't have to check whether the returned string can be valid
+        # in a Symbol name or as a boxing construct.
+
         while True:
-            if self.pos >= len(self.source_text):
+            if self.pos >= len(source_text):
                 if end is None:
                     # reached end while still inside string
                     self.get_more_input()
                     newlines.append(self.pos)
+                    source_text = self.source_text
                 else:
                     break
-            char = self.source_text[self.pos]
+            char = source_text[self.pos]
             if char == '"':
                 self.pos += 1
                 end = self.pos
                 break
             if char == "\\":
-                self.pos += 2
+                if self.pos + 1 == len(source_text):
+                    # We have reached the end of the input line before seeing
+                    # a terminating quote ("). Fetch another line.
+                    self.get_more_input()
+                    source_text = self.source_text
+                self.pos += 1
+                try:
+                    escape_str, self.pos = parse_escape_sequence(source_text, self.pos)
+                except NamedCharacterSyntaxError as escape_error:
+                    self.feeder.message(
+                        escape_error.name, escape_error.tag, *escape_error.args
+                    )
+                    raise
+
+                # This has to come after NamedCharacterSyntaxError since
+                # that is a subclass of this.
+                except EscapeSyntaxError as escape_error:
+                    escaped_char = self.source_text[self.pos]
+                    if escaped_char in BOXING_CONSTRUCT_SUFFIXES:
+                        # If there is a boxing construct matched, we
+                        # preserve what was given, but do not tokenize
+                        # the construct: "\(" remains "\(" and is not
+                        # turned into an InterpretBox.
+                        result += "\\" + escaped_char
+                        self.pos += 1
+                    else:
+                        # Not something that can be a boxing construct.
+                        # So here, we'll report an error as we do with
+                        # NamedCharacterSyntaxError.
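+                        # For example, "\g" inside a string is not a boxing
+                        # suffix, so it ends up here and is reported (see
+                        # test_string_tokens.py).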
+                        self.feeder.message(
+                            escape_error.name, escape_error.tag, *escape_error.args
+                        )
+                        raise
+
+                else:
+                    result += escape_str
             else:
+                result += self.source_text[self.pos]
                 self.pos += 1

-        indices = [start] + newlines + [end]
-        result = "".join(
-            self.source_text[indices[i] : indices[i + 1]]
-            for i in range(len(indices) - 1)
-        )
-        return Token("String", result, start)
+        # FIXME: rethink whether we really need quotes at the beginning
+        # and end of a string and redo. This will include revising
+        # whatever calls parser.unescape_string().
+        return Token("String", f'"{result}"', start)


 # Call the function that initializes the dictionaries.
diff --git a/test/test_escape_sequences.py b/test/test_escape_sequences.py
new file mode 100644
index 00000000..2c0726c8
--- /dev/null
+++ b/test/test_escape_sequences.py
@@ -0,0 +1,78 @@
+# -*- coding: utf-8 -*-
+import pytest
+
+from mathics_scanner.errors import NamedCharacterSyntaxError, SyntaxError
+from mathics_scanner.escape_sequences import parse_escape_sequence
+
+
+def test_escape_sequences():
+    for text, pos, expect_pos, expect_str, fail_msg in (
+        # Backslash
+        ("\\\\", 0, 1, "\\", "backslash"),
+        ("abc \\\\", 5, 6, "\\", "backslash at end"),
+        ("abc \\\\n", 5, 6, "\\", "backslash in middle"),
+        (
+            r"\
+abc",
+            1,
+            2,
+            "\n",
+            "backslashed at end of line",
+        ),
+        # Octal
+        (r"050", 0, 3, chr(0o50), "Octal character at beginning"),
+        (r"a\051", 2, 5, chr(0o51), "Octal character in middle"),
+        # With dot (2-character hex)
+        (r".30", 0, 3, chr(0x30), "two-character hex"),
+        (
+            r"a\.3115",
+            2,
+            5,
+            chr(0x31),
+            "two-character hex in middle with trailing digits",
+        ),
+        (r"b\.4dXYZ", 2, 5, chr(0x4D), "two-character hex in middle"),
+        # With colon (4-character hex)
+        (r":0030", 0, 5, "0", "four-character hex"),
+        (r":03B5", 0, 5, "\u03B5", "four-character hex unicode uppercase"),
+        (r":03b8", 0, 5, "\u03b8", "four-character hex unicode lowercase"),
+        # With vertical bar (6-character hex)
+        (r"|01d450", 0, 7, "\U0001D450", "six-character hex unicode lowercase"),
+        (r"|01D451", 0, 7, "\U0001D451", "six-character hex unicode uppercase"),
+        # Named Characters
+        ("[Theta]", 0, 7, "\u03B8", "Named character; full string"),
+        ("abcd[CapitalPi]efg", 4, 15, "\u03A0", "Named character; internal"),
+        (r"z \[Conjugate]", 3, 14, "\uF3C8", "Named character; at end"),
+        ("[Integral]", 0, 10, "\u222b", "Another full-string named character"),
+    ):
+        assert parse_escape_sequence(text, pos) == (expect_str, expect_pos), fail_msg
+
+
+def test_invalid_named_character_sequences():
+    for text in (r"\[", r"\[Theta", r"\[Fake]", r"\[abc]"):
+        with pytest.raises(NamedCharacterSyntaxError):
+            parse_escape_sequence(text, 1)
+
+
+def test_invalid_number_encoding():
+    for text in (
+        # Octal
+        "093",  # 9 is not in 0-7
+        "01",  # need 3 characters
+        # 2-character hex
+        ".",
+        ".0",
+        ".0i",  # i is not in 0-f
+        # 4-character hex
+        ":",
+        ":A",
+        ":A1",
+        ":ak",
+        ":A10",
+        ":a1g",
+        ":A1g9",
+        ":01-2",
+    ):
+        with pytest.raises(SyntaxError):
+            parse_escape_sequence(text, 0)
diff --git a/test/test_prescanner.py b/test/test_prescanner.py
deleted file mode 100644
index 2f153d09..00000000
--- a/test/test_prescanner.py
+++ /dev/null
@@ -1,105 +0,0 @@
-# -*- coding: utf-8 -*-
-import pytest
-
-from mathics_scanner import IncompleteSyntaxError, ScanError
-from mathics_scanner.prescanner import Prescanner
-from mathics_scanner.feed import SingleLineFeeder
-
-
-def replace_escape_sequences(mathics_text: str):
-    prescanner = Prescanner(SingleLineFeeder(mathics_text))
-    return prescanner.replace_escape_sequences()
-
-
-def assert_invalid(mathics_text: str):
-    with pytest.raises(ScanError):
-        replace_escape_sequences(mathics_text)
-
-
-def assert_incomplete(mathics_text: str):
-    with pytest.raises(IncompleteSyntaxError):
-        replace_escape_sequences(mathics_text)
-
-
-def assert_equal(mathics_text: str, result: str):
-    assert replace_escape_sequences(mathics_text) == result
-
-
-def assert_equal_length(mathics_text: str, length):
-    assert len(replace_escape_sequences(mathics_text)) == length
-
-
-def test_named_characters():
-    assert_equal(r"\[Theta]", "\u03B8")
-    assert_equal(r"\[CapitalPi]", "\u03A0")
-    assert_equal(r"\[Fake]", r"\[Fake]")
-    assert_equal("z \\[Conjugate]", "z \uF3C8")
-    assert_equal("z \\[Integral]", "z \u222b")
-    assert_equal("z \\\\[Integral]", "z \\\\[Integral]")
-    assert_equal("z \\\\\\[Integral]", "z \\\\\u222b")
-    assert_equal("abc\\\\", "abc\\\\")
-
-
-def test_text_lengths():
-    assert_equal_length(r'"\[Integral]"', 3)
-    # Prescanner keep both slashes and quotes.
-    # The tokenizer brings \\ into \ if it appears
-    # inside a string.
-    assert_equal_length(r'"\\[Integral]"', 14)
-
-
-def test_oct():
-    assert_equal(r"\051", ")")
-
-
-def test_hex_dot():
-    assert_equal(r"\.30", "0")
-
-
-def test_hex_colon():
-    assert_equal(r"\:0030", "0")
-    assert_equal(r"\:03B8", "\u03B8")
-    assert_equal(r"\:03b8", "\u03B8")
-
-
-def test_hex_vbar():
-    assert_equal(r"\|01D451", "\U0001D451")
-
-
-def test_incomplete():
-    assert_incomplete(r"\[")
-    assert_incomplete(r"\[Theta")
-
-
-def test_invalid_octal():
-    assert_invalid(r"\093")
-    assert_invalid(r"\01")
-
-
-def test_invalid_colon():
-    assert_invalid(r"\:")
-    assert_invalid(r"\:A")
-    assert_invalid(r"\:01")
-    assert_invalid(r"\:A1")
-    assert_invalid(r"\:ak")
-    assert_invalid(r"\:A10")
-    assert_invalid(r"\:a1g")
-    assert_invalid(r"\:A1g9")
-    assert_invalid(r"\:01-2")
-
-
-def test_invalid_dot():
-    assert_invalid(r"\.")
-    assert_invalid(r"\.0")
-
-
-def test_combined():
-    assert_equal(r"\:03B8\[Theta]\.30\052", "\u03B8\u03B80*")
-
-
-def test_nested():
-    assert_equal(r"\[Thet\141]", r"\[Thet\141]")
-
-
-def test_trailing_backslash():
-    assert_incomplete("x \\")
diff --git a/test/test_string_tokens.py b/test/test_string_tokens.py
index 0cbd5b87..84660148 100644
--- a/test/test_string_tokens.py
+++ b/test/test_string_tokens.py
@@ -3,18 +3,32 @@
 Tests translation from text characters to the token: String
 """

+from typing import Optional
+
 import pytest

-from mathics_scanner.errors import IncompleteSyntaxError, ScanError
+from mathics_scanner.errors import EscapeSyntaxError, IncompleteSyntaxError
 from mathics_scanner.feed import SingleLineFeeder
 from mathics_scanner.tokeniser import Token, Tokeniser


-def check_string(source_text, expected_text: str):
+def check_string(
+    source_text,
+    expected_text: str,
+    message: Optional[str] = "",
+    expected_tag: Optional[str] = None,
+):
     token = single_token(source_text)
     assert token is not None
-    assert token.tag == "String"
-    assert token.text == expected_text
+
+    if expected_tag is None:
+        expected_tag = "String"
+    assert token.tag == expected_tag
+
+    if message:
+        assert token.text == expected_text, message
+    else:
+        assert token.text == expected_text


 def incomplete_error(s: str, failure_msg: str):
@@ -24,14 +38,14 @@ def incomplete_error(s: str, failure_msg: str):
     assert excinfo, failure_msg


-def scan_error(s: str, failure_msg: str):
-    with pytest.raises(ScanError) as excinfo:
+def escape_scan_error(s: str, failure_msg: str):
+    with pytest.raises(EscapeSyntaxError) as excinfo:
         get_tokens(s)

     assert excinfo, failure_msg


-def single_token(source_text) -> Token:
+def single_token(source_text: str) -> Token:
     tokens = get_tokens(source_text)
     assert len(tokens) == 1
     token = tokens[0]
@@ -51,17 +65,72 @@
 def test_string():
+    # Plain strings
+    check_string('""', '""', "Null string")
+    check_string('"abc"', '"abc"', "Simple sequence")
+
+    # Number conversions for octal and hexadecimal
+    check_string(r'"a\\b"', r'"a\b"', "escaped backslash in a string")
+    check_string(r'"\102"', '"B"', "Octal number test in a string")
+    check_string(r'"q\.b4"', '"q´"', "2-digit hexadecimal number in a string")
+
+    check_string(r'"\\c"', '"\\c"', "escaped backslash at beginning of string")
+
+    # All valid ASCII-like control escape sequences
     for escape_string in ("\b", "\f", "\n", "\r", "\t"):
         check_string(f'"a{escape_string}"', f'"a{escape_string}"')

-    # Broken:
-    # "a\050", "a\051" "a\052"
-    # Prescanning eagerly replaces the escape sequences with
-    # symbols "(", ")", or "*" respectively and this messes up parsing
-    # somehow.
-    check_string(r'"abc"', r'"abc"')
+
+    check_string(r'"\ abc"', '" abc"', "Escaped space in a string is valid")
     check_string(r'"abc(*def*)"', r'"abc(*def*)"')
-    check_string(r'"a\"b\\c"', r'"a\"b\\c"')
+
+    check_string(
+        r'"\(a \+\)"',
+        r'"\(a \+\)"',
+        "Do not interpret, but preserve boxing inside a string",
+    )
+
     incomplete_error(r'"abc', "String does not have terminating quote")
     incomplete_error(r'"\"', "Unterminated escape sequence")
-    # scan_error(r'"a\X"', '"X" is not a valid escape character')
+
+    escape_scan_error(r'"a\g"', "Unknown string escape \\g")
+    escape_scan_error(r'"a\X"', '"X" is not a valid escape character')
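+
+
+# An illustrative extra check (hypothetical, following the pattern of the
+# tests above): a named character inside a string should come back already
+# replaced by its unicode equivalent.
+def test_named_character_in_string():
+    check_string(r'"\[Theta]"', '"\u03b8"', "Named character in a string")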
+
+
+# https://www.wolfram.com/language/12/networking-and-system-operations/use-the-full-range-of-unicode-characters.html
+# describes hex encoding.
+
+
+def test_octal():
+    check_string(r'"a\050"', r'"a("', "Octal '(' in string")
+    check_string(r'"a\051"', r'"a)"', "Octal ')' in string")
+    check_string(r'"a\052"', r'"a*"', "Octal '*' in string")
+
+
+def test_hexadecimal_dot():
+    check_string(r'"\.30"', '"0"', "2-digit hexadecimal ASCII number 0")
+    check_string(r'"\.42"', '"B"', "2-digit hexadecimal ASCII capital B")
+    check_string(
+        r"\.42\.30",
+        "B0",
+        "hexadecimal encoding of identifier in expression context",
+        "Symbol",
+    )
+
+
+def test_hexadecimal_colon():
+    check_string(
+        r'"\:03B8"',
+        '"θ"',
+        "4-digit hexadecimal number test with uppercase alpha letter",
+    )
+    check_string(r'"\:0030"', '"0"')
+    check_string(
+        r"\:03b8",
+        "\u03b8",
+        "4-digit hexadecimal number test with lowercase alpha letter",
+        "Symbol",
+    )
+
+
+def test_hexadecimal_vbar():
+    check_string(r'"\|01D451"', '"\U0001d451"')
diff --git a/test/test_tokeniser.py b/test/test_tokeniser.py
index 6e336c00..46a96cb6 100644
--- a/test/test_tokeniser.py
+++ b/test/test_tokeniser.py
@@ -9,7 +9,12 @@

 import pytest

-from mathics_scanner.errors import IncompleteSyntaxError, InvalidSyntaxError, ScanError
+from mathics_scanner.errors import (
+    EscapeSyntaxError,
+    IncompleteSyntaxError,
+    InvalidSyntaxError,
+    SyntaxError,
+)
 from mathics_scanner.feed import SingleLineFeeder
 from mathics_scanner.tokeniser import Token, Tokeniser, is_symbol_name

@@ -24,6 +29,11 @@ def check_symbol(source_code: str):
     assert token, Token("Symbol", source_code, 0)


+def escape_syntax_error(error_message: str):
+    with pytest.raises(EscapeSyntaxError):
+        tokens(error_message)
+
+
 def incomplete_error(error_message: str):
     with pytest.raises(IncompleteSyntaxError):
         tokens(error_message)
@@ -34,8 +44,8 @@ def invalid_error(error_message: str):
         tokens(error_message)


-def scan_error(error_message):
-    with pytest.raises(ScanError):
+def scanner_error(error_message):
+    with pytest.raises(SyntaxError):
         tokens(error_message)


@@ -62,6 +72,13 @@ def tokens(source_code) -> List[Token]:
     return tokens


+def test_accuracy():
+    scanner_error("1.5``")
+    check_number("1.0``20")
+    check_number("1.0``0")
+    check_number("1.4``-20")
+
+
 def test_apply():
     assert tokens("f // x") == [
         Token("Symbol", "f", 0),
@@ -91,10 +108,7 @@ def test_association():


 def test_backslash():
-    assert tokens("\\[Backslash]") == [Token("Backslash", "\u2216", 0)]
-
-    assert tokens("\\ a") == [Token("RawBackslash", "\\", 0), Token("Symbol", "a", 2)]
-
+    assert tokens(r"\[Backslash]") == [Token("Backslash", "\u2216", 0)]
     incomplete_error("\\")


@@ -106,6 +120,30 @@ def test_boxes():
     ]


+def test_comments():
+    assert tokens("(**)") == [], "empty comment"
+    assert tokens("(**)1") == [
+        Token("Number", "1", 4)
+    ], "empty comment with trailing text"
+    assert tokens("1(*2*)") == [
+        Token("Number", "1", 0)
+    ], "comment with leading text"
+    assert tokens("1 (*2*)") == [
+        Token("Number", "1", 0)
+    ], "comment with leading text and space"
+    assert tokens("(* A (* nested comment *) *)") == [], "A nested comment"
+    assert tokens(r"(* A \[theta] *)") == [], "Comment with valid escape sequence"
+    assert tokens(r"(* A \[unknown] *)") == [], "Comment with invalid escape sequence"
+
+
+def test_function():
+    assert tokens("x&") == [Token("Symbol", "x", 0), Token("Function", "&", 1)]
+    assert tokens("x\uf4a1") == [
+        Token("Symbol", "x", 0),
+        Token("Function", "\uf4a1", 1),
+    ]
+
+
 def test_information():
     assert tokens("??Sin") == [Token("Information", "??", 0), Token("Symbol", "Sin", 2)]

@@ -122,8 +160,8 @@ def test_int_repeated():


 def test_integeral():
-    assert tokens("\u222B x \uf74c y") == [
-        Token("Integral", "\u222B", 0),
+    assert tokens("\u222b x \uf74c y") == [
+        Token("Integral", "\u222b", 0),
         Token("Symbol", "x", 2),
         Token("DifferentialD", "\uf74c", 4),
         Token("Symbol", "y", 6),
     ]


@@ -135,13 +173,6 @@ def test_is_symbol():
     assert not is_symbol_name("98")  # symbols can't start with numbers


-def test_accuracy():
-    scan_error("1.5``")
-    check_number("1.0``20")
-    check_number("1.0``0")
-    check_number("1.4``-20")
-
-
 def test_number():
     assert tags("1.5") == ["Number"]
     assert tags("1.5*^10") == ["Number"]
@@ -220,11 +251,3 @@ def test_unset():
     assert tokens("= .") == [Token("Unset", "= .", 0)]
     assert tokens("=.5") == [Token("Set", "=", 0), Token("Number", ".5", 1)]
     assert tokens("= ..") == [Token("Set", "=", 0), Token("Repeated", "..", 2)]
-
-
-def test_function():
-    assert tokens("x&") == [Token("Symbol", "x", 0), Token("Function", "&", 1)]
-    assert tokens("x\uf4a1") == [
-        Token("Symbol", "x", 0),
-        Token("Function", "\uf4a1", 1),
-    ]
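
A minimal end-to-end sketch of the revised escape-sequence flow (illustrative
only; it assumes just the public names exercised by the tests above:
SingleLineFeeder, Tokeniser, and Tokeniser.next()):

    # Tokenize a line mixing a string escape and a named character.
    from mathics_scanner.feed import SingleLineFeeder
    from mathics_scanner.tokeniser import Tokeniser

    tokeniser = Tokeniser(SingleLineFeeder(r'"\.42" + \[Mu]1'))
    # Expected: first a String token '"B"' (\.42 is hex 0x42, already
    # replaced by t_String), then the "+" operator token, then the
    # Symbol "μ1" (t_RawBackslash parses \[Mu] and extends it with "1").
    token = tokeniser.next()
    while token.tag != "END":
        print(token)
        token = tokeniser.next()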