From 44c6e058308c1e28eb0c7c88322004bc19a0cf74 Mon Sep 17 00:00:00 2001 From: rocky Date: Fri, 11 Apr 2025 15:36:18 -0400 Subject: [PATCH 01/34] Reduce prescanner use Escape sequences other than named characters have been removed from the prescanner and put in the scanner. --- mathics_scanner/prescanner.py | 206 ---------------------------------- mathics_scanner/tokeniser.py | 118 ++++++++++++++++++- test/test_string_tokens.py | 33 ++++-- 3 files changed, 140 insertions(+), 217 deletions(-) delete mode 100644 mathics_scanner/prescanner.py diff --git a/mathics_scanner/prescanner.py b/mathics_scanner/prescanner.py deleted file mode 100644 index 08f56346..00000000 --- a/mathics_scanner/prescanner.py +++ /dev/null @@ -1,206 +0,0 @@ -# -*- coding: utf-8 -*- - -# Note: this module will be remove or rewritten drastically in the near future. -""" -Module for "prescanning". Right now this just means replacing -character escape sequences. -""" - -from typing import List - -from mathics_scanner.characters import named_characters -from mathics_scanner.errors import IncompleteSyntaxError, ScanError -from mathics_scanner.feed import LineFeeder - - -class Prescanner(object): - r""" - A Class for converting escape sequences: - Character codes to characters: - \.7A -> z - \.004a -> J - \:004a -> J - \|01D451 -> \U0001D451 - \041 -> ! - Named Characters to Unicode: - \[Theta] -> \u03B8 - ASCII escape sequence: - \n -> literal \n - - Trailing backslash characters (\) are reported incomplete. - """ - - def __init__(self, feeder: LineFeeder): - # self.feeder is a function that returns the next line of the Mathics input - self.feeder = feeder - - # self.input_line is the result of reading the next Mathics input line - self.input_line: str = feeder.feed() - - # self.pos is current position within self.input_line. - self.pos = 0 - - def feed(self) -> str: - """ - Return the next line of Mathics input - """ - return self.feeder.feed() - - def get_more_input(self): - "Get another source-text line from input and continue." - - line: str = self.feed() - if not line: - text = self.input_line[self.pos :].rstrip() - self.feeder.message("Syntax", "sntxi", text) - raise IncompleteSyntaxError("Syntax", "sntxi", text) - self.input_line += line - - def replace_escape_sequences(self) -> str: - """ - Replace escape sequences in ``self.input_line``. The replacement string is returned. - Note: ``self.input_line`` is not modified. - """ - - # Line fragments to be joined before returning from this method. - line_fragments: List[str] = [] - - # Fragment start position of line fragment under consideration. - self.fragment_start = self.pos - - def start_new_fragment(pos: int) -> None: - """ - Update position markers to start a new line fragment at ``pos``. - """ - self.pos = pos - self.fragment_start = pos - - def try_parse_base(start_shift: int, end_shift: int, base: int) -> None: - r""" - See if characters self.pos+start_shift .. self.pos+end shift - can be converted to an integer in base ``base``. - - If so, we append the characters before the escape sequence without the - escaping characters like ``\.`` or ``\:``. - - We also append the converted integer to ``line_fragments``, and update - position cursors for a new line fragment. 
- - However, if the conversion fails, then error messages are - issued and nothing is updated - """ - start, end = self.pos + start_shift, self.pos + end_shift - result = None - if end <= len(self.input_line): - text = self.input_line[start:end] - try: - result = int(text, base) - except ValueError: - pass # result remains None - if result is None: - last = end - start - if last == 2: - self.feeder.message("Syntax", "sntoct2") - elif last == 3: - self.feeder.message("Syntax", "sntoct1") - elif last == 4: - self.feeder.message("Syntax", "snthex") - else: - raise ValueError() - self.feeder.message( - "Syntax", "sntxb", self.input_line[self.pos :].rstrip("\n") - ) - raise ScanError("Syntax", "sntxb") - - # Add text from prior line fragment as well - # as the escape sequence, a character, from the escape sequence - # that was just matched. - line_fragments.append(self.input_line[start : self.pos]) - line_fragments.append(chr(result)) - - # Set up a new line fragment for the next time we are called. - start_new_fragment(end) - - def try_parse_named_character(start_shift: int): - r"""Before calling we have matched "\[". Scan to the remaining "]" and - try to match what is found in-between with a known named - character, e.g. "Theta". If we can match this, we store - the unicode character equivalent in ``line_fragments``. - If we can't find a named character, error messages are - issued and we leave ``line_fragments`` untouched. - """ - i = self.pos + start_shift - while True: - if i == len(self.input_line): - self.get_more_input() - if self.input_line[i] == "]": - break - i += 1 - - named_character = self.input_line[self.pos + start_shift : i] - if named_character.isalpha(): - char = named_characters.get(named_character) - if char is None: - self.feeder.message("Syntax", "sntufn", named_character) - # stay in same line fragment - else: - # Add text from prior line fragment as well - # as the escape sequence, a character, from the escape sequence - # just matched. - line_fragments.append( - self.input_line[self.fragment_start : self.pos] - ) - line_fragments.append(char) - start_new_fragment(i + 1) - - # Stay in same line fragment, but advance the cursor position. - self.pos = i + 1 - - # In the following loop, we look for and replace escape - # sequences. The current character under consideration is at - # self.code[self.pos]. When an escape sequence is found at - # that position, the previous line_fragment is extracted and - # stored in ``line_fragments``. The start-position marker for the - # next line_fragment is started and self.pos is updated. - - while self.pos < len(self.input_line): - if self.input_line[self.pos] == "\\": - # Look for and handle an escape sequence. - if self.pos + 1 == len(self.input_line): - self.get_more_input() - c = self.input_line[self.pos + 1] - if c == "|": - try_parse_base(2, 8, 16) - if c == ".": - # See if we have a two-digit hexadecimal number. - try_parse_base(2, 4, 16) - elif c == ":": - # See if we have a four-digit hexadecimal number. - try_parse_base(2, 6, 16) - elif c == "[": - try_parse_named_character(2) - elif c in "01234567": - # See if we have an octal number. - try_parse_base(1, 4, 8) - elif c == "\n": - if self.pos + 2 == len(self.input_line): - self.get_more_input() - line_fragments.append( - self.input_line[self.fragment_start : self.pos] - ) - start_new_fragment(self.pos + 2) - else: - # Two backslashes in succession indicates a single - # backslash character. Advance the scanning - # cursor (self.pos) over both backslashes. 
Also, - # Python's backslash escape mechanism turns the - # two backslashes into one in length calculations. - self.pos += 2 - else: - self.pos += 1 - - # Add the final line fragment. - line_fragments.append(self.input_line[self.fragment_start :]) - - # Produce and return the input line with escape-sequences replaced - return "".join(line_fragments) diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 06ade1fc..d1bf8f72 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -517,6 +517,8 @@ def __init__(self, feeder): ) self.pos: int = 0 self.feeder = feeder + + # FIXME: remove this self.prescanner = Prescanner(feeder) self.source_text = self.prescanner.replace_escape_sequences() self.mode: str = "invalid" @@ -613,6 +615,56 @@ def next(self) -> Token: self.pos = pattern_match.end(0) return Token(tag, text, pattern_match.start(0)) + def try_parse_base(self, start_shift: int, end_shift: int, base: int) -> str: + r""" + See if characters self.pos+start_shift .. self.pos+end shift + can be converted to an integer in base ``base``. + + If so, chr(integer value converted from base). + + However, if the conversion fails, then error messages are + issued and nothing is updated + """ + start, end = self.pos + start_shift, self.pos + end_shift + result = None + if end <= len(self.source_text): + text = self.source_text[start:end] + try: + result = int(text, base) + except ValueError: + pass # result remains None + if result is None: + last = end - start + if last == 2: + self.feeder.message("Syntax", "sntoct2") + elif last == 3: + self.feeder.message("Syntax", "sntoct1") + elif last == 4: + self.feeder.message("Syntax", "snthex") + else: + raise ValueError() + error_text = self.source_text[self.pos :].rstrip("\n") + self.feeder.message("Syntax", "sntxb", error_text) + raise ScanError("syntx", error_text) + + return chr(result) + + def try_parse_named_character(self, start_shift: int) -> Optional[str]: + r"""Before calling we have matched "\[". Scan to the remaining "]" and + try to match what is found in-between with a known named + character, e.g. "Theta". If we can match this, we store + the unicode character equivalent in ``line_fragments``. + If we can't find a named character, error messages are + issued and we leave ``line_fragments`` untouched. + """ + named_character = self.source_text[self.pos + start_shift : self.pos + start_shift] + if named_character.isalpha(): + char = named_characters.get(named_character) + if char is None: + self.feeder.message("Syntax", "sntufn", named_character) + else: + return named_character + def _skip_blank(self): "Skip whitespace and comments" comment = [] # start positions of comments @@ -703,23 +755,85 @@ def t_String(self, _: re.Match) -> Token: start, end = self.pos, None self.pos += 1 # skip opening '"' newlines = [] + source_text = self.source_text + result = "" while True: if self.pos >= len(self.source_text): if end is None: # reached end while still inside string self.get_more_input() newlines.append(self.pos) + source_text = self.source_text else: break - char = self.source_text[self.pos] + char = source_text[self.pos] if char == '"': self.pos += 1 end = self.pos break if char == "\\": - self.pos += 2 + if self.pos + 1 == len(source_text): + # We have reached end of the input line before seeing a terminating + # quote ("). Fetch aanother line. 
+ self.get_more_input() + self.pos += 1 + c = source_text[self.pos] + if c == "\\": + result += "\\" + self.pos += 1 + continue + # https://www.wolfram.com/language/12/networking-and-system-operations/use-the-full-range-of-unicode-characters.html + # describes hex encoding. + elif c == ".": + # See if we have a 2-digit hexadecimal number. + # For example, \.42 is "B" + result += self.try_parse_base(1, 3, 16) + self.pos += 3 + elif c == ":": + # See if we have a 4-digit hexadecimal number. + # For example, \:03B8" is Unicode small leter theta: θ. + result += self.try_parse_base(1, 5, 16) + self.pos += 5 + elif c == "|": + # See if we have a 6-digit hexadecimal number. + result += self.try_parse_base(1, 7, 16) + elif c == "[": + named_character = self.try_parse_named_character(2) + if named_character is not None: + result += named_character + self.pos += 4 # ??? + elif c in "01234567": + # See if we have a 3-digit octal number. + # For example \065 = "5" + result += self.try_parse_base(0, 3, 8) + self.pos += 3 + + # WMA escape characters \n, \t, \b, \r. + # Note that these are a similer to Python, but are different. + # In particular, Python defines "\a" to be ^G (control G), + # but in WMA, this is invalid. + elif c in "ntbfr": + if c == "n": + result += "\n" + elif c == "t": + result += "\t" + elif c == "b": + result += "\b" + elif c == "f": + result += "\f" + else: + assert c == "r" + result += "\r" + self.pos += 1 + elif c in '!"': + result += c + self.pos += 1 + else: + self.sntx_invalid_esc_message(c) + raise ScanError() else: + result += self.source_text[self.pos] self.pos += 1 indices = [start] + newlines + [end] diff --git a/test/test_string_tokens.py b/test/test_string_tokens.py index 0cbd5b87..3715f41f 100644 --- a/test/test_string_tokens.py +++ b/test/test_string_tokens.py @@ -3,6 +3,8 @@ Tests translation from text characters to the token: String """ +from typing import Optional + import pytest from mathics_scanner.errors import IncompleteSyntaxError, ScanError @@ -10,11 +12,14 @@ from mathics_scanner.tokeniser import Token, Tokeniser -def check_string(source_text, expected_text: str): +def check_string(source_text, expected_text: str, message: Optional[str] = ""): token = single_token(source_text) assert token is not None assert token.tag == "String" - assert token.text == expected_text + if message: + assert token.text == expected_text, message + else: + assert token.text == expected_text def incomplete_error(s: str, failure_msg: str): @@ -51,17 +56,27 @@ def get_tokens(source_text: str): def test_string(): + # Number conversions for binary, octal, hexadecimal + check_string(r'"\\c"', '"\\c"', "escaped backslash at beginning of string") + check_string(r'"a\\b"', r'"a\b"', "escaped backslash") + check_string(r'"\:03B8"', '"θ"', "4-digit hexadecimal number test") + check_string(r'"\102"', '"B"', "Octal number test") + check_string(r'"\.42"', '"B"', "2-digit hexadecimal number test") + check_string(r'"q\.b4"', '"q´"') + + # All valid ASCII-like control escape sequences for escape_string in ("\b", "\f", "\n", "\r", "\t"): check_string(f'"a{escape_string}"', f'"a{escape_string}"') - # Broken: - # "a\050", "a\051" "a\052" - # Prescanning eagerly replaces the escape sequences with - # symbols "(", ")", or "*" respectively and this messes up parsing - # somehow. 
+ check_string(r'"a\050"', r'"a("', "Octal '(' in string") + check_string(r'"a\051"', r'"a)"', "Octal ')' in string") + check_string(r'"a\052"', r'"a*"', "Octal '*' in string") + check_string(r'"abc"', r'"abc"') check_string(r'"abc(*def*)"', r'"abc(*def*)"') - check_string(r'"a\"b\\c"', r'"a\"b\\c"') + # check_string(r'"a\"b\\c"', r'"a\\"b\c"') incomplete_error(r'"abc', "String does not have terminating quote") incomplete_error(r'"\"', "Unterminated escape sequence") - # scan_error(r'"a\X"', '"X" is not a valid escape character') + scan_error(r'"a\g"', "Unknown string escape \\g") + + scan_error(r'"a\X"', '"X" is not a valid escape character') From ae4aa63b32e016d5f6c694ea0ef05b98e76f4a5e Mon Sep 17 00:00:00 2001 From: rocky Date: Sat, 12 Apr 2025 13:51:51 -0400 Subject: [PATCH 02/34] Test workarounds.. for now. --- mathics_scanner/tokeniser.py | 1 + test/test_prescanner.py | 29 +++++++++--------------- test/test_string_tokens.py | 43 +++++++++++++++++++++++++++++++----- test/test_tokeniser.py | 1 + 4 files changed, 49 insertions(+), 25 deletions(-) diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index d1bf8f72..24406d35 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -798,6 +798,7 @@ def t_String(self, _: re.Match) -> Token: elif c == "|": # See if we have a 6-digit hexadecimal number. result += self.try_parse_base(1, 7, 16) + self.pos += 7 elif c == "[": named_character = self.try_parse_named_character(2) if named_character is not None: diff --git a/test/test_prescanner.py b/test/test_prescanner.py index 2f153d09..25163d26 100644 --- a/test/test_prescanner.py +++ b/test/test_prescanner.py @@ -2,8 +2,8 @@ import pytest from mathics_scanner import IncompleteSyntaxError, ScanError -from mathics_scanner.prescanner import Prescanner from mathics_scanner.feed import SingleLineFeeder +from mathics_scanner.prescanner import Prescanner def replace_escape_sequences(mathics_text: str): @@ -29,6 +29,7 @@ def assert_equal_length(mathics_text: str, length): assert len(replace_escape_sequences(mathics_text)) == length +@pytest.mark.skip("Prescanner tests need to be integrated outside of prescanner") def test_named_characters(): assert_equal(r"\[Theta]", "\u03B8") assert_equal(r"\[CapitalPi]", "\u03A0") @@ -40,6 +41,7 @@ def test_named_characters(): assert_equal("abc\\\\", "abc\\\\") +@pytest.mark.skip("Prescanner tests need to be integrated outside of prescanner") def test_text_lengths(): assert_equal_length(r'"\[Integral]"', 3) # Prescanner keep both slashes and quotes. 
@@ -48,34 +50,19 @@ def test_text_lengths(): assert_equal_length(r'"\\[Integral]"', 14) -def test_oct(): - assert_equal(r"\051", ")") - - -def test_hex_dot(): - assert_equal(r"\.30", "0") - - -def test_hex_colon(): - assert_equal(r"\:0030", "0") - assert_equal(r"\:03B8", "\u03B8") - assert_equal(r"\:03b8", "\u03B8") - - -def test_hex_vbar(): - assert_equal(r"\|01D451", "\U0001D451") - - +@pytest.mark.skip("Prescanner tests need to be integrated outside of prescanner") def test_incomplete(): assert_incomplete(r"\[") assert_incomplete(r"\[Theta") +@pytest.mark.skip("Prescanner tests need to be integrated outside of prescanner") def test_invalid_octal(): assert_invalid(r"\093") assert_invalid(r"\01") +@pytest.mark.skip("Prescanner tests need to be integrated outside of prescanner") def test_invalid_colon(): assert_invalid(r"\:") assert_invalid(r"\:A") @@ -88,18 +75,22 @@ def test_invalid_colon(): assert_invalid(r"\:01-2") +@pytest.mark.skip("Prescanner tests need to be integrated outside of prescanner") def test_invalid_dot(): assert_invalid(r"\.") assert_invalid(r"\.0") +@pytest.mark.skip("Prescanner tests need to be integrated outside of prescanner") def test_combined(): assert_equal(r"\:03B8\[Theta]\.30\052", "\u03B8\u03B80*") +@pytest.mark.skip("Prescanner tests need to be integrated outside of prescanner") def test_nested(): assert_equal(r"\[Thet\141]", r"\[Thet\141]") +@pytest.mark.skip("Prescanner tests need to be integrated outside of prescanner") def test_trailing_backslash(): assert_incomplete("x \\") diff --git a/test/test_string_tokens.py b/test/test_string_tokens.py index 3715f41f..a6679823 100644 --- a/test/test_string_tokens.py +++ b/test/test_string_tokens.py @@ -59,19 +59,13 @@ def test_string(): # Number conversions for binary, octal, hexadecimal check_string(r'"\\c"', '"\\c"', "escaped backslash at beginning of string") check_string(r'"a\\b"', r'"a\b"', "escaped backslash") - check_string(r'"\:03B8"', '"θ"', "4-digit hexadecimal number test") check_string(r'"\102"', '"B"', "Octal number test") - check_string(r'"\.42"', '"B"', "2-digit hexadecimal number test") check_string(r'"q\.b4"', '"q´"') # All valid ASCII-like control escape sequences for escape_string in ("\b", "\f", "\n", "\r", "\t"): check_string(f'"a{escape_string}"', f'"a{escape_string}"') - check_string(r'"a\050"', r'"a("', "Octal '(' in string") - check_string(r'"a\051"', r'"a)"', "Octal ')' in string") - check_string(r'"a\052"', r'"a*"', "Octal '*' in string") - check_string(r'"abc"', r'"abc"') check_string(r'"abc(*def*)"', r'"abc(*def*)"') # check_string(r'"a\"b\\c"', r'"a\\"b\c"') @@ -80,3 +74,40 @@ def test_string(): scan_error(r'"a\g"', "Unknown string escape \\g") scan_error(r'"a\X"', '"X" is not a valid escape character') + + +# https://www.wolfram.com/language/12/networking-and-system-operations/use-the-full-range-of-unicode-characters.html +# describes hex encoding. 
+
+
+def test_octal():
+    check_string(r'"a\050"', r'"a("', "Octal '(' in string")
+    check_string(r'"a\051"', r'"a)"', "Octal ')' in string")
+    check_string(r'"a\052"', r'"a*"', "Octal '*' in string")
+    # FIXME: add tests outside of string
+
+
+def test_hexadecimal_dot():
+    check_string(r'"\.30"', '"0"', "2-digit hexadecimal ASCII number 0")
+    check_string(r'"\.42"', '"B"', "2-digit hexadecimal ASCII capital B")
+    # FIXME: add tests outside of string
+
+
+def test_hexadecimal_colon():
+    check_string(
+        r'"\:03B8"',
+        '"θ"',
+        "4-digit hexadecimal number test with uppercase alpha letter",
+    )
+    check_string(
+        r'"\:03b8"',
+        '"\u03B8"',
+        "4-digit hexadecimal number test with lowercase alpha letter",
+    )
+    check_string(r'"\:0030"', '"0"')
+    # FIXME:
+    # check_string(r"\:03b8", "\u03B8", "4-digit hexadecimal number test with lowercase alpha letter")
+
+
+def test_hexadecimal_vbar():
+    check_string(r'"\|01D451"', '"\U0001D451"')
diff --git a/test/test_tokeniser.py b/test/test_tokeniser.py
index 6e336c00..96ceee8d 100644
--- a/test/test_tokeniser.py
+++ b/test/test_tokeniser.py
@@ -90,6 +90,7 @@ def test_association():
     ]
 
 
+@pytest.mark.skip("Backslash needs to be handled outside of prescanner")
 def test_backslash():
     assert tokens("\\[Backslash]") == [Token("Backslash", "\u2216", 0)]
 
From 12482554d810ebbaaaedda2720fb99cb59b0d0f2 Mon Sep 17 00:00:00 2001
From: rocky
Date: Sat, 12 Apr 2025 14:54:40 -0400
Subject: [PATCH 03/34] Isolate tokenizing escape sequences

---
 mathics_scanner/tokeniser.py | 61 +++---------------------------------
 1 file changed, 5 insertions(+), 56 deletions(-)

diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py
index 24406d35..3a597b93 100644
--- a/mathics_scanner/tokeniser.py
+++ b/mathics_scanner/tokeniser.py
@@ -10,7 +10,7 @@
 import string
 from typing import Dict, List, Optional, Tuple
 
-from mathics_scanner.characters import _letterlikes, _letters
+from mathics_scanner.characters import _letterlikes, _letters, named_characters
 from mathics_scanner.errors import IncompleteSyntaxError, ScanError
 from mathics_scanner.prescanner import Prescanner
 
@@ -778,61 +778,10 @@ def t_String(self, _: re.Match) -> Token:
                     # quote ("). Fetch aanother line.
                     self.get_more_input()
                 self.pos += 1
-                c = source_text[self.pos]
-                if c == "\\":
-                    result += "\\"
-                    self.pos += 1
-                    continue
-                # https://www.wolfram.com/language/12/networking-and-system-operations/use-the-full-range-of-unicode-characters.html
-                # describes hex encoding.
-                elif c == ".":
-                    # See if we have a 2-digit hexadecimal number.
-                    # For example, \.42 is "B"
-                    result += self.try_parse_base(1, 3, 16)
-                    self.pos += 3
-                elif c == ":":
-                    # See if we have a 4-digit hexadecimal number.
-                    # For example, \:03B8" is Unicode small leter theta: θ.
-                    result += self.try_parse_base(1, 5, 16)
-                    self.pos += 5
-                elif c == "|":
-                    # See if we have a 6-digit hexadecimal number.
-                    result += self.try_parse_base(1, 7, 16)
-                    self.pos += 7
-                elif c == "[":
-                    named_character = self.try_parse_named_character(2)
-                    if named_character is not None:
-                        result += named_character
-                    self.pos += 4  # ???
-                elif c in "01234567":
-                    # See if we have a 3-digit octal number.
-                    # For example \065 = "5"
-                    result += self.try_parse_base(0, 3, 8)
-                    self.pos += 3
-
-                # WMA escape characters \n, \t, \b, \r.
-                # Note that these are a similer to Python, but are different.
-                # In particular, Python defines "\a" to be ^G (control G),
-                # but in WMA, this is invalid.
- elif c in "ntbfr": - if c == "n": - result += "\n" - elif c == "t": - result += "\t" - elif c == "b": - result += "\b" - elif c == "f": - result += "\f" - else: - assert c == "r" - result += "\r" - self.pos += 1 - elif c in '!"': - result += c - self.pos += 1 - else: - self.sntx_invalid_esc_message(c) - raise ScanError() + escape_str, self.pos = self.prescanner.tokenize_escape_sequence( + source_text, self.pos + ) + result += escape_str else: result += self.source_text[self.pos] self.pos += 1 From 95bd1052792685e2d3ed4c5c743752e5a3cbf817 Mon Sep 17 00:00:00 2001 From: rocky Date: Sun, 13 Apr 2025 13:06:19 -0400 Subject: [PATCH 04/34] Split out escape_sequence parsing. --- mathics_scanner/escape_sequences.py | 126 ++++++++++++++++++++++++++++ mathics_scanner/tokeniser.py | 9 +- test/test_escape_sequences.py | 32 +++++++ test/test_string_tokens.py | 42 +++++----- 4 files changed, 182 insertions(+), 27 deletions(-) create mode 100644 mathics_scanner/escape_sequences.py create mode 100644 test/test_escape_sequences.py diff --git a/mathics_scanner/escape_sequences.py b/mathics_scanner/escape_sequences.py new file mode 100644 index 00000000..f30deae8 --- /dev/null +++ b/mathics_scanner/escape_sequences.py @@ -0,0 +1,126 @@ +""" +Helper Module for tokenizing character escape sequences. +""" + +from typing import Optional, Tuple + +from mathics_scanner.characters import named_characters +from mathics_scanner.errors import ( + EscapeSyntaxError, + NamedCharacterSyntaxError, + ScanError, +) + + +def parse_base(source_text: str, start_shift: int, end_shift: int, base: int) -> str: + r""" + See if characters start_shift .. end shift + can be converted to an integer in base ``base``. + + If so, chr(integer value converted from base). + + However, if the conversion fails, then error messages are + issued and nothing is updated + """ + start, end = start_shift, end_shift + result = None + if end <= len(source_text): + text = source_text[start:end] + try: + result = int(text, base) + except ValueError: + pass # result remains None + if result is None: + last = end - start + if last == 2: + tag = "sntoct2" + elif last == 3: + tag = "sntoct1" + elif last == 4: + tag = "snthex" + else: + raise ValueError() + raise ScanError(tag, source_text[start_shift:].rstrip("\n")) + + return chr(result) + + +def parse_named_character( + source_text: str, pos: int, start_shift: int +) -> Optional[str]: + r"""Before calling we have matched "\[". Scan to the remaining "]" and + try to match what is found in-between with a known named + character, e.g. "Theta". If we can match this, we store + the unicode character equivalent in ``line_fragments``. + If we can't find a named character, error messages are + issued and we leave ``line_fragments`` untouched. + """ + named_character = source_text[pos + start_shift : pos + start_shift] + if named_character.isalpha(): + char = named_characters.get(named_character) + if char is None: + raise NamedCharacterSyntaxError("sntufn", named_character) + else: + return named_character + + +def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]: + """ + Given some source text `source_text` at position `pos`, return the escape sequence and the + follow-on position. + """ + result = "" + c = source_text[pos] + if c == "\\": + return "\\", pos + 1 + + # https://www.wolfram.com/language/12/networking-and-system-operations/use-the-full-range-of-unicode-characters.html + # describes hex encoding. + if c == ".": + # See if we have a 2-digit hexadecimal number. 
+ # For example, \.42 is "B" + result += parse_base(source_text, pos + 1, pos + 3, 16) + pos += 3 + elif c == ":": + # See if we have a 4-digit hexadecimal number. + # For example, \:03B8" is Unicode small leter theta: θ. + result += parse_base(source_text, pos + 1, pos + 5, 16) + pos += 5 + elif c == "|": + # See if we have a 6-digit hexadecimal number. + result += parse_base(source_text, pos + 1, pos + 7, 16) + pos += 7 + elif c == "[": + named_character = parse_named_character(source_text, pos, 2) + if named_character is not None: + result += named_character + pos += 4 # ??? + elif c in "01234567": + # See if we have a 3-digit octal number. + # For example \065 = "5" + result += parse_base(source_text, pos, pos + 3, 8) + pos += 3 + + # WMA escape characters \n, \t, \b, \r. + # Note that these are a similer to Python, but are different. + # In particular, Python defines "\a" to be ^G (control G), + # but in WMA, this is invalid. + elif c in "ntbfr": + if c == "n": + result += "\n" + elif c == "t": + result += "\t" + elif c == "b": + result += "\b" + elif c == "f": + result += "\f" + else: + assert c == "r" + result += "\r" + pos += 1 + elif c in '!"': + result += c + pos += 1 + else: + raise EscapeSyntaxError("Syntax", "stresc" rf"\{c}.") + return result, pos diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 3a597b93..62ea3dec 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -12,7 +12,6 @@ from mathics_scanner.characters import _letterlikes, _letters, named_characters from mathics_scanner.errors import IncompleteSyntaxError, ScanError -from mathics_scanner.prescanner import Prescanner try: import ujson @@ -568,8 +567,8 @@ def sntx_message(self, pos: Optional[int] = None) -> Tuple[str, str, str]: pos = self.pos pre, post = self.source_text[:pos], self.source_text[pos:].rstrip("\n") if pos == 0: - self.feeder.message("Syntax", "sntxb", post) - return "sntxb", "", post + self.feeder.message("Syntax", "sntxb", pre, post) + return "sntxb", pre, post else: self.feeder.message("Syntax", "sntxf", pre, post) return "sntxf", pre, post @@ -778,9 +777,7 @@ def t_String(self, _: re.Match) -> Token: # quote ("). Fetch aanother line. 
self.get_more_input() self.pos += 1 - escape_str, self.pos = self.prescanner.tokenize_escape_sequence( - source_text, self.pos - ) + escape_str, self.pos = parse_escape_sequence(source_text, self.pos) result += escape_str else: result += self.source_text[self.pos] diff --git a/test/test_escape_sequences.py b/test/test_escape_sequences.py new file mode 100644 index 00000000..889e46c2 --- /dev/null +++ b/test/test_escape_sequences.py @@ -0,0 +1,32 @@ +# -*- coding: utf-8 -*- +from mathics_scanner.escape_sequences import parse_escape_sequence + + +def test_escape_sequences(): + for text, pos, expect_pos, expect_str, fail_msg in ( + # Backslash + ("\\\\", 0, 1, "\\", "backslash"), + ("abc \\\\", 5, 6, "\\", "backslash at end"), + ("abc \\\\n", 5, 6, "\\", "backslash in middle"), + # Octal + (r"051", 0, 3, chr(0o51), "character at beginning"), + (r"a\051", 2, 5, chr(0o51), "Octal character in middle"), + # With dot + (r".30", 0, 3, chr(0x30), "two-character hex"), + ( + r"a\.3015", + 2, + 5, + chr(0x30), + "two-character hex in middle with trailing digits", + ), + (r"b\.4dXYZ", 2, 5, chr(0x4D), "two-character hex in middle"), + # With colon + (r":0030", 0, 5, "0", "four-character hex"), + (r":03B8", 0, 5, "\u03B8", "four-character hex unicode uppercase"), + (r":03B8", 0, 5, "\u03b8", "four-character hex unicode lowercase"), + # With Vertical bar + (r"|01d451", 0, 7, "\U0001D451", "six-character hex unicode lowercase"), + (r"|01D451", 0, 7, "\U0001D451", "six-character hex unicode uppercase"), + ): + assert parse_escape_sequence(text, pos) == (expect_str, expect_pos), fail_msg diff --git a/test/test_string_tokens.py b/test/test_string_tokens.py index a6679823..e42a5bee 100644 --- a/test/test_string_tokens.py +++ b/test/test_string_tokens.py @@ -7,7 +7,7 @@ import pytest -from mathics_scanner.errors import IncompleteSyntaxError, ScanError +from mathics_scanner.errors import EscapeSyntaxError, IncompleteSyntaxError from mathics_scanner.feed import SingleLineFeeder from mathics_scanner.tokeniser import Token, Tokeniser @@ -29,8 +29,8 @@ def incomplete_error(s: str, failure_msg: str): assert excinfo, failure_msg -def scan_error(s: str, failure_msg: str): - with pytest.raises(ScanError) as excinfo: +def escape_scan_error(s: str, failure_msg: str): + with pytest.raises(EscapeSyntaxError) as excinfo: get_tokens(s) assert excinfo, failure_msg @@ -56,24 +56,24 @@ def get_tokens(source_text: str): def test_string(): - # Number conversions for binary, octal, hexadecimal - check_string(r'"\\c"', '"\\c"', "escaped backslash at beginning of string") - check_string(r'"a\\b"', r'"a\b"', "escaped backslash") - check_string(r'"\102"', '"B"', "Octal number test") - check_string(r'"q\.b4"', '"q´"') - - # All valid ASCII-like control escape sequences - for escape_string in ("\b", "\f", "\n", "\r", "\t"): - check_string(f'"a{escape_string}"', f'"a{escape_string}"') - - check_string(r'"abc"', r'"abc"') - check_string(r'"abc(*def*)"', r'"abc(*def*)"') - # check_string(r'"a\"b\\c"', r'"a\\"b\c"') - incomplete_error(r'"abc', "String does not have terminating quote") - incomplete_error(r'"\"', "Unterminated escape sequence") - scan_error(r'"a\g"', "Unknown string escape \\g") - - scan_error(r'"a\X"', '"X" is not a valid escape character') + # # Number conversions for binary, octal, hexadecimal + # check_string(r'"\\c"', '"\\c"', "escaped backslash at beginning of string") + # check_string(r'"a\\b"', r'"a\b"', "escaped backslash") + # check_string(r'"\102"', '"B"', "Octal number test") + # check_string(r'"q\.b4"', '"q´"') 
+ + # # All valid ASCII-like control escape sequences + # for escape_string in ("\b", "\f", "\n", "\r", "\t"): + # check_string(f'"a{escape_string}"', f'"a{escape_string}"') + + # check_string(r'"abc"', r'"abc"') + # check_string(r'"abc(*def*)"', r'"abc(*def*)"') + # # check_string(r'"a\"b\\c"', r'"a\\"b\c"') + # incomplete_error(r'"abc', "String does not have terminating quote") + # incomplete_error(r'"\"', "Unterminated escape sequence") + escape_scan_error(r'"a\g"', "Unknown string escape \\g") + + escape_scan_error(r'"a\X"', '"X" is not a valid escape character') # https://www.wolfram.com/language/12/networking-and-system-operations/use-the-full-range-of-unicode-characters.html From f1a06e15bf4c10cf76ddbbf31f85f730ecbee87a Mon Sep 17 00:00:00 2001 From: rocky Date: Mon, 14 Apr 2025 10:57:09 -0400 Subject: [PATCH 05/34] Handle escape sequences outside of strings. --- mathics_scanner/escape_sequences.py | 33 +++++++----- mathics_scanner/tokeniser.py | 78 ++--------------------------- test/test_escape_sequences.py | 21 +++++--- 3 files changed, 39 insertions(+), 93 deletions(-) diff --git a/mathics_scanner/escape_sequences.py b/mathics_scanner/escape_sequences.py index f30deae8..35f6ab38 100644 --- a/mathics_scanner/escape_sequences.py +++ b/mathics_scanner/escape_sequences.py @@ -45,9 +45,7 @@ def parse_base(source_text: str, start_shift: int, end_shift: int, base: int) -> return chr(result) -def parse_named_character( - source_text: str, pos: int, start_shift: int -) -> Optional[str]: +def parse_named_character(source_text: str, start: int, finish: int) -> Optional[str]: r"""Before calling we have matched "\[". Scan to the remaining "]" and try to match what is found in-between with a known named character, e.g. "Theta". If we can match this, we store @@ -55,13 +53,13 @@ def parse_named_character( If we can't find a named character, error messages are issued and we leave ``line_fragments`` untouched. """ - named_character = source_text[pos + start_shift : pos + start_shift] + named_character = source_text[start:finish] if named_character.isalpha(): char = named_characters.get(named_character) if char is None: raise NamedCharacterSyntaxError("sntufn", named_character) else: - return named_character + return char def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]: @@ -77,24 +75,35 @@ def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]: # https://www.wolfram.com/language/12/networking-and-system-operations/use-the-full-range-of-unicode-characters.html # describes hex encoding. if c == ".": - # See if we have a 2-digit hexadecimal number. - # For example, \.42 is "B" + # see if we have a 2-digit hexadecimal number. + # for example, \.42 is "b" result += parse_base(source_text, pos + 1, pos + 3, 16) pos += 3 elif c == ":": - # See if we have a 4-digit hexadecimal number. - # For example, \:03B8" is Unicode small leter theta: θ. + # see if we have a 4-digit hexadecimal number. + # for example, \:03b8" is unicode small leter theta: θ. result += parse_base(source_text, pos + 1, pos + 5, 16) pos += 5 elif c == "|": - # See if we have a 6-digit hexadecimal number. + # see if we have a 6-digit hexadecimal number. result += parse_base(source_text, pos + 1, pos + 7, 16) pos += 7 elif c == "[": - named_character = parse_named_character(source_text, pos, 2) + pos += 1 + i = pos + 1 + while i < len(source_text): + if source_text[i] == "]": + break + i += 1 + if i == len(source_text): + # Note: named characters do not have \n's in them. (Is this right)? 
+ # FIXME: decide what to do here. + raise EscapeSyntaxError("Syntax", "stresc" rf"\{c}.") + + named_character = parse_named_character(source_text, pos, i) if named_character is not None: result += named_character - pos += 4 # ??? + pos = i + 1 elif c in "01234567": # See if we have a 3-digit octal number. # For example \065 = "5" diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 62ea3dec..977f97a2 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -10,8 +10,9 @@ import string from typing import Dict, List, Optional, Tuple -from mathics_scanner.characters import _letterlikes, _letters, named_characters +from mathics_scanner.characters import _letterlikes, _letters from mathics_scanner.errors import IncompleteSyntaxError, ScanError +from mathics_scanner.escape_sequences import parse_escape_sequence try: import ujson @@ -516,10 +517,9 @@ def __init__(self, feeder): ) self.pos: int = 0 self.feeder = feeder + self.source_text = "" - # FIXME: remove this - self.prescanner = Prescanner(feeder) - self.source_text = self.prescanner.replace_escape_sequences() + # FIXME: remove this. self.mode: str = "invalid" # Set to True when inside box parsing. @@ -608,85 +608,17 @@ def next(self) -> Token: if override is not None: return override(pattern_match) - # Failing a custom tokenization rule, we use the regular expression - # pattern match. text = pattern_match.group(0) self.pos = pattern_match.end(0) return Token(tag, text, pattern_match.start(0)) - def try_parse_base(self, start_shift: int, end_shift: int, base: int) -> str: - r""" - See if characters self.pos+start_shift .. self.pos+end shift - can be converted to an integer in base ``base``. - - If so, chr(integer value converted from base). - - However, if the conversion fails, then error messages are - issued and nothing is updated - """ - start, end = self.pos + start_shift, self.pos + end_shift - result = None - if end <= len(self.source_text): - text = self.source_text[start:end] - try: - result = int(text, base) - except ValueError: - pass # result remains None - if result is None: - last = end - start - if last == 2: - self.feeder.message("Syntax", "sntoct2") - elif last == 3: - self.feeder.message("Syntax", "sntoct1") - elif last == 4: - self.feeder.message("Syntax", "snthex") - else: - raise ValueError() - error_text = self.source_text[self.pos :].rstrip("\n") - self.feeder.message("Syntax", "sntxb", error_text) - raise ScanError("syntx", error_text) - - return chr(result) - - def try_parse_named_character(self, start_shift: int) -> Optional[str]: - r"""Before calling we have matched "\[". Scan to the remaining "]" and - try to match what is found in-between with a known named - character, e.g. "Theta". If we can match this, we store - the unicode character equivalent in ``line_fragments``. - If we can't find a named character, error messages are - issued and we leave ``line_fragments`` untouched. 
- """ - named_character = self.source_text[self.pos + start_shift : self.pos + start_shift] - if named_character.isalpha(): - char = named_characters.get(named_character) - if char is None: - self.feeder.message("Syntax", "sntufn", named_character) - else: - return named_character - def _skip_blank(self): "Skip whitespace and comments" comment = [] # start positions of comments while True: if self.pos >= len(self.source_text): if comment: - try: - self.get_more_input() - except ValueError: - # `get_more_input` tries to parse substrings like `\|AAAAA` - # that can be interpreted as a character reference. - # To do that, it tries to get the - # new line using the method - # `Prescanner.replace_escape_sequences()` - # Inside a comment, the special meaning of escape sequences - # like `\|` should not be taken into account. - # - # In case of error, just let's pick the code - # from the `input_line` attribute of - # prescanner: - self.source_text = self.prescanner.input_line - # TODO: handle the corner case where the rest of the line - # include escaped sequences, out of the comment. + self.get_more_input() else: break if comment: diff --git a/test/test_escape_sequences.py b/test/test_escape_sequences.py index 889e46c2..86ef583b 100644 --- a/test/test_escape_sequences.py +++ b/test/test_escape_sequences.py @@ -9,24 +9,29 @@ def test_escape_sequences(): ("abc \\\\", 5, 6, "\\", "backslash at end"), ("abc \\\\n", 5, 6, "\\", "backslash in middle"), # Octal - (r"051", 0, 3, chr(0o51), "character at beginning"), + (r"050", 0, 3, chr(0o50), "character at beginning"), (r"a\051", 2, 5, chr(0o51), "Octal character in middle"), - # With dot + # With dot (2-character hex) (r".30", 0, 3, chr(0x30), "two-character hex"), ( - r"a\.3015", + r"a\.3115", 2, 5, - chr(0x30), + chr(0x31), "two-character hex in middle with trailing digits", ), (r"b\.4dXYZ", 2, 5, chr(0x4D), "two-character hex in middle"), - # With colon + # With colon (4-character hex) (r":0030", 0, 5, "0", "four-character hex"), - (r":03B8", 0, 5, "\u03B8", "four-character hex unicode uppercase"), + (r":03B5", 0, 5, "\u03B5", "four-character hex unicode uppercase"), (r":03B8", 0, 5, "\u03b8", "four-character hex unicode lowercase"), - # With Vertical bar - (r"|01d451", 0, 7, "\U0001D451", "six-character hex unicode lowercase"), + # With Vertical bar (6-character hex) + (r"|01d450", 0, 7, "\U0001D450", "six-character hex unicode lowercase"), (r"|01D451", 0, 7, "\U0001D451", "six-character hex unicode uppercase"), + # Named Characters + ("[Theta]", 0, 7, "\u03B8", "Named character; full string"), + ("abcd[CapitalPi]efg", 4, 15, "\u03A0", "Named character; internal"), + (r"z \[Conjugate]", 3, 14, "\uF3C8", "Named character; at end"), + ("[Integral]", 0, 10, "\u222b", "Another full-string named-character"), ): assert parse_escape_sequence(text, pos) == (expect_str, expect_pos), fail_msg From f6846a2d22bccb109d2c693d3d08a29975320f87 Mon Sep 17 00:00:00 2001 From: rocky Date: Mon, 14 Apr 2025 11:25:02 -0400 Subject: [PATCH 06/34] Remove prescanner and .. handle syntax errors in mathics3-tokens. 
--- mathics_scanner/escape_sequences.py | 4 ++-- mathics_scanner/tokeniser.py | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/mathics_scanner/escape_sequences.py b/mathics_scanner/escape_sequences.py index 35f6ab38..8f23b2b6 100644 --- a/mathics_scanner/escape_sequences.py +++ b/mathics_scanner/escape_sequences.py @@ -114,8 +114,8 @@ def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]: # Note that these are a similer to Python, but are different. # In particular, Python defines "\a" to be ^G (control G), # but in WMA, this is invalid. - elif c in "ntbfr": - if c == "n": + elif c in "ntbfr\n": + if c in "n\n": result += "\n" elif c == "t": result += "\t" diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 977f97a2..3f442556 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -517,9 +517,7 @@ def __init__(self, feeder): ) self.pos: int = 0 self.feeder = feeder - self.source_text = "" - - # FIXME: remove this. + self.source_text = self.feeder.feed() self.mode: str = "invalid" # Set to True when inside box parsing. @@ -579,7 +577,7 @@ def next(self) -> Token: "Returns the next token from self.source_text." self._skip_blank() source_text = self.source_text - if self.pos >= len(self.source_text): + if self.pos >= len(source_text): return Token("END", "", len(source_text)) # Look for a matching pattern. From ccfe94357320fd22672002d0ca59e98edf662ace Mon Sep 17 00:00:00 2001 From: rocky Date: Mon, 14 Apr 2025 11:57:51 -0400 Subject: [PATCH 07/34] Rename some variables Tokenizer.code -> Tokenizer.source_text Tokenizer.incomplete -> Tokenizer.get_more_input --- mathics_scanner/tokeniser.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 3f442556..52efc21b 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -453,7 +453,7 @@ def is_symbol_name(text: str) -> bool: class Token: """A representation of a Wolfram-Language token. - Tokens are parsed by parser uses to build M-expressions. + Tokens are parsed by the parser; and are used to build M-expressions. A token has a `tag`, the class or type of the token. For example: a Number, Symbol, String, File, etc. @@ -466,11 +466,6 @@ class Token: """ def __init__(self, tag: str, text: str, pos: int): - """ - :param tag: which type of token this is. - :param text: The actual contents of the token. - :param pos: The position of the token in the input feed. - """ self.tag = tag self.text = text self.pos = pos @@ -518,6 +513,7 @@ def __init__(self, feeder): self.pos: int = 0 self.feeder = feeder self.source_text = self.feeder.feed() + self.mode: str = "invalid" # Set to True when inside box parsing. From 3d0a2f72ef38d55eee9edde159ff42c8dfdf0cf5 Mon Sep 17 00:00:00 2001 From: rocky Date: Mon, 14 Apr 2025 13:00:58 -0400 Subject: [PATCH 08/34] Bang more on mathics3-tokens Start to show syntax errors. --- mathics_scanner/mathics3_tokens.py | 1 + mathics_scanner/tokeniser.py | 39 ++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/mathics_scanner/mathics3_tokens.py b/mathics_scanner/mathics3_tokens.py index db6a163c..8f06db74 100644 --- a/mathics_scanner/mathics3_tokens.py +++ b/mathics_scanner/mathics3_tokens.py @@ -181,6 +181,7 @@ def interactive_eval_loop(shell: TerminalShell, code_tokenize_format: bool): "sntufn", "Unknown unicode longname", ) + except KeyboardInterrupt: print("\nKeyboardInterrupt. 
Type Ctrl-D (EOF) to exit.")
         except EOFError:
diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py
index 52efc21b..8f8a0e7a 100644
--- a/mathics_scanner/tokeniser.py
+++ b/mathics_scanner/tokeniser.py
@@ -602,6 +602,8 @@ def next(self) -> Token:
             if override is not None:
                 return override(pattern_match)
 
+        # Failing a custom tokenization rule, we use the regular expression
+        # pattern match.
         text = pattern_match.group(0)
         self.pos = pattern_match.end(0)
         return Token(tag, text, pattern_match.start(0))
@@ -672,6 +674,43 @@ def t_PutAppend(self, pattern_match: re.Match) -> Token:
         "Scan for a ``PutAppend`` token and return that"
         return self._token_mode(pattern_match, "PutAppend", "filename")
 
+    def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token:
+        """Break out from ``pattern_match`` tokens which start with \\"""
+        source_text = self.source_text
+        start_pos = self.pos + 1
+        if start_pos == len(source_text):
+            # We have reached end of the input line before seeing a terminating
+            # quote ("). Fetch another line.
+            self.get_more_input()
+            self.pos += 1
+            source_text += self.source_text
+        escape_str, self.pos = parse_escape_sequence(source_text, start_pos)
+
+        # DRY with "next()"
+        # look for a matching pattern
+        indices = self.token_indices.get(escape_str[0], ())
+        pattern_match = None
+        tag = "??invalid"
+        if indices:
+            for index in indices:
+                tag, pattern = self.tokens[index]
+                pattern_match = pattern.match(escape_str, 0)
+                if pattern_match is not None:
+                    break
+        else:
+            for tag, pattern in self.tokens:
+                pattern_match = pattern.match(escape_str, start_pos)
+                if pattern_match is not None:
+                    break
+
+        # no matching pattern found
+        if pattern_match is None:
+            tag, pre, post = self.sntx_message()
+            raise ScanError(tag, pre, post)
+
+        text = pattern_match.group(0)
+        return Token(tag, text, pattern_match.start(0))
+
     def t_String(self, _: re.Match) -> Token:

From 1c03e8bbd84506dd73c5b252a2c96950bfc4cc61 Mon Sep 17 00:00:00 2001
From: rocky
Date: Tue, 15 Apr 2025 11:10:18 -0400
Subject: [PATCH 09/34] Start going over error messages...

In particular, errors with octal digits and incomplete named-character
errors.

Go over docstrings in escape_sequences.py
---
 mathics_scanner/escape_sequences.py | 56 +++++++++++++++--------------
 mathics_scanner/mathics3_tokens.py  |  1 -
 mathics_scanner/tokeniser.py        |  6 +++-
 test/test_escape_sequences.py       |  9 +++++
 4 files changed, 44 insertions(+), 28 deletions(-)

diff --git a/mathics_scanner/escape_sequences.py b/mathics_scanner/escape_sequences.py
index 8f23b2b6..913acaf0 100644
--- a/mathics_scanner/escape_sequences.py
+++ b/mathics_scanner/escape_sequences.py
@@ -17,24 +17,20 @@ def parse_base(source_text: str, start_shift: int, end_shift: int, base: int) ->
     See if characters start_shift .. end shift
     can be converted to an integer in base ``base``.
 
-    If so, chr(integer value converted from base).
+    If so, chr(integer value converted from base) is returned.
 
-    However, if the conversion fails, then error messages are
-    issued and nothing is updated
+    However, if the conversion fails, ScanError is raised.
""" - start, end = start_shift, end_shift - result = None - if end <= len(source_text): - text = source_text[start:end] - try: - result = int(text, base) - except ValueError: - pass # result remains None - if result is None: - last = end - start + assert start_shift <= end_shift <= len(source_text) + text = source_text[start_shift:end_shift] + try: + result = int(text, base) + except ValueError: + last = start_shift - end_shift if last == 2: tag = "sntoct2" elif last == 3: + assert base == 8, "Only octal requires 3 digits" tag = "sntoct1" elif last == 4: tag = "snthex" @@ -46,12 +42,18 @@ def parse_base(source_text: str, start_shift: int, end_shift: int, base: int) -> def parse_named_character(source_text: str, start: int, finish: int) -> Optional[str]: - r"""Before calling we have matched "\[". Scan to the remaining "]" and - try to match what is found in-between with a known named - character, e.g. "Theta". If we can match this, we store - the unicode character equivalent in ``line_fragments``. - If we can't find a named character, error messages are - issued and we leave ``line_fragments`` untouched. + r""" + Find the unicode-equivalent symbol for a string named character. + + Before calling we have matched the text between "\[" and "]" of the input. + + The name character is thus in source_text[start:finish]. + + Match this string with the known named characters, + e.g. "Theta". If we can match this, then we return the unicode equivalent from the + `named_characters` map (which is read in from JSON but stored in a YAML file). + + If we can't find the named character, rasie NamedCharacterSyntaxError. """ named_character = source_text[start:finish] if named_character.isalpha(): @@ -63,9 +65,9 @@ def parse_named_character(source_text: str, start: int, finish: int) -> Optional def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]: - """ - Given some source text `source_text` at position `pos`, return the escape sequence and the - follow-on position. + """Given some source text in `source_text` starting at offset + `pos`, return the escape-sequence value for this text and the + follow-on offset position. """ result = "" c = source_text[pos] @@ -98,12 +100,14 @@ def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]: if i == len(source_text): # Note: named characters do not have \n's in them. (Is this right)? # FIXME: decide what to do here. - raise EscapeSyntaxError("Syntax", "stresc" rf"\{c}.") + raise NamedCharacterSyntaxError("Syntax", "sntufn", source_text[pos:]) named_character = parse_named_character(source_text, pos, i) - if named_character is not None: - result += named_character - pos = i + 1 + if named_character is None: + raise NamedCharacterSyntaxError("Syntax", "sntufn", source_text[pos:i]) + + result += named_character + pos = i + 1 elif c in "01234567": # See if we have a 3-digit octal number. # For example \065 = "5" diff --git a/mathics_scanner/mathics3_tokens.py b/mathics_scanner/mathics3_tokens.py index 8f06db74..db6a163c 100644 --- a/mathics_scanner/mathics3_tokens.py +++ b/mathics_scanner/mathics3_tokens.py @@ -181,7 +181,6 @@ def interactive_eval_loop(shell: TerminalShell, code_tokenize_format: bool): "sntufn", "Unknown unicode longname", ) - except KeyboardInterrupt: print("\nKeyboardInterrupt. 
Type Ctrl-D (EOF) to exit.") except EOFError: diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 8f8a0e7a..9543b19d 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -684,7 +684,11 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token: self.get_more_input() self.pos += 1 source_text += self.source_text - escape_str, self.pos = parse_escape_sequence(source_text, start_pos) + try: + escape_str, self.pos = parse_escape_sequence(source_text, start_pos) + except ScanError as scan_error: + self.feeder.message("Syntax", scan_error.tag, scan_error.args[0]) + raise # DRY with "next()" # look for a matching pattern diff --git a/test/test_escape_sequences.py b/test/test_escape_sequences.py index 86ef583b..b5b94201 100644 --- a/test/test_escape_sequences.py +++ b/test/test_escape_sequences.py @@ -1,4 +1,7 @@ # -*- coding: utf-8 -*- +import pytest + +from mathics_scanner.errors import NamedCharacterSyntaxError from mathics_scanner.escape_sequences import parse_escape_sequence @@ -35,3 +38,9 @@ def test_escape_sequences(): ("[Integral]", 0, 10, "\u222b", "Another full-string named-character"), ): assert parse_escape_sequence(text, pos) == (expect_str, expect_pos), fail_msg + + +def test_incomplete_named_character_sequences(): + for text in (r"\[", r"\[Theta", r"\[Fake]", r"\[abc]"): + with pytest.raises(NamedCharacterSyntaxError): + parse_escape_sequence(text, 1) From 3c1b9770d11862cdd9d65f606300ab1352769aa5 Mon Sep 17 00:00:00 2001 From: rocky Date: Wed, 16 Apr 2025 22:00:58 -0400 Subject: [PATCH 10/34] Improve error handling... and add more tests. --- mathics_scanner/escape_sequences.py | 26 +++++++++++++++----------- test/test_escape_sequences.py | 28 ++++++++++++++++++++++++++-- 2 files changed, 41 insertions(+), 13 deletions(-) diff --git a/mathics_scanner/escape_sequences.py b/mathics_scanner/escape_sequences.py index 913acaf0..2e8c8fed 100644 --- a/mathics_scanner/escape_sequences.py +++ b/mathics_scanner/escape_sequences.py @@ -21,21 +21,25 @@ def parse_base(source_text: str, start_shift: int, end_shift: int, base: int) -> However, if the conversion fails, ScanError is raised. 
""" - assert start_shift <= end_shift <= len(source_text) + last = end_shift - start_shift + if last == 2: + tag = "sntoct2" + elif last == 3: + assert base == 8, "Only octal requires 3 digits" + tag = "sntoct1" + elif last in (4, 6): + tag = "snthex" + else: + raise ValueError() + + if end_shift > len(source_text): + raise ScanError("Syntax", tag) + + assert start_shift <= end_shift text = source_text[start_shift:end_shift] try: result = int(text, base) except ValueError: - last = start_shift - end_shift - if last == 2: - tag = "sntoct2" - elif last == 3: - assert base == 8, "Only octal requires 3 digits" - tag = "sntoct1" - elif last == 4: - tag = "snthex" - else: - raise ValueError() raise ScanError(tag, source_text[start_shift:].rstrip("\n")) return chr(result) diff --git a/test/test_escape_sequences.py b/test/test_escape_sequences.py index b5b94201..1547d066 100644 --- a/test/test_escape_sequences.py +++ b/test/test_escape_sequences.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import pytest -from mathics_scanner.errors import NamedCharacterSyntaxError +from mathics_scanner.errors import NamedCharacterSyntaxError, ScanError from mathics_scanner.escape_sequences import parse_escape_sequence @@ -40,7 +40,31 @@ def test_escape_sequences(): assert parse_escape_sequence(text, pos) == (expect_str, expect_pos), fail_msg -def test_incomplete_named_character_sequences(): +def test_invalid_named_character_sequences(): for text in (r"\[", r"\[Theta", r"\[Fake]", r"\[abc]"): with pytest.raises(NamedCharacterSyntaxError): parse_escape_sequence(text, 1) + + +def test_invalid_number_encoding(): + for text in ( + # Octal + "093", # 9 is not in 0-7 + "01", # need 3 characters + "01", # need 3 characters + # 2-character hex + ".", + ".0", + ".0i", # i is not in 0-f + # 4-character hex + ":", + ":A", + ":A1", + ":ak", + ":A10", + ":a1g", + ":A1g9", + ":01-2", + ): + with pytest.raises(ScanError): + parse_escape_sequence(text, 0) From ded888503971256e531e1b56fa8753cccdbf6b39 Mon Sep 17 00:00:00 2001 From: rocky Date: Wed, 14 May 2025 12:27:02 -0400 Subject: [PATCH 11/34] Improve scanner... 
named-characters.yml: \[Mu] is letterlike
tokeniser.py: Correct the identifier pattern for those having letterlike
escape sequences
---
 mathics_scanner/tokeniser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py
index 9543b19d..e6f82b5c 100644
--- a/mathics_scanner/tokeniser.py
+++ b/mathics_scanner/tokeniser.py
@@ -703,7 +703,7 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token:
                     break
         else:
             for tag, pattern in self.tokens:
-                pattern_match = pattern.match(escape_str, start_pos)
+                pattern_match = pattern.match(escape_str, 0)
                 if pattern_match is not None:
                     break
 
From 41fdc74745bcaab3623e88b7f6da1fe0379da2cd Mon Sep 17 00:00:00 2001
From: rocky
Date: Fri, 16 May 2025 13:07:47 -0400
Subject: [PATCH 12/34] Handle EscapeSequence errors better

---
 mathics_scanner/escape_sequences.py |  2 +-
 mathics_scanner/tokeniser.py        | 11 ++++++++---
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/mathics_scanner/escape_sequences.py b/mathics_scanner/escape_sequences.py
index 2e8c8fed..dd420e2e 100644
--- a/mathics_scanner/escape_sequences.py
+++ b/mathics_scanner/escape_sequences.py
@@ -139,5 +139,5 @@ def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]:
         result += c
         pos += 1
     else:
-        raise EscapeSyntaxError("Syntax", "stresc" rf"\{c}.")
+        raise EscapeSyntaxError("Syntax", "stresc", rf"\{c}")
     return result, pos
diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py
index e6f82b5c..481a2520 100644
--- a/mathics_scanner/tokeniser.py
+++ b/mathics_scanner/tokeniser.py
@@ -11,7 +11,7 @@
 from typing import Dict, List, Optional, Tuple
 
 from mathics_scanner.characters import _letterlikes, _letters
-from mathics_scanner.errors import IncompleteSyntaxError, ScanError
+from mathics_scanner.errors import EscapeSyntaxError, IncompleteSyntaxError, ScanError
 from mathics_scanner.escape_sequences import parse_escape_sequence
 
 try:
@@ -743,10 +743,15 @@ def t_String(self, _: re.Match) -> Token:
             if char == "\\":
                 if self.pos + 1 == len(source_text):
                     # We have reached end of the input line before seeing a terminating
-                    # quote ("). Fetch aanother line.
+                    # quote ("). Fetch another line.
                     self.get_more_input()
                 self.pos += 1
-                escape_str, self.pos = parse_escape_sequence(source_text, self.pos)
+                try:
+                    escape_str, self.pos = parse_escape_sequence(source_text, self.pos)
+                except EscapeSyntaxError as e:
+                    self.feeder.message(e.name, *e.args)
+                    raise
 
                 result += escape_str
             else:
                 result += self.source_text[self.pos]

From fa9b1a9c4a7ac6a865964e50fd161d79640b90bc Mon Sep 17 00:00:00 2001
From: rocky
Date: Fri, 16 May 2025 22:32:46 -0400
Subject: [PATCH 13/34] Handle embedded escape sequences in Symbols...

and also add Theta to the list of letterlike symbols
---
 mathics_scanner/tokeniser.py | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py
index 481a2520..14c6d46f 100644
--- a/mathics_scanner/tokeniser.py
+++ b/mathics_scanner/tokeniser.py
@@ -606,6 +606,27 @@ def next(self) -> Token:
         # pattern match.
         text = pattern_match.group(0)
         self.pos = pattern_match.end(0)
+
+        if tag == "Symbol":
+            # We have to keep searching for the end of the Symbol if
+            # the next symbol is a backslash, "\", because it might be a
+            # named-letterlike character such as \[Mu] or an escape representation of a number or
+            # character.
+            # abc\[Mu] is a valid 4-character symbol.
+            while self.pos < len(source_text) and source_text[self.pos] == "\\":
+                try:
+                    escape_str, next_pos = parse_escape_sequence(
+                        self.source_text, self.pos + 1
+                    )
+                except ScanError as scan_error:
+                    self.feeder.message("Syntax", scan_error.tag, scan_error.args[0])
+                    raise
+                if escape_str in _letterlikes + "0123456789":
+                    text += escape_str
+                    self.pos = next_pos
+                else:
+                    break
+
         return Token(tag, text, pattern_match.start(0))

     def _skip_blank(self):

From 8c582f5d288c2d5fbc9e3b2889e4d50378fae73f Mon Sep 17 00:00:00 2001
From: rocky
Date: Sat, 17 May 2025 20:00:48 -0400
Subject: [PATCH 14/34] WIP - bang on Symbol tokenization with backslash

Replace .format() with f-strings.
Add comments around Symbol pattern.
sntx_message() Exception now saves name, tag, and args
---
 mathics_scanner/tokeniser.py | 32 ++++++++++++++++++++++++++++----
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py
index 14c6d46f..06cf42ae 100644
--- a/mathics_scanner/tokeniser.py
+++ b/mathics_scanner/tokeniser.py
@@ -612,7 +612,8 @@ def next(self) -> Token:
             # the next symbol is a backslash, "\", because it might be a
             # named-letterlike character such as \[Mu] or a escape representation of number or
             # character.
-            # abc\[Mu] is a valid 4-character Symbol.
+            # abc\[Mu] is a valid 4-character Symbol. And we can have things like
+            # abc\[Mu]\[Mu]def\[Mu]1
             while self.pos < len(source_text) and source_text[self.pos] == "\\":
                 try:
                     escape_str, next_pos = parse_escape_sequence(
                         self.source_text, self.pos + 1
@@ -711,8 +712,9 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token:
             self.feeder.message("Syntax", scan_error.tag, scan_error.args[0])
             raise

-        # DRY with "next()"
+        # Is there a way to DRY with "next()?"
+
-        # look for a matching pattern
+        # Look for a matching pattern.
         indices = self.token_indices.get(escape_str[0], ())
         pattern_match = None
         tag = "??invalid"
@@ -728,12 +730,34 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token:
             if pattern_match is not None:
                 break

-        # no matching pattern found
+        # No matching pattern found.
         if pattern_match is None:
             tag, pre, post = self.sntx_message()
             raise ScanError(tag, pre, post)

         text = pattern_match.group(0)
+
+        if tag == "Symbol":
+            # We have to keep searching for the end of the Symbol if
+            # the next symbol is a backslash, "\", because it might be a
+            # named-letterlike character such as \[Mu] or a escape representation of number or
+            # character.
+            # \[Mu]2 is a valid 2-character Symbol, and we can have things like
+            # \[Mu]\[Mu]def\[Mu]1.
+ while self.pos < len(source_text) and source_text[self.pos] == "\\": + try: + escape_str, next_pos = parse_escape_sequence( + self.source_text, self.pos + 1 + ) + except ScanError as scan_error: + self.feeder.message("Syntax", scan_error.tag, scan_error.args[0]) + raise + if escape_str in _letterlikes + "0123456789": + text += escape_str + self.pos = next_pos + else: + break + return Token(tag, text, pattern_match.start(0)) def t_String(self, _: re.Match) -> Token: From c1c015ce7d08413f1bce611f9f6d09911aba7915 Mon Sep 17 00:00:00 2001 From: rocky Date: Sun, 18 May 2025 12:05:39 -0400 Subject: [PATCH 15/34] Be able to whether we are in a RowBox --- mathics_scanner/tokeniser.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 06cf42ae..e00bb551 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -619,10 +619,16 @@ def next(self) -> Token: escape_str, next_pos = parse_escape_sequence( self.source_text, self.pos + 1 ) - except ScanError as scan_error: - self.feeder.message("Syntax", scan_error.tag, scan_error.args[0]) + except EscapeSyntaxError as escape_error: + if self.is_inside_box: + # Follow-on symbol may be a escape character that can + # appear only in box constructs, e.g. \%. + break + self.feeder.message( + "Syntax", escape_error.tag, escape_error.args[0] + ) raise - if escape_str in _letterlikes + "0123456789": + if escape_str in _letterlikes: text += escape_str self.pos = next_pos else: From 68346c05e6910c9daea3ae3ef2589e1308ee4aed Mon Sep 17 00:00:00 2001 From: rocky Date: Sun, 18 May 2025 22:34:43 -0400 Subject: [PATCH 16/34] Handle no-meaning operators Not sure how this worked before, but it did. --- mathics_scanner/tokeniser.py | 69 ++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 19 deletions(-) diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index e00bb551..773be5e4 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -625,7 +625,7 @@ def next(self) -> Token: # appear only in box constructs, e.g. \%. break self.feeder.message( - "Syntax", escape_error.tag, escape_error.args[0] + escape_error.name, escape_error.tag, escape_error.args ) raise if escape_str in _letterlikes: @@ -706,21 +706,29 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token: """Break out from ``pattern_match`` tokens which start with \\""" source_text = self.source_text start_pos = self.pos + 1 + named_character = "" if start_pos == len(source_text): - # We have reached end of the input line before seeing a terminating - # quote ("). Fetch another line. + # We have reached end of the input line before seeing a termination + # of backslash. Fetch another line. self.get_more_input() self.pos += 1 source_text += self.source_text try: escape_str, self.pos = parse_escape_sequence(source_text, start_pos) - except ScanError as scan_error: - self.feeder.message("Syntax", scan_error.tag, scan_error.args[0]) + if source_text[start_pos] == "[" and source_text[self.pos - 1] == "]": + named_character = source_text[start_pos + 1 : self.pos - 1] + except EscapeSyntaxError as escape_error: + self.feeder.message(escape_error.name, escape_error.tag, escape_error.args) raise # Is there a way to DRY with "next()?" - # Look for a matching pattern. 
+ if named_character != "": + if named_character in NO_MEANING_OPERATORS: + return Token(named_character, escape_str, start_pos - 1) + + # Look for a pattern matching leading context \. + indices = self.token_indices.get(escape_str[0], ()) pattern_match = None tag = "??invalid" @@ -736,7 +744,7 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token: if pattern_match is not None: break - # No matching pattern found. + # No matching found. if pattern_match is None: tag, pre, post = self.sntx_message() raise ScanError(tag, pre, post) @@ -744,23 +752,44 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token: text = pattern_match.group(0) if tag == "Symbol": - # We have to keep searching for the end of the Symbol if - # the next symbol is a backslash, "\", because it might be a - # named-letterlike character such as \[Mu] or a escape representation of number or - # character. - # \[Mu]2 is a valid 2-character Symbol, and we can have things like - # \[Mu]\[Mu]def\[Mu]1. - while self.pos < len(source_text) and source_text[self.pos] == "\\": + # We have to keep searching for the end of the Symbol + # after an escaped letterlike-symbol. For example, \[Mu] + # is a valid Symbol. But we can also have symbols for + # \[Mu]\[Theta], \[Mu]1, \[Mu]1a, \[Mu]\.42, \[Mu]\061, or \[Mu]\061abc + while True: + if self.pos >= len(source_text): + break + + # Try to extend symbol with non-escaped alphanumeric + # (and letterlike) symbols. + + # TODO: Do we need to add context breaks? And if so, + # do we need to check for consecutive ``'s? + alphanumeric_match = re.match( + f"[0-9${symbol_first_letter}]+", self.source_text[self.pos :] + ) + if alphanumeric_match is not None: + extension_str = alphanumeric_match.group(0) + text += extension_str + self.pos += len(extension_str) + + if source_text[self.pos] != "\\": + break + + # Now try to extend symbol with *escaped* alphanumeric (and letterlike) symbols. try: escape_str, next_pos = parse_escape_sequence( self.source_text, self.pos + 1 ) - except ScanError as scan_error: - self.feeder.message("Syntax", scan_error.tag, scan_error.args[0]) + except EscapeSyntaxError as escape_error: + self.feeder.message( + escape_error.name, escape_error.tag, escape_error.args + ) raise - if escape_str in _letterlikes + "0123456789": + if escape_str in _letterlikes + _letters + "0123456789$": text += escape_str self.pos = next_pos + # Look to extend the symbol for further else: break @@ -799,8 +828,10 @@ def t_String(self, _: re.Match) -> Token: self.pos += 1 try: escape_str, self.pos = parse_escape_sequence(source_text, self.pos) - except EscapeSyntaxError as e: - self.feeder.message(e.name, *e.args) + except EscapeSyntaxError as escape_error: + self.feeder.message( + escape_error.name, escape_error.tag, escape_error.args + ) raise result += escape_str From 3fe6a2beb3b74e97e50456d3ff8fdaccbaf074cb Mon Sep 17 00:00:00 2001 From: rocky Date: Mon, 19 May 2025 11:25:34 -0400 Subject: [PATCH 17/34] WIP misc fixes... * "$\" is a thing * Correct EscapeSyntaxError error message * Better Symbol tokenization for things like a\[Mu]1. More in next commit though. 
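For reference, the behavior this commit is after can be exercised directly through
parse_escape_sequence(); the sketch below is illustrative only, and the exact
(decoded-string, next-position) return values assume the contract exercised in
test_escape_sequences.py, where pos points just past the backslash:

    from mathics_scanner.escape_sequences import parse_escape_sequence

    # Named character: decode "\[Theta]" starting after the backslash.
    assert parse_escape_sequence(r"\[Theta]", 1) == ("\u03b8", 8)
    # Octal and 2-digit hexadecimal escapes.
    assert parse_escape_sequence(r"a\051", 2) == (")", 5)
    assert parse_escape_sequence(r"\.42", 1) == ("B", 4)
    # Per this commit, "\$" is kept as a two-character sequence.
    assert parse_escape_sequence(r"\$", 1) == (r"\$", 2)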
--- mathics_scanner/escape_sequences.py | 7 ++++-- mathics_scanner/tokeniser.py | 35 ++++++++++++++++++++++++----- 2 files changed, 34 insertions(+), 8 deletions(-) diff --git a/mathics_scanner/escape_sequences.py b/mathics_scanner/escape_sequences.py index dd420e2e..61c56b28 100644 --- a/mathics_scanner/escape_sequences.py +++ b/mathics_scanner/escape_sequences.py @@ -122,7 +122,7 @@ def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]: # Note that these are a similer to Python, but are different. # In particular, Python defines "\a" to be ^G (control G), # but in WMA, this is invalid. - elif c in "ntbfr\n": + elif c in "ntbfr$\n": if c in "n\n": result += "\n" elif c == "t": @@ -131,6 +131,9 @@ def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]: result += "\b" elif c == "f": result += "\f" + elif c == "$": + # I don't know why \$ is defined, but it is! + result += r"\$" else: assert c == "r" result += "\r" @@ -139,5 +142,5 @@ def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]: result += c pos += 1 else: - raise EscapeSyntaxError("Syntax", "stresc", rf"\{c}") + raise EscapeSyntaxError("stresc", rf"\{c}") return result, pos diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 773be5e4..06fe9545 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -607,6 +607,7 @@ def next(self) -> Token: text = pattern_match.group(0) self.pos = pattern_match.end(0) + # FIXME: DRY with code in RawBackslash if tag == "Symbol": # We have to keep searching for the end of the Symbol if # the next symbol is a backslash, "\", because it might be a @@ -614,7 +615,26 @@ def next(self) -> Token: # character. # abc\[Mu] is a valid 4-character Symbol. And we can have things like # abc\[Mu]\[Mu]def\[Mu]1 - while self.pos < len(source_text) and source_text[self.pos] == "\\": + while True: + if self.pos >= len(source_text): + break + + # Try to extend symbol with non-escaped alphanumeric + # (and letterlike) symbols. + + # TODO: Do we need to add context breaks? And if so, + # do we need to check for consecutive ``'s? + alphanumeric_match = re.match( + f"[0-9${symbol_first_letter}]+", self.source_text[self.pos :] + ) + if alphanumeric_match is not None: + extension_str = alphanumeric_match.group(0) + text += extension_str + self.pos += len(extension_str) + + if source_text[self.pos] != "\\": + break + try: escape_str, next_pos = parse_escape_sequence( self.source_text, self.pos + 1 @@ -625,7 +645,7 @@ def next(self) -> Token: # appear only in box constructs, e.g. \%. break self.feeder.message( - escape_error.name, escape_error.tag, escape_error.args + escape_error.name, escape_error.tag, *escape_error.args ) raise if escape_str in _letterlikes: @@ -718,7 +738,7 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token: if source_text[start_pos] == "[" and source_text[self.pos - 1] == "]": named_character = source_text[start_pos + 1 : self.pos - 1] except EscapeSyntaxError as escape_error: - self.feeder.message(escape_error.name, escape_error.tag, escape_error.args) + self.feeder.message(escape_error.name, escape_error.tag, *escape_error.args) raise # Is there a way to DRY with "next()?" @@ -752,6 +772,7 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token: text = pattern_match.group(0) if tag == "Symbol": + # FIXME: DRY with code in next() # We have to keep searching for the end of the Symbol # after an escaped letterlike-symbol. For example, \[Mu] # is a valid Symbol. 
But we can also have symbols for
@@ -776,12 +797,15 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token:
             if source_text[self.pos] != "\\":
                 break

-            # Now try to extend symbol with *escaped* alphanumeric (and letterlike) symbols.
             try:
                 escape_str, next_pos = parse_escape_sequence(
                     self.source_text, self.pos + 1
                 )
             except EscapeSyntaxError as escape_error:
+                if self.is_inside_box:
+                    # Follow-on symbol may be a escape character that can
+                    # appear only in box constructs, e.g. \%.
+                    break
                 self.feeder.message(
                     escape_error.name, escape_error.tag, escape_error.args
                 )
                 raise
             if escape_str in _letterlikes + _letters + "0123456789$":
                 text += escape_str
                 self.pos = next_pos
-                # Look to extend the symbol for further
             else:
                 break

From 17192927523c7d938b61a67f092361e2a9f0365e Mon Sep 17 00:00:00 2001
From: rocky
Date: Mon, 19 May 2025 12:02:03 -0400
Subject: [PATCH 18/34] Better Symbol-name extension test...

for things like \.78\.79

Improve comments around DRYing identifier/symbol_name extension
---
 mathics_scanner/tokeniser.py | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py
index 06fe9545..1cee58ab 100644
--- a/mathics_scanner/tokeniser.py
+++ b/mathics_scanner/tokeniser.py
@@ -607,7 +607,13 @@ def next(self) -> Token:
         text = pattern_match.group(0)
         self.pos = pattern_match.end(0)

-        # FIXME: DRY with code in RawBackslash
+        # The below similar to what we do in t_RawBackslash, but is is
+        # different. First, we need to look for a closing quote
+        # ("). Also, after parsing escape sequences, we can
+        # unconditionallhy add them on to the string. That is, we
+        # don't have to check whether the returned string can be valid
+        # in a Symbol name.
+
         if tag == "Symbol":
             # We have to keep searching for the end of the Symbol if
             # the next symbol is a backslash, "\", because it might be a
@@ -741,7 +747,7 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token:
             self.feeder.message(escape_error.name, escape_error.tag, *escape_error.args)
             raise

-        # Is there a way to DRY with "next()?"
+        # Is there a way to DRY with "next()?

         if named_character != "":
             if named_character in NO_MEANING_OPERATORS:
@@ -771,6 +777,9 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token:

         text = pattern_match.group(0)

+        # Is there a way to DRY with t_String?"
+        # See t_String for differences.
+
         if tag == "Symbol":
             # FIXME: DRY with code in next()
             # We have to keep searching for the end of the Symbol
@@ -787,7 +796,7 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token:
             # TODO: Do we need to add context breaks? And if so,
             # do we need to check for consecutive ``'s?
alphanumeric_match = re.match( - f"[0-9${symbol_first_letter}]+", self.source_text[self.pos :] + f"[0-9${symbol_first_letter}]+", source_text[self.pos :] ) if alphanumeric_match is not None: extension_str = alphanumeric_match.group(0) @@ -810,7 +819,7 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token: escape_error.name, escape_error.tag, escape_error.args ) raise - if escape_str in _letterlikes + _letters + "0123456789$": + if re.match(base_symbol_pattern, escape_str): text += escape_str self.pos = next_pos else: @@ -828,8 +837,16 @@ def t_String(self, _: re.Match) -> Token: newlines = [] source_text = self.source_text result = "" + + # The below similar to what we do in t_RawBackslash, but is is + # different. First, we need to look for a closing quote + # ("). Also, after parsing escape sequences, we can + # unconditionallhy add them on to the string. That is, we + # don't have to check whether the returned string can be valid + # in a Symbol name. + while True: - if self.pos >= len(self.source_text): + if self.pos >= len(source_text): if end is None: # reached end while still inside string self.get_more_input() From 42a3e8d1e025d7f5dcbadc466b13ba8fe6a473cd Mon Sep 17 00:00:00 2001 From: rocky Date: Tue, 20 May 2025 17:30:22 -0400 Subject: [PATCH 19/34] WIP - small tweaks before moving master forward This PR has gotten out of hand in size, we'll break it up into smaller chunks. --- mathics_scanner/escape_sequences.py | 4 +- mathics_scanner/tokeniser.py | 9 ++- test/test_prescanner.py | 96 ----------------------------- 3 files changed, 9 insertions(+), 100 deletions(-) delete mode 100644 test/test_prescanner.py diff --git a/mathics_scanner/escape_sequences.py b/mathics_scanner/escape_sequences.py index 61c56b28..2f86aed0 100644 --- a/mathics_scanner/escape_sequences.py +++ b/mathics_scanner/escape_sequences.py @@ -131,9 +131,9 @@ def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]: result += "\b" elif c == "f": result += "\f" - elif c == "$": + elif c in '$"': # I don't know why \$ is defined, but it is! - result += r"\$" + result += rf"\{c}" else: assert c == "r" result += "\r" diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 1cee58ab..3930ca84 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -776,6 +776,7 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token: raise ScanError(tag, pre, post) text = pattern_match.group(0) + start_pos = pattern_match.start(0) # Is there a way to DRY with t_String?" # See t_String for differences. @@ -819,13 +820,17 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token: escape_error.name, escape_error.tag, escape_error.args ) raise - if re.match(base_symbol_pattern, escape_str): + if re.match(interior_symbol_pattern, escape_str): text += escape_str self.pos = next_pos else: break - return Token(tag, text, pattern_match.start(0)) + elif tag == "String": + self.feeder.message("Syntax", "sntxi", text) + raise IncompleteSyntaxError("Syntax", "sntxi", text) + + return Token(tag, text, start_pos) def t_String(self, _: re.Match) -> Token: """Break out from self.source_text the next token which is expected to be a String. 
diff --git a/test/test_prescanner.py b/test/test_prescanner.py
deleted file mode 100644
index 25163d26..00000000
--- a/test/test_prescanner.py
+++ /dev/null
@@ -1,96 +0,0 @@
-# -*- coding: utf-8 -*-
-import pytest
-
-from mathics_scanner import IncompleteSyntaxError, ScanError
-from mathics_scanner.feed import SingleLineFeeder
-from mathics_scanner.prescanner import Prescanner
-
-
-def replace_escape_sequences(mathics_text: str):
-    prescanner = Prescanner(SingleLineFeeder(mathics_text))
-    return prescanner.replace_escape_sequences()
-
-
-def assert_invalid(mathics_text: str):
-    with pytest.raises(ScanError):
-        replace_escape_sequences(mathics_text)
-
-
-def assert_incomplete(mathics_text: str):
-    with pytest.raises(IncompleteSyntaxError):
-        replace_escape_sequences(mathics_text)
-
-
-def assert_equal(mathics_text: str, result: str):
-    assert replace_escape_sequences(mathics_text) == result
-
-
-def assert_equal_length(mathics_text: str, length):
-    assert len(replace_escape_sequences(mathics_text)) == length
-
-
-@pytest.mark.skip("Prescanner tests need to be integrated outside of prescanner")
-def test_named_characters():
-    assert_equal(r"\[Theta]", "\u03B8")
-    assert_equal(r"\[CapitalPi]", "\u03A0")
-    assert_equal(r"\[Fake]", r"\[Fake]")
-    assert_equal("z \\[Conjugate]", "z \uF3C8")
-    assert_equal("z \\[Integral]", "z \u222b")
-    assert_equal("z \\\\[Integral]", "z \\\\[Integral]")
-    assert_equal("z \\\\\\[Integral]", "z \\\\\u222b")
-    assert_equal("abc\\\\", "abc\\\\")
-
-
-@pytest.mark.skip("Prescanner tests need to be integrated outside of prescanner")
-def test_text_lengths():
-    assert_equal_length(r'"\[Integral]"', 3)
-    # Prescanner keep both slashes and quotes.
-    # The tokenizer brings \\ into \ if it appears
-    # inside a string.
-    assert_equal_length(r'"\\[Integral]"', 14)
-
-
-@pytest.mark.skip("Prescanner tests need to be integrated outside of prescanner")
-def test_incomplete():
-    assert_incomplete(r"\[")
-    assert_incomplete(r"\[Theta")
-
-
-@pytest.mark.skip("Prescanner tests need to be integrated outside of prescanner")
-def test_invalid_octal():
-    assert_invalid(r"\093")
-    assert_invalid(r"\01")
-
-
-@pytest.mark.skip("Prescanner tests need to be integrated outside of prescanner")
-def test_invalid_colon():
-    assert_invalid(r"\:")
-    assert_invalid(r"\:A")
-    assert_invalid(r"\:01")
-    assert_invalid(r"\:A1")
-    assert_invalid(r"\:ak")
-    assert_invalid(r"\:A10")
-    assert_invalid(r"\:a1g")
-    assert_invalid(r"\:A1g9")
-    assert_invalid(r"\:01-2")
-
-
-@pytest.mark.skip("Prescanner tests need to be integrated outside of prescanner")
-def test_invalid_dot():
-    assert_invalid(r"\.")
-    assert_invalid(r"\.0")
-
-
-@pytest.mark.skip("Prescanner tests need to be integrated outside of prescanner")
-def test_combined():
-    assert_equal(r"\:03B8\[Theta]\.30\052", "\u03B8\u03B80*")
-
-
-@pytest.mark.skip("Prescanner tests need to be integrated outside of prescanner")
-def test_nested():
-    assert_equal(r"\[Thet\141]", r"\[Thet\141]")
-
-
-@pytest.mark.skip("Prescanner tests need to be integrated outside of prescanner")
-def test_trailing_backslash():
-    assert_incomplete("x \\")

From 9c596becdcca0cb2639c395decd5d4c1a5196dd0 Mon Sep 17 00:00:00 2001
From: rocky
Date: Thu, 29 May 2025 12:19:30 -0400
Subject: [PATCH 20/34] Small bugs related to escape-character handling

NamedCharacterSyntaxError should be a new-style TranslateError

self.code -> self.source_text

misc sntx_message() fixes.

Document better.
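As a sketch of what "new-style" means here (this assumes the constructor
signature given to TranslateErrorNew earlier in this series), the exception
instance itself now carries the message parts that used to be passed around
separately:

    from mathics_scanner.errors import NamedCharacterSyntaxError

    try:
        raise NamedCharacterSyntaxError("sntufn", "Fake")
    except NamedCharacterSyntaxError as err:
        # A new-style error stores name, tag, and args on the instance, so
        # callers can forward them: feeder.message(err.name, err.tag, *err.args)
        assert (err.name, err.tag, err.args) == ("Syntax", "sntufn", ("Fake",))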
--- mathics_scanner/errors.py | 2 +- mathics_scanner/feed.py | 6 ++-- mathics_scanner/tokeniser.py | 69 ++++++++++++++++++++++-------------- 3 files changed, 46 insertions(+), 31 deletions(-) diff --git a/mathics_scanner/errors.py b/mathics_scanner/errors.py index 212a82e3..9210878c 100644 --- a/mathics_scanner/errors.py +++ b/mathics_scanner/errors.py @@ -34,7 +34,7 @@ class InvalidSyntaxError(TranslateErrorNew): pass -class NamedCharacterSyntaxError(TranslateError): +class NamedCharacterSyntaxError(TranslateErrorNew): """Named character syntax error""" pass diff --git a/mathics_scanner/feed.py b/mathics_scanner/feed.py index 8ae0717f..9674ae53 100644 --- a/mathics_scanner/feed.py +++ b/mathics_scanner/feed.py @@ -130,14 +130,14 @@ def empty(self) -> bool: class SingleLineFeeder(LineFeeder): "A feeder that feeds all the code as a single line." - def __init__(self, code: str, filename=""): + def __init__(self, source_text: str, filename=""): """ :param code: The source of the feeder (a string). :param filename: A string that describes the source of the feeder, i.e. the filename that is being feed. """ super().__init__(filename) - self.code = code + self.source_text = source_text self._empty = False def feed(self) -> str: @@ -145,7 +145,7 @@ def feed(self) -> str: return "" self._empty = True self.lineno += 1 - return self.code + return self.source_text def empty(self) -> bool: return self._empty diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 3930ca84..c37ca71c 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -11,7 +11,13 @@ from typing import Dict, List, Optional, Tuple from mathics_scanner.characters import _letterlikes, _letters -from mathics_scanner.errors import EscapeSyntaxError, IncompleteSyntaxError, ScanError +from mathics_scanner.errors import ( + EscapeSyntaxError, + IncompleteSyntaxError, + InvalidSyntaxError, + NamedCharacterSyntaxError, + ScanError, +) from mathics_scanner.escape_sequences import parse_escape_sequence try: @@ -553,19 +559,29 @@ def is_inside_box(self) -> bool: def is_inside_box(self, value: bool) -> None: self._is_inside_box = value - def sntx_message(self, pos: Optional[int] = None) -> Tuple[str, str, str]: - """ - Send a "sntx{b,f} error message to the input-reading feeder. + def sntx_message(self, start_pos: Optional[int] = None) -> Tuple[str, int, int]: + """Send a "sntx{b,f} error message to the input-reading + feeder. + + The tag ("sntxb" or "sntxf"), position of the error, and blank-stripped + position to the end line are returned. """ - if pos is None: - pos = self.pos - pre, post = self.source_text[:pos], self.source_text[pos:].rstrip("\n") - if pos == 0: - self.feeder.message("Syntax", "sntxb", pre, post) - return "sntxb", pre, post + if start_pos is None: + start_pos = self.pos + trailing_fragment = self.source_text[start_pos:].strip() + end_pos = start_pos + len(trailing_fragment) + if start_pos == 0: + self.feeder.message("Syntax", "sntxb", trailing_fragment) + tag = "sntxb" else: - self.feeder.message("Syntax", "sntxf", pre, post) - return "sntxf", pre, post + self.feeder.message( + "Syntax", + "sntxf", + self.source_text[:start_pos].strip(), + trailing_fragment, + ) + tag = "syntx" + return tag, start_pos, end_pos # TODO: If this is converted this to __next__, then # a tokeniser object is iterable. @@ -573,6 +589,7 @@ def next(self) -> Token: "Returns the next token from self.source_text." 
self._skip_blank() source_text = self.source_text + if self.pos >= len(source_text): return Token("END", "", len(source_text)) @@ -645,7 +662,7 @@ def next(self) -> Token: escape_str, next_pos = parse_escape_sequence( self.source_text, self.pos + 1 ) - except EscapeSyntaxError as escape_error: + except (EscapeSyntaxError, NamedCharacterSyntaxError) as escape_error: if self.is_inside_box: # Follow-on symbol may be a escape character that can # appear only in box constructs, e.g. \%. @@ -739,16 +756,16 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token: self.get_more_input() self.pos += 1 source_text += self.source_text + try: escape_str, self.pos = parse_escape_sequence(source_text, start_pos) if source_text[start_pos] == "[" and source_text[self.pos - 1] == "]": named_character = source_text[start_pos + 1 : self.pos - 1] - except EscapeSyntaxError as escape_error: + except (EscapeSyntaxError, NamedCharacterSyntaxError) as escape_error: self.feeder.message(escape_error.name, escape_error.tag, *escape_error.args) raise # Is there a way to DRY with "next()? - if named_character != "": if named_character in NO_MEANING_OPERATORS: return Token(named_character, escape_str, start_pos - 1) @@ -811,7 +828,7 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token: escape_str, next_pos = parse_escape_sequence( self.source_text, self.pos + 1 ) - except EscapeSyntaxError as escape_error: + except (EscapeSyntaxError, NamedCharacterSyntaxError) as escape_error: if self.is_inside_box: # Follow-on symbol may be a escape character that can # appear only in box constructs, e.g. \%. @@ -828,16 +845,16 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token: elif tag == "String": self.feeder.message("Syntax", "sntxi", text) - raise IncompleteSyntaxError("Syntax", "sntxi", text) + raise InvalidSyntaxError("Syntax", "sntxi", text) return Token(tag, text, start_pos) - def t_String(self, _: re.Match) -> Token: + def t_String(self, _: Optional[re.Match]) -> Token: """Break out from self.source_text the next token which is expected to be a String. The string value of the returned token will have double quote (") in the first and last postions of the returned string. """ - start, end = self.pos, None + end = None self.pos += 1 # skip opening '"' newlines = [] source_text = self.source_text @@ -846,7 +863,7 @@ def t_String(self, _: re.Match) -> Token: # The below similar to what we do in t_RawBackslash, but is is # different. First, we need to look for a closing quote # ("). Also, after parsing escape sequences, we can - # unconditionallhy add them on to the string. That is, we + # unconditionally add them on to the string. That is, we # don't have to check whether the returned string can be valid # in a Symbol name. @@ -873,7 +890,7 @@ def t_String(self, _: re.Match) -> Token: self.pos += 1 try: escape_str, self.pos = parse_escape_sequence(source_text, self.pos) - except EscapeSyntaxError as escape_error: + except (EscapeSyntaxError, NamedCharacterSyntaxError) as escape_error: self.feeder.message( escape_error.name, escape_error.tag, *escape_error.args ) @@ -884,12 +901,10 @@ def t_String(self, _: re.Match) -> Token: result += self.source_text[self.pos] self.pos += 1 - indices = [start] + newlines + [end] - result = "".join( - self.source_text[indices[i] : indices[i + 1]] - for i in range(len(indices) - 1) - ) - return Token("String", result, start) + # FIXME: rethink whether we really need quotes at the beginning and + # and of a string and redo. 
This will include revising whatever calls + # parser.unescape string(). + return Token("String", f'"{result}"', self.pos) # Call the function that initializes the dictionaries. From 74587ccde9ec19640e84aa02c563b1c4e2a3d42c Mon Sep 17 00:00:00 2001 From: rocky Date: Thu, 29 May 2025 12:47:57 -0400 Subject: [PATCH 21/34] Use git branch for testing Mathics --- .github/workflows/mathics.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mathics.yml b/.github/workflows/mathics.yml index 899075c6..cca93e00 100644 --- a/.github/workflows/mathics.yml +++ b/.github/workflows/mathics.yml @@ -33,7 +33,7 @@ jobs: git clone --depth 1 https://github.com/Mathics3/mathics-scanner.git (cd mathics-scanner && pip install -e .) # Until next Mathics3/mathics-core release is out... - git clone --depth 1 https://github.com/Mathics3/mathics-core.git + git clone --depth 1 --branch revise-escape-sequence-scanning https://github.com/Mathics3/mathics-core.git cd mathics-core/ make PIP_INSTALL_OPTS='[full]' # pip install Mathics3[full] From 25f56720f8ac88278d83523962f9993b45c3bbd4 Mon Sep 17 00:00:00 2001 From: rocky Date: Thu, 29 May 2025 17:09:55 -0400 Subject: [PATCH 22/34] Revise Scanner error exception class TranslateError, TranslateErrorNew, ScanError now become ScannerError --- docs/source/api.rst | 2 +- mathics_scanner/__init__.py | 8 ++------ mathics_scanner/errors.py | 29 +++++++++++------------------ mathics_scanner/escape_sequences.py | 8 ++++---- mathics_scanner/mathics3_tokens.py | 23 ++++++++++++++--------- mathics_scanner/tokeniser.py | 6 +++--- test/test_escape_sequences.py | 4 ++-- test/test_tokeniser.py | 12 ++++++++---- 8 files changed, 45 insertions(+), 47 deletions(-) diff --git a/docs/source/api.rst b/docs/source/api.rst index 7368a54b..c651554d 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -10,7 +10,7 @@ Tokenization Tokenization is performed by the ``Tokeniser`` class. The ``next`` method consumes characters from a feeder and returns a token if the tokenization -succeeds. If the tokenization fails an instance of ``TranslateError`` is +succeeds. If the tokenization fails an instance of ``ScannerError`` is raised. .. autoclass:: Tokeniser(object) diff --git a/mathics_scanner/__init__.py b/mathics_scanner/__init__.py index 5d9def81..af14255f 100644 --- a/mathics_scanner/__init__.py +++ b/mathics_scanner/__init__.py @@ -15,9 +15,7 @@ from mathics_scanner.errors import ( IncompleteSyntaxError, InvalidSyntaxError, - ScanError, - TranslateError, - TranslateErrorNew, + ScannerError, ) from mathics_scanner.feed import ( FileLineFeeder, @@ -36,12 +34,10 @@ "InvalidSyntaxError", "LineFeeder", "MultiLineFeeder", - "ScanError", + "ScannerError", "SingleLineFeeder", # "Token", # "Tokeniser", - "TranslateError", - "TranslateErrorNew", "__version__", "aliased_characters", # "is_symbol_name", diff --git a/mathics_scanner/errors.py b/mathics_scanner/errors.py index 9210878c..13e8e1eb 100644 --- a/mathics_scanner/errors.py +++ b/mathics_scanner/errors.py @@ -1,7 +1,13 @@ # -*- coding: utf-8 -*- -class TranslateErrorNew(Exception): +class ScannerError(Exception): + """Some sort of error in the scanning or tokenization phase parsing Mathics3. + + There are more specific kinds of exceptions subclassed from this + exception class. 
+ """ + def __init__(self, tag: str, *args): super().__init__() self.name = "Syntax" @@ -9,38 +15,25 @@ def __init__(self, tag: str, *args): self.args = args -class TranslateError(Exception): - """ - A generic class of tokenization errors. This exception is subclassed by other - tokenization errors - """ - - -class EscapeSyntaxError(TranslateErrorNew): +class EscapeSyntaxError(ScannerError): """Escape sequence syntax error""" pass -class IncompleteSyntaxError(TranslateErrorNew): +class IncompleteSyntaxError(ScannerError): """More characters were expected to form a valid token""" pass -class InvalidSyntaxError(TranslateErrorNew): +class InvalidSyntaxError(ScannerError): """Invalid syntax""" pass -class NamedCharacterSyntaxError(TranslateErrorNew): +class NamedCharacterSyntaxError(EscapeSyntaxError): """Named character syntax error""" pass - - -class ScanError(TranslateErrorNew): - """A generic scanning error""" - - pass diff --git a/mathics_scanner/escape_sequences.py b/mathics_scanner/escape_sequences.py index 2f86aed0..bfd8f2f9 100644 --- a/mathics_scanner/escape_sequences.py +++ b/mathics_scanner/escape_sequences.py @@ -8,7 +8,7 @@ from mathics_scanner.errors import ( EscapeSyntaxError, NamedCharacterSyntaxError, - ScanError, + ScannerError, ) @@ -19,7 +19,7 @@ def parse_base(source_text: str, start_shift: int, end_shift: int, base: int) -> If so, chr(integer value converted from base) is returnd. - However, if the conversion fails, ScanError is raised. + However, if the conversion fails, ScannerError is raised. """ last = end_shift - start_shift if last == 2: @@ -33,14 +33,14 @@ def parse_base(source_text: str, start_shift: int, end_shift: int, base: int) -> raise ValueError() if end_shift > len(source_text): - raise ScanError("Syntax", tag) + raise ScannerError("Syntax", tag) assert start_shift <= end_shift text = source_text[start_shift:end_shift] try: result = int(text, base) except ValueError: - raise ScanError(tag, source_text[start_shift:].rstrip("\n")) + raise ScannerError(tag, source_text[start_shift:].rstrip("\n")) return chr(result) diff --git a/mathics_scanner/mathics3_tokens.py b/mathics_scanner/mathics3_tokens.py index db6a163c..8403e95e 100644 --- a/mathics_scanner/mathics3_tokens.py +++ b/mathics_scanner/mathics3_tokens.py @@ -10,7 +10,7 @@ from mathics_scanner.errors import ( EscapeSyntaxError, NamedCharacterSyntaxError, - ScanError, + ScannerError, ) from mathics_scanner.feed import FileLineFeeder, LineFeeder, SingleLineFeeder from mathics_scanner.tokeniser import Tokeniser @@ -162,25 +162,30 @@ def interactive_eval_loop(shell: TerminalShell, code_tokenize_format: bool): try: source_text = shell.feed() tokens(source_text, code_tokenize_format) - except ScanError: - shell.errmsg( - "Syntax", - "sntxi", - "Expression error", - ) - pass except NamedCharacterSyntaxError: shell.errmsg( "Syntax", "sntufn", "Unknown unicode longname", ) + # This has to come after NamedCharacterSyntaxError + # since that is a subclass EscapeSyntaxError except EscapeSyntaxError: shell.errmsg( "Syntax", "sntufn", "Unknown unicode longname", ) + # This has to come after NamedCharacterSyntaxError and + # EscapeSyntaxError since those are subclasses of + # ScannerError + except ScannerError: + shell.errmsg( + "Syntax", + "sntxi", + "Expression error", + ) + pass except KeyboardInterrupt: print("\nKeyboardInterrupt. 
Type Ctrl-D (EOF) to exit.") except EOFError: @@ -199,7 +204,7 @@ def tokens(code, code_tokenize_format: bool): while True: try: token = tokeniser.next() - except ScanError as scan_error: + except ScannerError as scan_error: mess = "" if scan_error.tag == "sntoct1": mess = r"3 octal digits are required after \ to construct an 8-bit character" diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index c37ca71c..51385708 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -16,7 +16,7 @@ IncompleteSyntaxError, InvalidSyntaxError, NamedCharacterSyntaxError, - ScanError, + ScannerError, ) from mathics_scanner.escape_sequences import parse_escape_sequence @@ -612,7 +612,7 @@ def next(self) -> Token: # No matching pattern found. if pattern_match is None: tag, pre_str, post_str = self.sntx_message() - raise ScanError(tag, pre_str, post_str) + raise ScannerError(tag, pre_str, post_str) # Look for custom tokenization rules; those are defined with t_tag. override = getattr(self, "t_" + tag, None) @@ -790,7 +790,7 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token: # No matching found. if pattern_match is None: tag, pre, post = self.sntx_message() - raise ScanError(tag, pre, post) + raise ScannerError(tag, pre, post) text = pattern_match.group(0) start_pos = pattern_match.start(0) diff --git a/test/test_escape_sequences.py b/test/test_escape_sequences.py index 1547d066..c164b60f 100644 --- a/test/test_escape_sequences.py +++ b/test/test_escape_sequences.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import pytest -from mathics_scanner.errors import NamedCharacterSyntaxError, ScanError +from mathics_scanner.errors import NamedCharacterSyntaxError, ScannerError from mathics_scanner.escape_sequences import parse_escape_sequence @@ -66,5 +66,5 @@ def test_invalid_number_encoding(): ":A1g9", ":01-2", ): - with pytest.raises(ScanError): + with pytest.raises(ScannerError): parse_escape_sequence(text, 0) diff --git a/test/test_tokeniser.py b/test/test_tokeniser.py index 96ceee8d..f118d2bb 100644 --- a/test/test_tokeniser.py +++ b/test/test_tokeniser.py @@ -9,7 +9,11 @@ import pytest -from mathics_scanner.errors import IncompleteSyntaxError, InvalidSyntaxError, ScanError +from mathics_scanner.errors import ( + IncompleteSyntaxError, + InvalidSyntaxError, + ScannerError, +) from mathics_scanner.feed import SingleLineFeeder from mathics_scanner.tokeniser import Token, Tokeniser, is_symbol_name @@ -34,8 +38,8 @@ def invalid_error(error_message: str): tokens(error_message) -def scan_error(error_message): - with pytest.raises(ScanError): +def scanner_error(error_message): + with pytest.raises(ScannerError): tokens(error_message) @@ -137,7 +141,7 @@ def test_is_symbol(): def test_accuracy(): - scan_error("1.5``") + scanner_error("1.5``") check_number("1.0``20") check_number("1.0``0") check_number("1.4``-20") From e503b3aad7cb5b54979841081cc754b782ee9e4d Mon Sep 17 00:00:00 2001 From: rocky Date: Thu, 29 May 2025 17:18:24 -0400 Subject: [PATCH 23/34] Let's use 3.12 in CI testing it should be just a little bit faster (and it is more modern) --- .github/workflows/mathics.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mathics.yml b/.github/workflows/mathics.yml index cca93e00..17c13e6e 100644 --- a/.github/workflows/mathics.yml +++ b/.github/workflows/mathics.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.11'] + python-version: ['3.12'] steps: - uses: actions/checkout@v4 - 
name: Set up Python ${{ matrix.python-version }}

From e1b27faa54f0530dbaa1b74e3898c087f27d287 Mon Sep 17 00:00:00 2001
From: rocky
Date: Thu, 29 May 2025 17:38:51 -0400
Subject: [PATCH 24/34] Small tidying changes to comments

---
 mathics_scanner/tokeniser.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py
index 51385708..f84b9cb8 100644
--- a/mathics_scanner/tokeniser.py
+++ b/mathics_scanner/tokeniser.py
@@ -459,7 +459,9 @@ def is_symbol_name(text: str) -> bool:
 class Token:
     """A representation of a Wolfram-Language token.

-    Tokens are parsed by the parser; and are used to build M-expressions.
+    A Token is the next level of parsing abstraction above a raw input
+    Mathics input string. A sequence of tokens is the input for the
+    Mathics3 parser.

     A token has a `tag`, the class or type of the token. For example:
     a Number, Symbol, String, File, etc.
@@ -746,7 +748,7 @@ def t_PutAppend(self, pattern_match: re.Match) -> Token:
         return self._token_mode(pattern_match, "PutAppend", "filename")

     def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token:
-        """Break out from ``pattern_match`` tokens which start with \\"""
+        r"""Break out from ``pattern_match`` tokens which start with a backslash, '\'."""
         source_text = self.source_text
         start_pos = self.pos + 1
         named_character = ""

From c440e427e7716003cfb7f51227c038cacab58629 Mon Sep 17 00:00:00 2001
From: rocky
Date: Thu, 29 May 2025 18:23:46 -0400
Subject: [PATCH 25/34] ScannerError -> SyntaxError

Use a more direct and simpler error class name that is more like its
other subclassed errors.
---
 docs/source/api.rst                 | 2 +-
 mathics_scanner/__init__.py         | 4 ++--
 mathics_scanner/errors.py           | 8 ++++----
 mathics_scanner/escape_sequences.py | 8 ++++----
 mathics_scanner/mathics3_tokens.py  | 8 ++++----
 mathics_scanner/tokeniser.py        | 6 +++---
 test/test_escape_sequences.py       | 4 ++--
 test/test_tokeniser.py              | 4 ++--
 8 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/docs/source/api.rst b/docs/source/api.rst
index c651554d..cb02ec89 100644
--- a/docs/source/api.rst
+++ b/docs/source/api.rst
@@ -10,7 +10,7 @@ Tokenization

 Tokenization is performed by the ``Tokeniser`` class. The ``next`` method
 consumes characters from a feeder and returns a token if the tokenization
-succeeds. If the tokenization fails an instance of ``ScannerError`` is
+succeeds. If the tokenization fails an instance of ``SyntaxError`` is
 raised.

 .. autoclass:: Tokeniser(object)
diff --git a/mathics_scanner/__init__.py b/mathics_scanner/__init__.py
index af14255f..ecfec276 100644
--- a/mathics_scanner/__init__.py
+++ b/mathics_scanner/__init__.py
@@ -15,7 +15,7 @@
 from mathics_scanner.errors import (
     IncompleteSyntaxError,
     InvalidSyntaxError,
-    ScannerError,
+    SyntaxError,
 )
 from mathics_scanner.feed import (
     FileLineFeeder,
@@ -34,7 +34,7 @@
     "InvalidSyntaxError",
     "LineFeeder",
     "MultiLineFeeder",
-    "ScannerError",
+    "SyntaxError",
     "SingleLineFeeder",
     # "Token",
     # "Tokeniser",
diff --git a/mathics_scanner/errors.py b/mathics_scanner/errors.py
index 13e8e1eb..98b9c169 100644
--- a/mathics_scanner/errors.py
+++ b/mathics_scanner/errors.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-


-class ScannerError(Exception):
+class SyntaxError(Exception):
     """Some sort of error in the scanning or tokenization phase parsing Mathics3.
There are more specific kinds of exceptions subclassed from this @@ -15,19 +15,19 @@ def __init__(self, tag: str, *args): self.args = args -class EscapeSyntaxError(ScannerError): +class EscapeSyntaxError(SyntaxError): """Escape sequence syntax error""" pass -class IncompleteSyntaxError(ScannerError): +class IncompleteSyntaxError(SyntaxError): """More characters were expected to form a valid token""" pass -class InvalidSyntaxError(ScannerError): +class InvalidSyntaxError(SyntaxError): """Invalid syntax""" pass diff --git a/mathics_scanner/escape_sequences.py b/mathics_scanner/escape_sequences.py index bfd8f2f9..44295027 100644 --- a/mathics_scanner/escape_sequences.py +++ b/mathics_scanner/escape_sequences.py @@ -8,7 +8,7 @@ from mathics_scanner.errors import ( EscapeSyntaxError, NamedCharacterSyntaxError, - ScannerError, + SyntaxError, ) @@ -19,7 +19,7 @@ def parse_base(source_text: str, start_shift: int, end_shift: int, base: int) -> If so, chr(integer value converted from base) is returnd. - However, if the conversion fails, ScannerError is raised. + However, if the conversion fails, SyntaxError is raised. """ last = end_shift - start_shift if last == 2: @@ -33,14 +33,14 @@ def parse_base(source_text: str, start_shift: int, end_shift: int, base: int) -> raise ValueError() if end_shift > len(source_text): - raise ScannerError("Syntax", tag) + raise SyntaxError("Syntax", tag) assert start_shift <= end_shift text = source_text[start_shift:end_shift] try: result = int(text, base) except ValueError: - raise ScannerError(tag, source_text[start_shift:].rstrip("\n")) + raise SyntaxError(tag, source_text[start_shift:].rstrip("\n")) return chr(result) diff --git a/mathics_scanner/mathics3_tokens.py b/mathics_scanner/mathics3_tokens.py index 8403e95e..d6b0467a 100644 --- a/mathics_scanner/mathics3_tokens.py +++ b/mathics_scanner/mathics3_tokens.py @@ -10,7 +10,7 @@ from mathics_scanner.errors import ( EscapeSyntaxError, NamedCharacterSyntaxError, - ScannerError, + SyntaxError, ) from mathics_scanner.feed import FileLineFeeder, LineFeeder, SingleLineFeeder from mathics_scanner.tokeniser import Tokeniser @@ -178,8 +178,8 @@ def interactive_eval_loop(shell: TerminalShell, code_tokenize_format: bool): ) # This has to come after NamedCharacterSyntaxError and # EscapeSyntaxError since those are subclasses of - # ScannerError - except ScannerError: + # SyntaxError + except SyntaxError: shell.errmsg( "Syntax", "sntxi", @@ -204,7 +204,7 @@ def tokens(code, code_tokenize_format: bool): while True: try: token = tokeniser.next() - except ScannerError as scan_error: + except SyntaxError as scan_error: mess = "" if scan_error.tag == "sntoct1": mess = r"3 octal digits are required after \ to construct an 8-bit character" diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index f84b9cb8..9d508aef 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -16,7 +16,7 @@ IncompleteSyntaxError, InvalidSyntaxError, NamedCharacterSyntaxError, - ScannerError, + SyntaxError, ) from mathics_scanner.escape_sequences import parse_escape_sequence @@ -614,7 +614,7 @@ def next(self) -> Token: # No matching pattern found. if pattern_match is None: tag, pre_str, post_str = self.sntx_message() - raise ScannerError(tag, pre_str, post_str) + raise SyntaxError(tag, pre_str, post_str) # Look for custom tokenization rules; those are defined with t_tag. 
override = getattr(self, "t_" + tag, None) @@ -792,7 +792,7 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token: # No matching found. if pattern_match is None: tag, pre, post = self.sntx_message() - raise ScannerError(tag, pre, post) + raise SyntaxError(tag, pre, post) text = pattern_match.group(0) start_pos = pattern_match.start(0) diff --git a/test/test_escape_sequences.py b/test/test_escape_sequences.py index c164b60f..5c1f6f9d 100644 --- a/test/test_escape_sequences.py +++ b/test/test_escape_sequences.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import pytest -from mathics_scanner.errors import NamedCharacterSyntaxError, ScannerError +from mathics_scanner.errors import NamedCharacterSyntaxError, SyntaxError from mathics_scanner.escape_sequences import parse_escape_sequence @@ -66,5 +66,5 @@ def test_invalid_number_encoding(): ":A1g9", ":01-2", ): - with pytest.raises(ScannerError): + with pytest.raises(SyntaxError): parse_escape_sequence(text, 0) diff --git a/test/test_tokeniser.py b/test/test_tokeniser.py index f118d2bb..11a6de06 100644 --- a/test/test_tokeniser.py +++ b/test/test_tokeniser.py @@ -12,7 +12,7 @@ from mathics_scanner.errors import ( IncompleteSyntaxError, InvalidSyntaxError, - ScannerError, + SyntaxError, ) from mathics_scanner.feed import SingleLineFeeder from mathics_scanner.tokeniser import Token, Tokeniser, is_symbol_name @@ -39,7 +39,7 @@ def invalid_error(error_message: str): def scanner_error(error_message): - with pytest.raises(ScannerError): + with pytest.raises(SyntaxError): tokens(error_message) From 5fce8a0237b57868e6e88c20281c6ab07f45f125 Mon Sep 17 00:00:00 2001 From: rocky Date: Thu, 29 May 2025 18:32:07 -0400 Subject: [PATCH 26/34] More tests --- test/test_tokeniser.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/test/test_tokeniser.py b/test/test_tokeniser.py index 11a6de06..52537190 100644 --- a/test/test_tokeniser.py +++ b/test/test_tokeniser.py @@ -10,6 +10,7 @@ import pytest from mathics_scanner.errors import ( + EscapeSyntaxError, IncompleteSyntaxError, InvalidSyntaxError, SyntaxError, @@ -28,6 +29,11 @@ def check_symbol(source_code: str): assert token, Token("Symbol", source_code, 0) +def escape_syntax_error(error_message: str): + with pytest.raises(EscapeSyntaxError): + tokens(error_message) + + def incomplete_error(error_message: str): with pytest.raises(IncompleteSyntaxError): tokens(error_message) @@ -94,12 +100,8 @@ def test_association(): ] -@pytest.mark.skip("Backslash needs to be hanndled outside of prescanner") def test_backslash(): - assert tokens("\\[Backslash]") == [Token("Backslash", "\u2216", 0)] - - assert tokens("\\ a") == [Token("RawBackslash", "\\", 0), Token("Symbol", "a", 2)] - + assert tokens(r"\[Backslash]") == [Token("Backslash", "\u2216", 0)] incomplete_error("\\") From a568063244af5b088a5c8a008fb802a27d677aba Mon Sep 17 00:00:00 2001 From: rocky Date: Thu, 29 May 2025 21:57:00 -0400 Subject: [PATCH 27/34] One more escape test --- test/test_escape_sequences.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/test_escape_sequences.py b/test/test_escape_sequences.py index 5c1f6f9d..2c0726c8 100644 --- a/test/test_escape_sequences.py +++ b/test/test_escape_sequences.py @@ -11,6 +11,14 @@ def test_escape_sequences(): ("\\\\", 0, 1, "\\", "backslash"), ("abc \\\\", 5, 6, "\\", "backslash at end"), ("abc \\\\n", 5, 6, "\\", "backslash in middle"), + ( + r"\ +abc", + 1, + 2, + "\n", + "backslashed at end of line", + ), # Octal (r"050", 0, 3, chr(0o50), "character at 
beginning"), (r"a\051", 2, 5, chr(0o51), "Octal character in middle"), From 36d85a7a4b3ce7b5ecf102e62c62d60882d16cd8 Mon Sep 17 00:00:00 2001 From: rocky Date: Sat, 31 May 2025 07:57:49 -0400 Subject: [PATCH 28/34] Allow escape space "\ " + more string tests --- mathics_scanner/escape_sequences.py | 4 +- test/test_string_tokens.py | 68 ++++++++++++++++++----------- 2 files changed, 46 insertions(+), 26 deletions(-) diff --git a/mathics_scanner/escape_sequences.py b/mathics_scanner/escape_sequences.py index 44295027..00b2cb1d 100644 --- a/mathics_scanner/escape_sequences.py +++ b/mathics_scanner/escape_sequences.py @@ -122,9 +122,11 @@ def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]: # Note that these are a similer to Python, but are different. # In particular, Python defines "\a" to be ^G (control G), # but in WMA, this is invalid. - elif c in "ntbfr$\n": + elif c in "ntbfr $\n": if c in "n\n": result += "\n" + elif c == " ": + result += " " elif c == "t": result += "\t" elif c == "b": diff --git a/test/test_string_tokens.py b/test/test_string_tokens.py index e42a5bee..6686ec0c 100644 --- a/test/test_string_tokens.py +++ b/test/test_string_tokens.py @@ -12,10 +12,19 @@ from mathics_scanner.tokeniser import Token, Tokeniser -def check_string(source_text, expected_text: str, message: Optional[str] = ""): +def check_string( + source_text, + expected_text: str, + message: Optional[str] = "", + expected_tag: Optional[str] = None, +): token = single_token(source_text) assert token is not None - assert token.tag == "String" + + if expected_tag is None: + expected_tag = "String" + assert token.tag == expected_tag + if message: assert token.text == expected_text, message else: @@ -36,7 +45,7 @@ def escape_scan_error(s: str, failure_msg: str): assert excinfo, failure_msg -def single_token(source_text) -> Token: +def single_token(source_text: str) -> Token: tokens = get_tokens(source_text) assert len(tokens) == 1 token = tokens[0] @@ -56,23 +65,24 @@ def get_tokens(source_text: str): def test_string(): - # # Number conversions for binary, octal, hexadecimal - # check_string(r'"\\c"', '"\\c"', "escaped backslash at beginning of string") - # check_string(r'"a\\b"', r'"a\b"', "escaped backslash") - # check_string(r'"\102"', '"B"', "Octal number test") - # check_string(r'"q\.b4"', '"q´"') - - # # All valid ASCII-like control escape sequences - # for escape_string in ("\b", "\f", "\n", "\r", "\t"): - # check_string(f'"a{escape_string}"', f'"a{escape_string}"') - - # check_string(r'"abc"', r'"abc"') - # check_string(r'"abc(*def*)"', r'"abc(*def*)"') - # # check_string(r'"a\"b\\c"', r'"a\\"b\c"') - # incomplete_error(r'"abc', "String does not have terminating quote") - # incomplete_error(r'"\"', "Unterminated escape sequence") - escape_scan_error(r'"a\g"', "Unknown string escape \\g") + # Number conversions for binary, octal, hexadecimal + check_string(r'"a\\b"', r'"a\b"', "escaped backslash in a string") + check_string(r'"\102"', '"B"', "Octal number test in a string") + check_string(r'"q\.b4"', '"q´"', "2-digit hexadecimal number in a string") + + check_string(r'"\\c"', '"\\c"', "escaped backslash at beginning of string") + + # All valid ASCII-like control escape sequences + for escape_string in ("\b", "\f", "\n", "\r", "\t"): + check_string(f'"a{escape_string}"', f'"a{escape_string}"') + + check_string(r'"\ abc"', '" abc"', "Escaped space in a string is valid") + check_string(r'"abc(*def*)"', r'"abc(*def*)"') + # check_string(r'"a\"b\\c"', r'"a\\"b\c"') + 
incomplete_error(r'"abc', "String does not have terminating quote") + incomplete_error(r'"\"', "Unterminated escape sequence") + escape_scan_error(r'"a\g"', "Unknown string escape \\g") escape_scan_error(r'"a\X"', '"X" is not a valid escape character') @@ -84,13 +94,17 @@ def test_octal(): check_string(r'"a\050"', r'"a("', "Octal '(' in string") check_string(r'"a\051"', r'"a)"', "Octal ')' in string") check_string(r'"a\052"', r'"a*"', "Octal '*' in string") - # FIXME: add tests ouside of string def test_hexadecimal_dot(): check_string(r'"\.30"', '"0"', "2-digit hexadecimal ASCII number 0") check_string(r'"\.42"', '"B"', "2-digit hexadecimal ASCII capital B") - # FIXME: add tests ouside of string + check_string( + r"\.42\.30", + "B0", + "hexademimal encoding of identifier in expression context", + "Symbol", + ) def test_hexadecimal_colon(): @@ -101,13 +115,17 @@ def test_hexadecimal_colon(): ) check_string( r'"\:03b8"', - '"\u03B8"', + '"\u03b8"', "4-digit hexadecimal number test with lowercase alpha lettter", ) check_string(r'"\:0030"', '"0"') - # FIXME: - # check_string(r"\:03b8", "\u03B8", "4-digit hexadecimal number test with lowercase alpha lettter") + check_string( + r"\:03b8", + "\u03b8", + "4-digit hexadecimal number test with lowercase alpha letter", + "Symbol", + ) def test_hexadecimal_vbar(): - check_string(r'"\|01D451"', '"\U0001D451"') + check_string(r'"\|01D451"', '"\U0001d451"') From 00cbb48930dc0bc338cfe16f2c18a8c19716bb26 Mon Sep 17 00:00:00 2001 From: rocky Date: Sat, 31 May 2025 09:31:06 -0400 Subject: [PATCH 29/34] Start unit test for comments --- test/test_string_tokens.py | 4 +++ test/test_tokeniser.py | 50 +++++++++++++++++++++++++------------- 2 files changed, 37 insertions(+), 17 deletions(-) diff --git a/test/test_string_tokens.py b/test/test_string_tokens.py index 6686ec0c..3e0a6de9 100644 --- a/test/test_string_tokens.py +++ b/test/test_string_tokens.py @@ -65,6 +65,10 @@ def get_tokens(source_text: str): def test_string(): + # Plain strings + check_string('""', '""', "Null string") + check_string('"abc"', '"abc"', "Simple sequence") + # Number conversions for binary, octal, hexadecimal check_string(r'"a\\b"', r'"a\b"', "escaped backslash in a string") check_string(r'"\102"', '"B"', "Octal number test in a string") diff --git a/test/test_tokeniser.py b/test/test_tokeniser.py index 52537190..46a96cb6 100644 --- a/test/test_tokeniser.py +++ b/test/test_tokeniser.py @@ -72,6 +72,13 @@ def tokens(source_code) -> List[Token]: return tokens +def test_accuracy(): + scanner_error("1.5``") + check_number("1.0``20") + check_number("1.0``0") + check_number("1.4``-20") + + def test_apply(): assert tokens("f // x") == [ Token("Symbol", "f", 0), @@ -113,6 +120,30 @@ def test_boxes(): ] +def test_comments(): + assert tokens("(**)") == [], "empty comment" + assert tokens("(**)1") == [ + Token("Number", "1", 4) + ], "empty comment with trailing text" + assert tokens("1(*2*)") == [ + Token("Number", "1", 0) + ], "empty comment with leading text" + assert tokens("1 (*2*)") == [ + Token("Number", "1", 0) + ], "empty comment with leading text and space" + assert tokens("(* A (* nested comment *) *)") == [], "A nested comment" + assert tokens(r"(* A \[theta] *)") == [], "Comment with valid escape sequence" + assert tokens(r"(* A \[unknown] *)") == [], "Comment with invalid escape sequence" + + +def test_function(): + assert tokens("x&") == [Token("Symbol", "x", 0), Token("Function", "&", 1)] + assert tokens("x\uf4a1") == [ + Token("Symbol", "x", 0), + Token("Function", "\uf4a1", 1), + 
+    ]
+
+
 def test_information():
     assert tokens("??Sin") == [Token("Information", "??", 0), Token("Symbol", "Sin", 2)]
@@ -129,8 +160,8 @@ def test_int_repeated():


 def test_integeral():
-    assert tokens("\u222B x \uf74c y") == [
-        Token("Integral", "\u222B", 0),
+    assert tokens("\u222b x \uf74c y") == [
+        Token("Integral", "\u222b", 0),
         Token("Symbol", "x", 2),
         Token("DifferentialD", "\uf74c", 4),
         Token("Symbol", "y", 6),
@@ -142,13 +173,6 @@ def test_is_symbol():
     assert not is_symbol_name("98")  # symbols can't start with numbers


-def test_accuracy():
-    scanner_error("1.5``")
-    check_number("1.0``20")
-    check_number("1.0``0")
-    check_number("1.4``-20")
-
-
 def test_number():
     assert tags("1.5") == ["Number"]
     assert tags("1.5*^10") == ["Number"]
@@ -227,11 +251,3 @@ def test_unset():
     assert tokens("= .") == [Token("Unset", "= .", 0)]
     assert tokens("=.5") == [Token("Set", "=", 0), Token("Number", ".5", 1)]
     assert tokens("= ..") == [Token("Set", "=", 0), Token("Repeated", "..", 2)]
-
-
-def test_function():
-    assert tokens("x&") == [Token("Symbol", "x", 0), Token("Function", "&", 1)]
-    assert tokens("x\uf4a1") == [
-        Token("Symbol", "x", 0),
-        Token("Function", "\uf4a1", 1),
-    ]

From 2422c6076e75f6d2c405d409277a34d0f1dbf8e4 Mon Sep 17 00:00:00 2001
From: rocky
Date: Sat, 31 May 2025 11:31:27 -0400
Subject: [PATCH 30/34] Fix a doc spelling typo + minor doc tweak

---
 mathics_scanner/escape_sequences.py | 2 +-
 mathics_scanner/tokeniser.py        | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/mathics_scanner/escape_sequences.py b/mathics_scanner/escape_sequences.py
index 00b2cb1d..633b9ec9 100644
--- a/mathics_scanner/escape_sequences.py
+++ b/mathics_scanner/escape_sequences.py
@@ -57,7 +57,7 @@ def parse_named_character(source_text: str, start: int, finish: int) -> Optional
     e.g. "Theta". If we can match this, then we return the unicode
     equivalent from the `named_characters` map (which is read in from
     JSON but stored in a YAML file).
-    If we can't find the named character, rasie NamedCharacterSyntaxError.
+    If we can't find the named character, raise NamedCharacterSyntaxError.
     """
     named_character = source_text[start:finish]
     if named_character.isalpha():
diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py
index 9d508aef..54f58cbe 100644
--- a/mathics_scanner/tokeniser.py
+++ b/mathics_scanner/tokeniser.py
@@ -459,8 +459,8 @@ def is_symbol_name(text: str) -> bool:
 class Token:
     """A representation of a Wolfram-Language token.

-    A Token is the next level of parsing abstraction above a raw input
-    Mathics input string. A sequence of tokens is the input for the
+    A Token is the next level of parsing abstraction above a raw
+    Mathics3 input string. A sequence of tokens is the input for the
     Mathics3 parser.

     A token has a `tag`, the class or type of the token. For example:
@@ -469,8 +469,7 @@ class Token:

     The token's `text` is the string contents of the token.

     The token's `pos` is the integer starting offset where
-    `text` can be found inside the input string. The input string
-    is not part of the token though.
+    `text` can be found inside the full input string.
     """

     def __init__(self, tag: str, text: str, pos: int):

From 7582e6b0685f41a0c2bed596130ccd2255cfe2b8 Mon Sep 17 00:00:00 2001
From: rocky
Date: Sun, 1 Jun 2025 07:08:52 -0400
Subject: [PATCH 31/34] invalid escape sequences inside strings...

An invalid escape sequence inside a string, like "\(a \+\)", is not an
error. Instead, the sequence is kept the same, e.g. "\(a \+\)".
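
As a sketch of what this means at the test level, extra cases like the
following could sit in test/test_string_tokens.py. This is illustrative
only: it reuses the check_string() helper with the (input, expected,
message) calling convention seen elsewhere in this series, and these
particular inputs are assumptions, not tests added by this patch:

    def test_invalid_escape_preserved_in_string():
        # Sketch only: boxing-style escapes inside a string are kept
        # verbatim instead of raising an escape-sequence error.
        check_string(r'"\(a \+\)"', r'"\(a \+\)"', "box escapes kept verbatim")
        # The token text round-trips unchanged, backslashes included.
        check_string(r'"\(x\)"', r'"\(x\)"', "box group kept verbatim")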
---
 mathics_scanner/tokeniser.py | 13 +++++++++++--
 test/test_string_tokens.py  | 12 +++++++++---
 2 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py
index 54f58cbe..6c94c140 100644
--- a/mathics_scanner/tokeniser.py
+++ b/mathics_scanner/tokeniser.py
@@ -891,13 +891,22 @@ def t_String(self, _: Optional[re.Match]) -> Token:
                 self.pos += 1
                 try:
                     escape_str, self.pos = parse_escape_sequence(source_text, self.pos)
-                except (EscapeSyntaxError, NamedCharacterSyntaxError) as escape_error:
+                except NamedCharacterSyntaxError as escape_error:
                     self.feeder.message(
                         escape_error.name, escape_error.tag, *escape_error.args
                     )
                     raise
-                result += escape_str
+                # This has to come after NamedCharacterSyntaxError since
+                # that is a subclass of this
+                except EscapeSyntaxError:
+                    # If there is an invalid escape character inside a string,
+                    # we preserve what was given.
+                    result += "\\" + self.source_text[self.pos]
+                    self.pos += 1
+
+                else:
+                    result += escape_str
             else:
                 result += self.source_text[self.pos]
                 self.pos += 1
diff --git a/test/test_string_tokens.py b/test/test_string_tokens.py
index 3e0a6de9..c9132a7e 100644
--- a/test/test_string_tokens.py
+++ b/test/test_string_tokens.py
@@ -82,12 +82,18 @@ def test_string():
     check_string(r'"\ abc"', '" abc"', "Escaped space in a string is valid")
     check_string(r'"abc(*def*)"', r'"abc(*def*)"')
-    # check_string(r'"a\"b\\c"', r'"a\\"b\c"')
+
+    check_string(
+        r'"\(a \+\)"',
+        r'"\(a \+\)"',
+        "Do not interpret, but preserve boxing inside a string",
+    )
+
     incomplete_error(r'"abc', "String does not have terminating quote")
     incomplete_error(r'"\"', "Unterminated escape sequence")
-    escape_scan_error(r'"a\g"', "Unknown string escape \\g")
-    escape_scan_error(r'"a\X"', '"X" is not a valid escape character')
+    # escape_scan_error(r'"a\g"', "Unknown string escape \\g")
+    # escape_scan_error(r'"a\X"', '"X" is not a valid escape character')

     # https://www.wolfram.com/language/12/networking-and-system-operations/use-the-full-range-of-unicode-characters.html

From a49e453398bd6b6985211e042afc94f065230df1 Mon Sep 17 00:00:00 2001
From: rocky
Date: Sun, 1 Jun 2025 09:15:13 -0400
Subject: [PATCH 32/34] Escape sequences in strings, yet again...

If the escape sequence in a string can be a boxing construct, then
this is not an error in the escape sequence. Otherwise, it is.

For example "\(" is not an error in a string while "\g" is.

Yes, this is a bit involved. But that's the way WA works.

---
 mathics_scanner/tokeniser.py | 65 ++++++++++++++++++++++++++++--------
 test/test_string_tokens.py  |  4 +--
 2 files changed, 54 insertions(+), 15 deletions(-)

diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py
index 6c94c140..a3198ad8 100644
--- a/mathics_scanner/tokeniser.py
+++ b/mathics_scanner/tokeniser.py
@@ -8,7 +8,7 @@
 import os.path as osp
 import re
 import string
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional, Set, Tuple

 from mathics_scanner.characters import _letterlikes, _letters
 from mathics_scanner.errors import (
@@ -25,17 +25,38 @@
 except ImportError:
     import json as ujson  # type: ignore[no-redef]

-OPERATOR_DATA = {}
+# Where we get operator data...
 ROOT_DIR = osp.dirname(__file__)
 OPERATORS_TABLE_PATH = osp.join(ROOT_DIR, "data", "operators.json")
+
+##############################################
+# The below are initialized by init_module()
+# from operator data
+##############################################
+OPERATOR_DATA = {}
 NO_MEANING_OPERATORS = {}
+# Set of the final characters of "box-operators" values.
+# This is used in t_String for escape-sequence handling.
+BOXING_CONSTRUCT_SUFFIXES: Set[str] = {
+    "%",
+    "/",
+    "@",
+    "+",
+    "_",
+    "&",
+    "!",
+    "^",
+    "`",
+    "(",
+    ")",
+}
+
 FILENAME_TOKENS: List = []
 TOKENS: List[Tuple] = []
 TOKEN_INDICES: Dict = {}
-
+##############################################
 # special patterns
 NUMBER_PATTERN = r"""
 ( (?# Two possible forms depending on whether base is specified)
@@ -62,7 +83,7 @@
 #
 # This could still be done, but it would need to be integrated more
 # properly into the tokenization phase which takes into account
-# differents states or "modes" indicating the interior of comments,
+# different states or "modes" indicating the interior of comments,
 # strings, files, and Box-like constructs.

 # The leading character of a Symbol:
@@ -172,6 +193,12 @@ def init_module():
     with open(osp.join(OPERATORS_TABLE_PATH), "r", encoding="utf8") as operator_f:
         OPERATOR_DATA.update(ujson.load(operator_f))

+    global BOXING_CONSTRUCT_SUFFIXES
+
+    BOXING_CONSTRUCT_SUFFIXES = set(
+        [op_str[-1] for op_str in OPERATOR_DATA["box-operators"].values()]
+    ) | set([")", "("])
+
     global NO_MEANING_OPERATORS
     NO_MEANING_OPERATORS = (
         set(OPERATOR_DATA["no-meaning-infix-operators"].keys())
@@ -853,7 +880,7 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token:
     def t_String(self, _: Optional[re.Match]) -> Token:
         """Break out from self.source_text the next token which is expected
         to be a String. The string value of the returned token will have
         double quote (") in the first and last
-        postions of the returned string.
+        positions of the returned string.
         """
         end = None
         self.pos += 1  # skip opening '"'
@@ -866,7 +893,7 @@ ...
         # ("). Also, after parsing escape sequences, we can
         # unconditionally add them on to the string. That is, we
         # don't have to check whether the returned string can be valid
-        # in a Symbol name.
+        # in a Symbol name or as a boxing construct

         while True:
             if self.pos >= len(source_text):
@@ -898,12 +925,24 @@ ...
                     raise

                 # This has to come after NamedCharacterSyntaxError since
-                # that is a subclass of this
-                except EscapeSyntaxError:
-                    # If there is an invalid escape character inside a string,
-                    # we preserve what was given.
-                    result += "\\" + self.source_text[self.pos]
-                    self.pos += 1
+                # that is a subclass of this.
+                except EscapeSyntaxError as escape_error:
+                    escaped_char = self.source_text[self.pos]
+                    if escaped_char in BOXING_CONSTRUCT_SUFFIXES:
+                        # If a boxing construct is matched, we
+                        # preserve what was given, but do not tokenize
+                        # the construct. "\(" remains "\(" and is not
+                        # turned into "InterpretBox".
+                        result += "\\" + escaped_char
+                        self.pos += 1
+                    else:
+                        # Not something that can be a boxing construct.
+                        # So here, we'll report an error as we do with
+                        # NamedCharacterSyntaxError.
+                        self.feeder.message(
+                            escape_error.name, escape_error.tag, *escape_error.args
+                        )
+                        raise

                 else:
                     result += escape_str
diff --git a/test/test_string_tokens.py b/test/test_string_tokens.py
index c9132a7e..193be610 100644
--- a/test/test_string_tokens.py
+++ b/test/test_string_tokens.py
@@ -92,8 +92,8 @@ def test_string():
     incomplete_error(r'"abc', "String does not have terminating quote")
     incomplete_error(r'"\"', "Unterminated escape sequence")
-    # escape_scan_error(r'"a\g"', "Unknown string escape \\g")
-    # escape_scan_error(r'"a\X"', '"X" is not a valid escape character')
+    escape_scan_error(r'"a\g"', "Unknown string escape \\g")
+    escape_scan_error(r'"a\X"', '"X" is not a valid escape character')

     # https://www.wolfram.com/language/12/networking-and-system-operations/use-the-full-range-of-unicode-characters.html

From 1d10b185edb6a59909f8af71ed884f212865f19d Mon Sep 17 00:00:00 2001
From: rocky
Date: Sun, 1 Jun 2025 10:24:43 -0400
Subject: [PATCH 33/34] Add LineSeparator, and \*

Also, flatten values in box operators for BOXING_CONSTRUCT_SUFFIXES

---
 mathics_scanner/data/named-characters.yml | 12 +++++++++++-
 mathics_scanner/tokeniser.py              | 13 +++++++++++--
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/mathics_scanner/data/named-characters.yml b/mathics_scanner/data/named-characters.yml
index bcca13c2..e94d6c02 100644
--- a/mathics_scanner/data/named-characters.yml
+++ b/mathics_scanner/data/named-characters.yml
@@ -68,7 +68,7 @@
 #   the named character. If it is the same as unicode-equivalent
 #   it should be omitted
 #
-# wl-unicode-name: The name of the character corresponding to `wl-unicode`, if it exists. If it is the same as unicode-equivalent-name it can be omitted.
+# wl-unicode-name: The name of the character corresponding to `wl-unicode`, if it exists.
 #   It will mentioned in Wolfram Language docs if it exists.
 #
 # Sources:
@@ -6628,6 +6628,16 @@ LightBulb:
   wl-reference: https://reference.wolfram.com/language/ref/character/LightBulb.html
   wl-unicode: "\uF723"

+LineSeparator:
+  has-unicode-inverse: false
+  is-letter-like: false
+  unicode-equivalent: "\u2028"
+  unicode-equivalent-name: LINE SEPARATOR
+  unicode-reference: https://www.compart.com/en/unicode/U+2028
+  wl-reference: https://reference.wolfram.com/language/ref/character/LineSeparator.html
+  wl-unicode: "\u2028"
+  wl-unicode-name: LINE SEPARATOR
+
 LongDash:
   esc-alias: --
   has-unicode-inverse: false
diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py
index a3198ad8..1698ad4c 100644
--- a/mathics_scanner/tokeniser.py
+++ b/mathics_scanner/tokeniser.py
@@ -5,6 +5,7 @@ See classes `Token` and `Tokeniser` .
 """
+import itertools
 import os.path as osp
 import re
 import string
@@ -38,6 +39,8 @@
 # Set of the final characters of "box-operators" values.
 # This is used in t_String for escape-sequence handling.
+# The below is roughly correct, but we overwrite this
+# from operators.json data in init_module()
 BOXING_CONSTRUCT_SUFFIXES: Set[str] = {
     "%",
     "/",
     "@",
     "+",
     "_",
     "&",
     "!",
     "^",
     "`",
+    "*",
     "(",
     ")",
 }
@@ -196,8 +200,13 @@ def init_module():
     with open(osp.join(OPERATORS_TABLE_PATH), "r", encoding="utf8") as operator_f:
         OPERATOR_DATA.update(ujson.load(operator_f))

     global BOXING_CONSTRUCT_SUFFIXES

     BOXING_CONSTRUCT_SUFFIXES = set(
-        [op_str[-1] for op_str in OPERATOR_DATA["box-operators"].values()]
-    ) | set([")", "("])
+        [
+            op_str[-1]
+            for op_str in itertools.chain.from_iterable(
+                OPERATOR_DATA["box-operators"].values()
+            )
+        ]
+    ) | set(["*", ")", "("])

     global NO_MEANING_OPERATORS
     NO_MEANING_OPERATORS = (

From 0f0418d32d7ba4966ec7025ebd87281a133e470d Mon Sep 17 00:00:00 2001
From: rocky
Date: Tue, 3 Jun 2025 08:51:00 -0400
Subject: [PATCH 34/34] Remove duplicate test

---
 test/test_string_tokens.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/test/test_string_tokens.py b/test/test_string_tokens.py
index 193be610..84660148 100644
--- a/test/test_string_tokens.py
+++ b/test/test_string_tokens.py
@@ -123,11 +123,6 @@ def test_hexadecimal_colon():
         '"θ"',
         "4-digit hexadecimal number test with uppercase alpha letter",
     )
-    check_string(
-        r'"\:03b8"',
-        '"\u03b8"',
-        "4-digit hexadecimal number test with lowercase alpha lettter",
-    )
     check_string(r'"\:0030"', '"0"')
     check_string(
         r"\:03b8",