From 44c6e058308c1e28eb0c7c88322004bc19a0cf74 Mon Sep 17 00:00:00 2001 From: rocky Date: Fri, 11 Apr 2025 15:36:18 -0400 Subject: [PATCH 01/34] Reduce prescanner use Escape sequences other than named characters have been removed from the prescanner and put in the scanner. --- mathics_scanner/prescanner.py | 206 ---------------------------------- mathics_scanner/tokeniser.py | 118 ++++++++++++++++++- test/test_string_tokens.py | 33 ++++-- 3 files changed, 140 insertions(+), 217 deletions(-) delete mode 100644 mathics_scanner/prescanner.py diff --git a/mathics_scanner/prescanner.py b/mathics_scanner/prescanner.py deleted file mode 100644 index 08f56346..00000000 --- a/mathics_scanner/prescanner.py +++ /dev/null @@ -1,206 +0,0 @@ -# -*- coding: utf-8 -*- - -# Note: this module will be remove or rewritten drastically in the near future. -""" -Module for "prescanning". Right now this just means replacing -character escape sequences. -""" - -from typing import List - -from mathics_scanner.characters import named_characters -from mathics_scanner.errors import IncompleteSyntaxError, ScanError -from mathics_scanner.feed import LineFeeder - - -class Prescanner(object): - r""" - A Class for converting escape sequences: - Character codes to characters: - \.7A -> z - \.004a -> J - \:004a -> J - \|01D451 -> \U0001D451 - \041 -> ! - Named Characters to Unicode: - \[Theta] -> \u03B8 - ASCII escape sequence: - \n -> literal \n - - Trailing backslash characters (\) are reported incomplete. - """ - - def __init__(self, feeder: LineFeeder): - # self.feeder is a function that returns the next line of the Mathics input - self.feeder = feeder - - # self.input_line is the result of reading the next Mathics input line - self.input_line: str = feeder.feed() - - # self.pos is current position within self.input_line. - self.pos = 0 - - def feed(self) -> str: - """ - Return the next line of Mathics input - """ - return self.feeder.feed() - - def get_more_input(self): - "Get another source-text line from input and continue." - - line: str = self.feed() - if not line: - text = self.input_line[self.pos :].rstrip() - self.feeder.message("Syntax", "sntxi", text) - raise IncompleteSyntaxError("Syntax", "sntxi", text) - self.input_line += line - - def replace_escape_sequences(self) -> str: - """ - Replace escape sequences in ``self.input_line``. The replacement string is returned. - Note: ``self.input_line`` is not modified. - """ - - # Line fragments to be joined before returning from this method. - line_fragments: List[str] = [] - - # Fragment start position of line fragment under consideration. - self.fragment_start = self.pos - - def start_new_fragment(pos: int) -> None: - """ - Update position markers to start a new line fragment at ``pos``. - """ - self.pos = pos - self.fragment_start = pos - - def try_parse_base(start_shift: int, end_shift: int, base: int) -> None: - r""" - See if characters self.pos+start_shift .. self.pos+end shift - can be converted to an integer in base ``base``. - - If so, we append the characters before the escape sequence without the - escaping characters like ``\.`` or ``\:``. - - We also append the converted integer to ``line_fragments``, and update - position cursors for a new line fragment. 
- - However, if the conversion fails, then error messages are - issued and nothing is updated - """ - start, end = self.pos + start_shift, self.pos + end_shift - result = None - if end <= len(self.input_line): - text = self.input_line[start:end] - try: - result = int(text, base) - except ValueError: - pass # result remains None - if result is None: - last = end - start - if last == 2: - self.feeder.message("Syntax", "sntoct2") - elif last == 3: - self.feeder.message("Syntax", "sntoct1") - elif last == 4: - self.feeder.message("Syntax", "snthex") - else: - raise ValueError() - self.feeder.message( - "Syntax", "sntxb", self.input_line[self.pos :].rstrip("\n") - ) - raise ScanError("Syntax", "sntxb") - - # Add text from prior line fragment as well - # as the escape sequence, a character, from the escape sequence - # that was just matched. - line_fragments.append(self.input_line[start : self.pos]) - line_fragments.append(chr(result)) - - # Set up a new line fragment for the next time we are called. - start_new_fragment(end) - - def try_parse_named_character(start_shift: int): - r"""Before calling we have matched "\[". Scan to the remaining "]" and - try to match what is found in-between with a known named - character, e.g. "Theta". If we can match this, we store - the unicode character equivalent in ``line_fragments``. - If we can't find a named character, error messages are - issued and we leave ``line_fragments`` untouched. - """ - i = self.pos + start_shift - while True: - if i == len(self.input_line): - self.get_more_input() - if self.input_line[i] == "]": - break - i += 1 - - named_character = self.input_line[self.pos + start_shift : i] - if named_character.isalpha(): - char = named_characters.get(named_character) - if char is None: - self.feeder.message("Syntax", "sntufn", named_character) - # stay in same line fragment - else: - # Add text from prior line fragment as well - # as the escape sequence, a character, from the escape sequence - # just matched. - line_fragments.append( - self.input_line[self.fragment_start : self.pos] - ) - line_fragments.append(char) - start_new_fragment(i + 1) - - # Stay in same line fragment, but advance the cursor position. - self.pos = i + 1 - - # In the following loop, we look for and replace escape - # sequences. The current character under consideration is at - # self.code[self.pos]. When an escape sequence is found at - # that position, the previous line_fragment is extracted and - # stored in ``line_fragments``. The start-position marker for the - # next line_fragment is started and self.pos is updated. - - while self.pos < len(self.input_line): - if self.input_line[self.pos] == "\\": - # Look for and handle an escape sequence. - if self.pos + 1 == len(self.input_line): - self.get_more_input() - c = self.input_line[self.pos + 1] - if c == "|": - try_parse_base(2, 8, 16) - if c == ".": - # See if we have a two-digit hexadecimal number. - try_parse_base(2, 4, 16) - elif c == ":": - # See if we have a four-digit hexadecimal number. - try_parse_base(2, 6, 16) - elif c == "[": - try_parse_named_character(2) - elif c in "01234567": - # See if we have an octal number. - try_parse_base(1, 4, 8) - elif c == "\n": - if self.pos + 2 == len(self.input_line): - self.get_more_input() - line_fragments.append( - self.input_line[self.fragment_start : self.pos] - ) - start_new_fragment(self.pos + 2) - else: - # Two backslashes in succession indicates a single - # backslash character. Advance the scanning - # cursor (self.pos) over both backslashes. 
Also, - # Python's backslash escape mechanism turns the - # two backslashes into one in length calculations. - self.pos += 2 - else: - self.pos += 1 - - # Add the final line fragment. - line_fragments.append(self.input_line[self.fragment_start :]) - - # Produce and return the input line with escape-sequences replaced - return "".join(line_fragments) diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 06ade1fc..d1bf8f72 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -517,6 +517,8 @@ def __init__(self, feeder): ) self.pos: int = 0 self.feeder = feeder + + # FIXME: remove this self.prescanner = Prescanner(feeder) self.source_text = self.prescanner.replace_escape_sequences() self.mode: str = "invalid" @@ -613,6 +615,56 @@ def next(self) -> Token: self.pos = pattern_match.end(0) return Token(tag, text, pattern_match.start(0)) + def try_parse_base(self, start_shift: int, end_shift: int, base: int) -> str: + r""" + See if characters self.pos+start_shift .. self.pos+end shift + can be converted to an integer in base ``base``. + + If so, chr(integer value converted from base). + + However, if the conversion fails, then error messages are + issued and nothing is updated + """ + start, end = self.pos + start_shift, self.pos + end_shift + result = None + if end <= len(self.source_text): + text = self.source_text[start:end] + try: + result = int(text, base) + except ValueError: + pass # result remains None + if result is None: + last = end - start + if last == 2: + self.feeder.message("Syntax", "sntoct2") + elif last == 3: + self.feeder.message("Syntax", "sntoct1") + elif last == 4: + self.feeder.message("Syntax", "snthex") + else: + raise ValueError() + error_text = self.source_text[self.pos :].rstrip("\n") + self.feeder.message("Syntax", "sntxb", error_text) + raise ScanError("syntx", error_text) + + return chr(result) + + def try_parse_named_character(self, start_shift: int) -> Optional[str]: + r"""Before calling we have matched "\[". Scan to the remaining "]" and + try to match what is found in-between with a known named + character, e.g. "Theta". If we can match this, we store + the unicode character equivalent in ``line_fragments``. + If we can't find a named character, error messages are + issued and we leave ``line_fragments`` untouched. + """ + named_character = self.source_text[self.pos + start_shift : self.pos + start_shift] + if named_character.isalpha(): + char = named_characters.get(named_character) + if char is None: + self.feeder.message("Syntax", "sntufn", named_character) + else: + return named_character + def _skip_blank(self): "Skip whitespace and comments" comment = [] # start positions of comments @@ -703,23 +755,85 @@ def t_String(self, _: re.Match) -> Token: start, end = self.pos, None self.pos += 1 # skip opening '"' newlines = [] + source_text = self.source_text + result = "" while True: if self.pos >= len(self.source_text): if end is None: # reached end while still inside string self.get_more_input() newlines.append(self.pos) + source_text = self.source_text else: break - char = self.source_text[self.pos] + char = source_text[self.pos] if char == '"': self.pos += 1 end = self.pos break if char == "\\": - self.pos += 2 + if self.pos + 1 == len(source_text): + # We have reached end of the input line before seeing a terminating + # quote ("). Fetch aanother line. 
+ self.get_more_input() + self.pos += 1 + c = source_text[self.pos] + if c == "\\": + result += "\\" + self.pos += 1 + continue + # https://www.wolfram.com/language/12/networking-and-system-operations/use-the-full-range-of-unicode-characters.html + # describes hex encoding. + elif c == ".": + # See if we have a 2-digit hexadecimal number. + # For example, \.42 is "B" + result += self.try_parse_base(1, 3, 16) + self.pos += 3 + elif c == ":": + # See if we have a 4-digit hexadecimal number. + # For example, \:03B8" is Unicode small leter theta: θ. + result += self.try_parse_base(1, 5, 16) + self.pos += 5 + elif c == "|": + # See if we have a 6-digit hexadecimal number. + result += self.try_parse_base(1, 7, 16) + elif c == "[": + named_character = self.try_parse_named_character(2) + if named_character is not None: + result += named_character + self.pos += 4 # ??? + elif c in "01234567": + # See if we have a 3-digit octal number. + # For example \065 = "5" + result += self.try_parse_base(0, 3, 8) + self.pos += 3 + + # WMA escape characters \n, \t, \b, \r. + # Note that these are a similer to Python, but are different. + # In particular, Python defines "\a" to be ^G (control G), + # but in WMA, this is invalid. + elif c in "ntbfr": + if c == "n": + result += "\n" + elif c == "t": + result += "\t" + elif c == "b": + result += "\b" + elif c == "f": + result += "\f" + else: + assert c == "r" + result += "\r" + self.pos += 1 + elif c in '!"': + result += c + self.pos += 1 + else: + self.sntx_invalid_esc_message(c) + raise ScanError() else: + result += self.source_text[self.pos] self.pos += 1 indices = [start] + newlines + [end] diff --git a/test/test_string_tokens.py b/test/test_string_tokens.py index 0cbd5b87..3715f41f 100644 --- a/test/test_string_tokens.py +++ b/test/test_string_tokens.py @@ -3,6 +3,8 @@ Tests translation from text characters to the token: String """ +from typing import Optional + import pytest from mathics_scanner.errors import IncompleteSyntaxError, ScanError @@ -10,11 +12,14 @@ from mathics_scanner.tokeniser import Token, Tokeniser -def check_string(source_text, expected_text: str): +def check_string(source_text, expected_text: str, message: Optional[str] = ""): token = single_token(source_text) assert token is not None assert token.tag == "String" - assert token.text == expected_text + if message: + assert token.text == expected_text, message + else: + assert token.text == expected_text def incomplete_error(s: str, failure_msg: str): @@ -51,17 +56,27 @@ def get_tokens(source_text: str): def test_string(): + # Number conversions for binary, octal, hexadecimal + check_string(r'"\\c"', '"\\c"', "escaped backslash at beginning of string") + check_string(r'"a\\b"', r'"a\b"', "escaped backslash") + check_string(r'"\:03B8"', '"θ"', "4-digit hexadecimal number test") + check_string(r'"\102"', '"B"', "Octal number test") + check_string(r'"\.42"', '"B"', "2-digit hexadecimal number test") + check_string(r'"q\.b4"', '"q´"') + + # All valid ASCII-like control escape sequences for escape_string in ("\b", "\f", "\n", "\r", "\t"): check_string(f'"a{escape_string}"', f'"a{escape_string}"') - # Broken: - # "a\050", "a\051" "a\052" - # Prescanning eagerly replaces the escape sequences with - # symbols "(", ")", or "*" respectively and this messes up parsing - # somehow. 
+ check_string(r'"a\050"', r'"a("', "Octal '(' in string") + check_string(r'"a\051"', r'"a)"', "Octal ')' in string") + check_string(r'"a\052"', r'"a*"', "Octal '*' in string") + check_string(r'"abc"', r'"abc"') check_string(r'"abc(*def*)"', r'"abc(*def*)"') - check_string(r'"a\"b\\c"', r'"a\"b\\c"') + # check_string(r'"a\"b\\c"', r'"a\\"b\c"') incomplete_error(r'"abc', "String does not have terminating quote") incomplete_error(r'"\"', "Unterminated escape sequence") - # scan_error(r'"a\X"', '"X" is not a valid escape character') + scan_error(r'"a\g"', "Unknown string escape \\g") + + scan_error(r'"a\X"', '"X" is not a valid escape character') From ae4aa63b32e016d5f6c694ea0ef05b98e76f4a5e Mon Sep 17 00:00:00 2001 From: rocky Date: Sat, 12 Apr 2025 13:51:51 -0400 Subject: [PATCH 02/34] Test workarounds.. for now. --- mathics_scanner/tokeniser.py | 1 + test/test_prescanner.py | 29 +++++++++--------------- test/test_string_tokens.py | 43 +++++++++++++++++++++++++++++++----- test/test_tokeniser.py | 1 + 4 files changed, 49 insertions(+), 25 deletions(-) diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index d1bf8f72..24406d35 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -798,6 +798,7 @@ def t_String(self, _: re.Match) -> Token: elif c == "|": # See if we have a 6-digit hexadecimal number. result += self.try_parse_base(1, 7, 16) + self.pos += 7 elif c == "[": named_character = self.try_parse_named_character(2) if named_character is not None: diff --git a/test/test_prescanner.py b/test/test_prescanner.py index 2f153d09..25163d26 100644 --- a/test/test_prescanner.py +++ b/test/test_prescanner.py @@ -2,8 +2,8 @@ import pytest from mathics_scanner import IncompleteSyntaxError, ScanError -from mathics_scanner.prescanner import Prescanner from mathics_scanner.feed import SingleLineFeeder +from mathics_scanner.prescanner import Prescanner def replace_escape_sequences(mathics_text: str): @@ -29,6 +29,7 @@ def assert_equal_length(mathics_text: str, length): assert len(replace_escape_sequences(mathics_text)) == length +@pytest.mark.skip("Prescanner tests need to be integrated outside of prescanner") def test_named_characters(): assert_equal(r"\[Theta]", "\u03B8") assert_equal(r"\[CapitalPi]", "\u03A0") @@ -40,6 +41,7 @@ def test_named_characters(): assert_equal("abc\\\\", "abc\\\\") +@pytest.mark.skip("Prescanner tests need to be integrated outside of prescanner") def test_text_lengths(): assert_equal_length(r'"\[Integral]"', 3) # Prescanner keep both slashes and quotes. 
@@ -48,34 +50,19 @@ def test_text_lengths(): assert_equal_length(r'"\\[Integral]"', 14) -def test_oct(): - assert_equal(r"\051", ")") - - -def test_hex_dot(): - assert_equal(r"\.30", "0") - - -def test_hex_colon(): - assert_equal(r"\:0030", "0") - assert_equal(r"\:03B8", "\u03B8") - assert_equal(r"\:03b8", "\u03B8") - - -def test_hex_vbar(): - assert_equal(r"\|01D451", "\U0001D451") - - +@pytest.mark.skip("Prescanner tests need to be integrated outside of prescanner") def test_incomplete(): assert_incomplete(r"\[") assert_incomplete(r"\[Theta") +@pytest.mark.skip("Prescanner tests need to be integrated outside of prescanner") def test_invalid_octal(): assert_invalid(r"\093") assert_invalid(r"\01") +@pytest.mark.skip("Prescanner tests need to be integrated outside of prescanner") def test_invalid_colon(): assert_invalid(r"\:") assert_invalid(r"\:A") @@ -88,18 +75,22 @@ def test_invalid_colon(): assert_invalid(r"\:01-2") +@pytest.mark.skip("Prescanner tests need to be integrated outside of prescanner") def test_invalid_dot(): assert_invalid(r"\.") assert_invalid(r"\.0") +@pytest.mark.skip("Prescanner tests need to be integrated outside of prescanner") def test_combined(): assert_equal(r"\:03B8\[Theta]\.30\052", "\u03B8\u03B80*") +@pytest.mark.skip("Prescanner tests need to be integrated outside of prescanner") def test_nested(): assert_equal(r"\[Thet\141]", r"\[Thet\141]") +@pytest.mark.skip("Prescanner tests need to be integrated outside of prescanner") def test_trailing_backslash(): assert_incomplete("x \\") diff --git a/test/test_string_tokens.py b/test/test_string_tokens.py index 3715f41f..a6679823 100644 --- a/test/test_string_tokens.py +++ b/test/test_string_tokens.py @@ -59,19 +59,13 @@ def test_string(): # Number conversions for binary, octal, hexadecimal check_string(r'"\\c"', '"\\c"', "escaped backslash at beginning of string") check_string(r'"a\\b"', r'"a\b"', "escaped backslash") - check_string(r'"\:03B8"', '"θ"', "4-digit hexadecimal number test") check_string(r'"\102"', '"B"', "Octal number test") - check_string(r'"\.42"', '"B"', "2-digit hexadecimal number test") check_string(r'"q\.b4"', '"q´"') # All valid ASCII-like control escape sequences for escape_string in ("\b", "\f", "\n", "\r", "\t"): check_string(f'"a{escape_string}"', f'"a{escape_string}"') - check_string(r'"a\050"', r'"a("', "Octal '(' in string") - check_string(r'"a\051"', r'"a)"', "Octal ')' in string") - check_string(r'"a\052"', r'"a*"', "Octal '*' in string") - check_string(r'"abc"', r'"abc"') check_string(r'"abc(*def*)"', r'"abc(*def*)"') # check_string(r'"a\"b\\c"', r'"a\\"b\c"') @@ -80,3 +74,40 @@ def test_string(): scan_error(r'"a\g"', "Unknown string escape \\g") scan_error(r'"a\X"', '"X" is not a valid escape character') + + +# https://www.wolfram.com/language/12/networking-and-system-operations/use-the-full-range-of-unicode-characters.html +# describes hex encoding. 
+
+
+def test_octal():
+    check_string(r'"a\050"', r'"a("', "Octal '(' in string")
+    check_string(r'"a\051"', r'"a)"', "Octal ')' in string")
+    check_string(r'"a\052"', r'"a*"', "Octal '*' in string")
+    # FIXME: add tests outside of string
+
+
+def test_hexadecimal_dot():
+    check_string(r'"\.30"', '"0"', "2-digit hexadecimal ASCII number 0")
+    check_string(r'"\.42"', '"B"', "2-digit hexadecimal ASCII capital B")
+    # FIXME: add tests outside of string
+
+
+def test_hexadecimal_colon():
+    check_string(
+        r'"\:03B8"',
+        '"θ"',
+        "4-digit hexadecimal number test with uppercase alpha letter",
+    )
+    check_string(
+        r'"\:03b8"',
+        '"\u03B8"',
+        "4-digit hexadecimal number test with lowercase alpha letter",
+    )
+    check_string(r'"\:0030"', '"0"')
+    # FIXME:
+    # check_string(r"\:03b8", "\u03B8", "4-digit hexadecimal number test with lowercase alpha letter")
+
+
+def test_hexadecimal_vbar():
+    check_string(r'"\|01D451"', '"\U0001D451"')
diff --git a/test/test_tokeniser.py b/test/test_tokeniser.py
index 6e336c00..96ceee8d 100644
--- a/test/test_tokeniser.py
+++ b/test/test_tokeniser.py
@@ -90,6 +90,7 @@ def test_association():
     ]
 
 
+@pytest.mark.skip("Backslash needs to be handled outside of prescanner")
 def test_backslash():
     assert tokens("\\[Backslash]") == [Token("Backslash", "\u2216", 0)]
 
From 12482554d810ebbaaaedda2720fb99cb59b0d0f2 Mon Sep 17 00:00:00 2001
From: rocky
Date: Sat, 12 Apr 2025 14:54:40 -0400
Subject: [PATCH 03/34] Isolate tokenizing escape sequences

---
 mathics_scanner/tokeniser.py | 61 +++---------------------------------
 1 file changed, 5 insertions(+), 56 deletions(-)

diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py
index 24406d35..3a597b93 100644
--- a/mathics_scanner/tokeniser.py
+++ b/mathics_scanner/tokeniser.py
@@ -10,7 +10,7 @@
 import string
 from typing import Dict, List, Optional, Tuple
 
-from mathics_scanner.characters import _letterlikes, _letters
+from mathics_scanner.characters import _letterlikes, _letters, named_characters
 from mathics_scanner.errors import IncompleteSyntaxError, ScanError
 from mathics_scanner.prescanner import Prescanner
 
@@ -778,61 +778,10 @@ def t_String(self, _: re.Match) -> Token:
                     # quote ("). Fetch aanother line.
                     self.get_more_input()
                 self.pos += 1
-                c = source_text[self.pos]
-                if c == "\\":
-                    result += "\\"
-                    self.pos += 1
-                    continue
-                # https://www.wolfram.com/language/12/networking-and-system-operations/use-the-full-range-of-unicode-characters.html
-                # describes hex encoding.
-                elif c == ".":
-                    # See if we have a 2-digit hexadecimal number.
-                    # For example, \.42 is "B"
-                    result += self.try_parse_base(1, 3, 16)
-                    self.pos += 3
-                elif c == ":":
-                    # See if we have a 4-digit hexadecimal number.
-                    # For example, \:03B8" is Unicode small leter theta: θ.
-                    result += self.try_parse_base(1, 5, 16)
-                    self.pos += 5
-                elif c == "|":
-                    # See if we have a 6-digit hexadecimal number.
-                    result += self.try_parse_base(1, 7, 16)
-                    self.pos += 7
-                elif c == "[":
-                    named_character = self.try_parse_named_character(2)
-                    if named_character is not None:
-                        result += named_character
-                    self.pos += 4  # ???
-                elif c in "01234567":
-                    # See if we have a 3-digit octal number.
-                    # For example \065 = "5"
-                    result += self.try_parse_base(0, 3, 8)
-                    self.pos += 3
-
-                # WMA escape characters \n, \t, \b, \r.
-                # Note that these are a similer to Python, but are different.
-                # In particular, Python defines "\a" to be ^G (control G),
-                # but in WMA, this is invalid.
- elif c in "ntbfr": - if c == "n": - result += "\n" - elif c == "t": - result += "\t" - elif c == "b": - result += "\b" - elif c == "f": - result += "\f" - else: - assert c == "r" - result += "\r" - self.pos += 1 - elif c in '!"': - result += c - self.pos += 1 - else: - self.sntx_invalid_esc_message(c) - raise ScanError() + escape_str, self.pos = self.prescanner.tokenize_escape_sequence( + source_text, self.pos + ) + result += escape_str else: result += self.source_text[self.pos] self.pos += 1 From 95bd1052792685e2d3ed4c5c743752e5a3cbf817 Mon Sep 17 00:00:00 2001 From: rocky Date: Sun, 13 Apr 2025 13:06:19 -0400 Subject: [PATCH 04/34] Split out escape_sequence parsing. --- mathics_scanner/escape_sequences.py | 126 ++++++++++++++++++++++++++++ mathics_scanner/tokeniser.py | 9 +- test/test_escape_sequences.py | 32 +++++++ test/test_string_tokens.py | 42 +++++----- 4 files changed, 182 insertions(+), 27 deletions(-) create mode 100644 mathics_scanner/escape_sequences.py create mode 100644 test/test_escape_sequences.py diff --git a/mathics_scanner/escape_sequences.py b/mathics_scanner/escape_sequences.py new file mode 100644 index 00000000..f30deae8 --- /dev/null +++ b/mathics_scanner/escape_sequences.py @@ -0,0 +1,126 @@ +""" +Helper Module for tokenizing character escape sequences. +""" + +from typing import Optional, Tuple + +from mathics_scanner.characters import named_characters +from mathics_scanner.errors import ( + EscapeSyntaxError, + NamedCharacterSyntaxError, + ScanError, +) + + +def parse_base(source_text: str, start_shift: int, end_shift: int, base: int) -> str: + r""" + See if characters start_shift .. end shift + can be converted to an integer in base ``base``. + + If so, chr(integer value converted from base). + + However, if the conversion fails, then error messages are + issued and nothing is updated + """ + start, end = start_shift, end_shift + result = None + if end <= len(source_text): + text = source_text[start:end] + try: + result = int(text, base) + except ValueError: + pass # result remains None + if result is None: + last = end - start + if last == 2: + tag = "sntoct2" + elif last == 3: + tag = "sntoct1" + elif last == 4: + tag = "snthex" + else: + raise ValueError() + raise ScanError(tag, source_text[start_shift:].rstrip("\n")) + + return chr(result) + + +def parse_named_character( + source_text: str, pos: int, start_shift: int +) -> Optional[str]: + r"""Before calling we have matched "\[". Scan to the remaining "]" and + try to match what is found in-between with a known named + character, e.g. "Theta". If we can match this, we store + the unicode character equivalent in ``line_fragments``. + If we can't find a named character, error messages are + issued and we leave ``line_fragments`` untouched. + """ + named_character = source_text[pos + start_shift : pos + start_shift] + if named_character.isalpha(): + char = named_characters.get(named_character) + if char is None: + raise NamedCharacterSyntaxError("sntufn", named_character) + else: + return named_character + + +def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]: + """ + Given some source text `source_text` at position `pos`, return the escape sequence and the + follow-on position. + """ + result = "" + c = source_text[pos] + if c == "\\": + return "\\", pos + 1 + + # https://www.wolfram.com/language/12/networking-and-system-operations/use-the-full-range-of-unicode-characters.html + # describes hex encoding. + if c == ".": + # See if we have a 2-digit hexadecimal number. 
+ # For example, \.42 is "B" + result += parse_base(source_text, pos + 1, pos + 3, 16) + pos += 3 + elif c == ":": + # See if we have a 4-digit hexadecimal number. + # For example, \:03B8" is Unicode small leter theta: θ. + result += parse_base(source_text, pos + 1, pos + 5, 16) + pos += 5 + elif c == "|": + # See if we have a 6-digit hexadecimal number. + result += parse_base(source_text, pos + 1, pos + 7, 16) + pos += 7 + elif c == "[": + named_character = parse_named_character(source_text, pos, 2) + if named_character is not None: + result += named_character + pos += 4 # ??? + elif c in "01234567": + # See if we have a 3-digit octal number. + # For example \065 = "5" + result += parse_base(source_text, pos, pos + 3, 8) + pos += 3 + + # WMA escape characters \n, \t, \b, \r. + # Note that these are a similer to Python, but are different. + # In particular, Python defines "\a" to be ^G (control G), + # but in WMA, this is invalid. + elif c in "ntbfr": + if c == "n": + result += "\n" + elif c == "t": + result += "\t" + elif c == "b": + result += "\b" + elif c == "f": + result += "\f" + else: + assert c == "r" + result += "\r" + pos += 1 + elif c in '!"': + result += c + pos += 1 + else: + raise EscapeSyntaxError("Syntax", "stresc" rf"\{c}.") + return result, pos diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 3a597b93..62ea3dec 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -12,7 +12,6 @@ from mathics_scanner.characters import _letterlikes, _letters, named_characters from mathics_scanner.errors import IncompleteSyntaxError, ScanError -from mathics_scanner.prescanner import Prescanner try: import ujson @@ -568,8 +567,8 @@ def sntx_message(self, pos: Optional[int] = None) -> Tuple[str, str, str]: pos = self.pos pre, post = self.source_text[:pos], self.source_text[pos:].rstrip("\n") if pos == 0: - self.feeder.message("Syntax", "sntxb", post) - return "sntxb", "", post + self.feeder.message("Syntax", "sntxb", pre, post) + return "sntxb", pre, post else: self.feeder.message("Syntax", "sntxf", pre, post) return "sntxf", pre, post @@ -778,9 +777,7 @@ def t_String(self, _: re.Match) -> Token: # quote ("). Fetch aanother line. 
self.get_more_input() self.pos += 1 - escape_str, self.pos = self.prescanner.tokenize_escape_sequence( - source_text, self.pos - ) + escape_str, self.pos = parse_escape_sequence(source_text, self.pos) result += escape_str else: result += self.source_text[self.pos] diff --git a/test/test_escape_sequences.py b/test/test_escape_sequences.py new file mode 100644 index 00000000..889e46c2 --- /dev/null +++ b/test/test_escape_sequences.py @@ -0,0 +1,32 @@ +# -*- coding: utf-8 -*- +from mathics_scanner.escape_sequences import parse_escape_sequence + + +def test_escape_sequences(): + for text, pos, expect_pos, expect_str, fail_msg in ( + # Backslash + ("\\\\", 0, 1, "\\", "backslash"), + ("abc \\\\", 5, 6, "\\", "backslash at end"), + ("abc \\\\n", 5, 6, "\\", "backslash in middle"), + # Octal + (r"051", 0, 3, chr(0o51), "character at beginning"), + (r"a\051", 2, 5, chr(0o51), "Octal character in middle"), + # With dot + (r".30", 0, 3, chr(0x30), "two-character hex"), + ( + r"a\.3015", + 2, + 5, + chr(0x30), + "two-character hex in middle with trailing digits", + ), + (r"b\.4dXYZ", 2, 5, chr(0x4D), "two-character hex in middle"), + # With colon + (r":0030", 0, 5, "0", "four-character hex"), + (r":03B8", 0, 5, "\u03B8", "four-character hex unicode uppercase"), + (r":03B8", 0, 5, "\u03b8", "four-character hex unicode lowercase"), + # With Vertical bar + (r"|01d451", 0, 7, "\U0001D451", "six-character hex unicode lowercase"), + (r"|01D451", 0, 7, "\U0001D451", "six-character hex unicode uppercase"), + ): + assert parse_escape_sequence(text, pos) == (expect_str, expect_pos), fail_msg diff --git a/test/test_string_tokens.py b/test/test_string_tokens.py index a6679823..e42a5bee 100644 --- a/test/test_string_tokens.py +++ b/test/test_string_tokens.py @@ -7,7 +7,7 @@ import pytest -from mathics_scanner.errors import IncompleteSyntaxError, ScanError +from mathics_scanner.errors import EscapeSyntaxError, IncompleteSyntaxError from mathics_scanner.feed import SingleLineFeeder from mathics_scanner.tokeniser import Token, Tokeniser @@ -29,8 +29,8 @@ def incomplete_error(s: str, failure_msg: str): assert excinfo, failure_msg -def scan_error(s: str, failure_msg: str): - with pytest.raises(ScanError) as excinfo: +def escape_scan_error(s: str, failure_msg: str): + with pytest.raises(EscapeSyntaxError) as excinfo: get_tokens(s) assert excinfo, failure_msg @@ -56,24 +56,24 @@ def get_tokens(source_text: str): def test_string(): - # Number conversions for binary, octal, hexadecimal - check_string(r'"\\c"', '"\\c"', "escaped backslash at beginning of string") - check_string(r'"a\\b"', r'"a\b"', "escaped backslash") - check_string(r'"\102"', '"B"', "Octal number test") - check_string(r'"q\.b4"', '"q´"') - - # All valid ASCII-like control escape sequences - for escape_string in ("\b", "\f", "\n", "\r", "\t"): - check_string(f'"a{escape_string}"', f'"a{escape_string}"') - - check_string(r'"abc"', r'"abc"') - check_string(r'"abc(*def*)"', r'"abc(*def*)"') - # check_string(r'"a\"b\\c"', r'"a\\"b\c"') - incomplete_error(r'"abc', "String does not have terminating quote") - incomplete_error(r'"\"', "Unterminated escape sequence") - scan_error(r'"a\g"', "Unknown string escape \\g") - - scan_error(r'"a\X"', '"X" is not a valid escape character') + # # Number conversions for binary, octal, hexadecimal + # check_string(r'"\\c"', '"\\c"', "escaped backslash at beginning of string") + # check_string(r'"a\\b"', r'"a\b"', "escaped backslash") + # check_string(r'"\102"', '"B"', "Octal number test") + # check_string(r'"q\.b4"', '"q´"') 
+ + # # All valid ASCII-like control escape sequences + # for escape_string in ("\b", "\f", "\n", "\r", "\t"): + # check_string(f'"a{escape_string}"', f'"a{escape_string}"') + + # check_string(r'"abc"', r'"abc"') + # check_string(r'"abc(*def*)"', r'"abc(*def*)"') + # # check_string(r'"a\"b\\c"', r'"a\\"b\c"') + # incomplete_error(r'"abc', "String does not have terminating quote") + # incomplete_error(r'"\"', "Unterminated escape sequence") + escape_scan_error(r'"a\g"', "Unknown string escape \\g") + + escape_scan_error(r'"a\X"', '"X" is not a valid escape character') # https://www.wolfram.com/language/12/networking-and-system-operations/use-the-full-range-of-unicode-characters.html From f1a06e15bf4c10cf76ddbbf31f85f730ecbee87a Mon Sep 17 00:00:00 2001 From: rocky Date: Mon, 14 Apr 2025 10:57:09 -0400 Subject: [PATCH 05/34] Handle escape sequences outside of strings. --- mathics_scanner/escape_sequences.py | 33 +++++++----- mathics_scanner/tokeniser.py | 78 ++--------------------------- test/test_escape_sequences.py | 21 +++++--- 3 files changed, 39 insertions(+), 93 deletions(-) diff --git a/mathics_scanner/escape_sequences.py b/mathics_scanner/escape_sequences.py index f30deae8..35f6ab38 100644 --- a/mathics_scanner/escape_sequences.py +++ b/mathics_scanner/escape_sequences.py @@ -45,9 +45,7 @@ def parse_base(source_text: str, start_shift: int, end_shift: int, base: int) -> return chr(result) -def parse_named_character( - source_text: str, pos: int, start_shift: int -) -> Optional[str]: +def parse_named_character(source_text: str, start: int, finish: int) -> Optional[str]: r"""Before calling we have matched "\[". Scan to the remaining "]" and try to match what is found in-between with a known named character, e.g. "Theta". If we can match this, we store @@ -55,13 +53,13 @@ def parse_named_character( If we can't find a named character, error messages are issued and we leave ``line_fragments`` untouched. """ - named_character = source_text[pos + start_shift : pos + start_shift] + named_character = source_text[start:finish] if named_character.isalpha(): char = named_characters.get(named_character) if char is None: raise NamedCharacterSyntaxError("sntufn", named_character) else: - return named_character + return char def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]: @@ -77,24 +75,35 @@ def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]: # https://www.wolfram.com/language/12/networking-and-system-operations/use-the-full-range-of-unicode-characters.html # describes hex encoding. if c == ".": - # See if we have a 2-digit hexadecimal number. - # For example, \.42 is "B" + # see if we have a 2-digit hexadecimal number. + # for example, \.42 is "b" result += parse_base(source_text, pos + 1, pos + 3, 16) pos += 3 elif c == ":": - # See if we have a 4-digit hexadecimal number. - # For example, \:03B8" is Unicode small leter theta: θ. + # see if we have a 4-digit hexadecimal number. + # for example, \:03b8" is unicode small leter theta: θ. result += parse_base(source_text, pos + 1, pos + 5, 16) pos += 5 elif c == "|": - # See if we have a 6-digit hexadecimal number. + # see if we have a 6-digit hexadecimal number. result += parse_base(source_text, pos + 1, pos + 7, 16) pos += 7 elif c == "[": - named_character = parse_named_character(source_text, pos, 2) + pos += 1 + i = pos + 1 + while i < len(source_text): + if source_text[i] == "]": + break + i += 1 + if i == len(source_text): + # Note: named characters do not have \n's in them. (Is this right)? 
+ # FIXME: decide what to do here. + raise EscapeSyntaxError("Syntax", "stresc" rf"\{c}.") + + named_character = parse_named_character(source_text, pos, i) if named_character is not None: result += named_character - pos += 4 # ??? + pos = i + 1 elif c in "01234567": # See if we have a 3-digit octal number. # For example \065 = "5" diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 62ea3dec..977f97a2 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -10,8 +10,9 @@ import string from typing import Dict, List, Optional, Tuple -from mathics_scanner.characters import _letterlikes, _letters, named_characters +from mathics_scanner.characters import _letterlikes, _letters from mathics_scanner.errors import IncompleteSyntaxError, ScanError +from mathics_scanner.escape_sequences import parse_escape_sequence try: import ujson @@ -516,10 +517,9 @@ def __init__(self, feeder): ) self.pos: int = 0 self.feeder = feeder + self.source_text = "" - # FIXME: remove this - self.prescanner = Prescanner(feeder) - self.source_text = self.prescanner.replace_escape_sequences() + # FIXME: remove this. self.mode: str = "invalid" # Set to True when inside box parsing. @@ -608,85 +608,17 @@ def next(self) -> Token: if override is not None: return override(pattern_match) - # Failing a custom tokenization rule, we use the regular expression - # pattern match. text = pattern_match.group(0) self.pos = pattern_match.end(0) return Token(tag, text, pattern_match.start(0)) - def try_parse_base(self, start_shift: int, end_shift: int, base: int) -> str: - r""" - See if characters self.pos+start_shift .. self.pos+end shift - can be converted to an integer in base ``base``. - - If so, chr(integer value converted from base). - - However, if the conversion fails, then error messages are - issued and nothing is updated - """ - start, end = self.pos + start_shift, self.pos + end_shift - result = None - if end <= len(self.source_text): - text = self.source_text[start:end] - try: - result = int(text, base) - except ValueError: - pass # result remains None - if result is None: - last = end - start - if last == 2: - self.feeder.message("Syntax", "sntoct2") - elif last == 3: - self.feeder.message("Syntax", "sntoct1") - elif last == 4: - self.feeder.message("Syntax", "snthex") - else: - raise ValueError() - error_text = self.source_text[self.pos :].rstrip("\n") - self.feeder.message("Syntax", "sntxb", error_text) - raise ScanError("syntx", error_text) - - return chr(result) - - def try_parse_named_character(self, start_shift: int) -> Optional[str]: - r"""Before calling we have matched "\[". Scan to the remaining "]" and - try to match what is found in-between with a known named - character, e.g. "Theta". If we can match this, we store - the unicode character equivalent in ``line_fragments``. - If we can't find a named character, error messages are - issued and we leave ``line_fragments`` untouched. 
- """ - named_character = self.source_text[self.pos + start_shift : self.pos + start_shift] - if named_character.isalpha(): - char = named_characters.get(named_character) - if char is None: - self.feeder.message("Syntax", "sntufn", named_character) - else: - return named_character - def _skip_blank(self): "Skip whitespace and comments" comment = [] # start positions of comments while True: if self.pos >= len(self.source_text): if comment: - try: - self.get_more_input() - except ValueError: - # `get_more_input` tries to parse substrings like `\|AAAAA` - # that can be interpreted as a character reference. - # To do that, it tries to get the - # new line using the method - # `Prescanner.replace_escape_sequences()` - # Inside a comment, the special meaning of escape sequences - # like `\|` should not be taken into account. - # - # In case of error, just let's pick the code - # from the `input_line` attribute of - # prescanner: - self.source_text = self.prescanner.input_line - # TODO: handle the corner case where the rest of the line - # include escaped sequences, out of the comment. + self.get_more_input() else: break if comment: diff --git a/test/test_escape_sequences.py b/test/test_escape_sequences.py index 889e46c2..86ef583b 100644 --- a/test/test_escape_sequences.py +++ b/test/test_escape_sequences.py @@ -9,24 +9,29 @@ def test_escape_sequences(): ("abc \\\\", 5, 6, "\\", "backslash at end"), ("abc \\\\n", 5, 6, "\\", "backslash in middle"), # Octal - (r"051", 0, 3, chr(0o51), "character at beginning"), + (r"050", 0, 3, chr(0o50), "character at beginning"), (r"a\051", 2, 5, chr(0o51), "Octal character in middle"), - # With dot + # With dot (2-character hex) (r".30", 0, 3, chr(0x30), "two-character hex"), ( - r"a\.3015", + r"a\.3115", 2, 5, - chr(0x30), + chr(0x31), "two-character hex in middle with trailing digits", ), (r"b\.4dXYZ", 2, 5, chr(0x4D), "two-character hex in middle"), - # With colon + # With colon (4-character hex) (r":0030", 0, 5, "0", "four-character hex"), - (r":03B8", 0, 5, "\u03B8", "four-character hex unicode uppercase"), + (r":03B5", 0, 5, "\u03B5", "four-character hex unicode uppercase"), (r":03B8", 0, 5, "\u03b8", "four-character hex unicode lowercase"), - # With Vertical bar - (r"|01d451", 0, 7, "\U0001D451", "six-character hex unicode lowercase"), + # With Vertical bar (6-character hex) + (r"|01d450", 0, 7, "\U0001D450", "six-character hex unicode lowercase"), (r"|01D451", 0, 7, "\U0001D451", "six-character hex unicode uppercase"), + # Named Characters + ("[Theta]", 0, 7, "\u03B8", "Named character; full string"), + ("abcd[CapitalPi]efg", 4, 15, "\u03A0", "Named character; internal"), + (r"z \[Conjugate]", 3, 14, "\uF3C8", "Named character; at end"), + ("[Integral]", 0, 10, "\u222b", "Another full-string named-character"), ): assert parse_escape_sequence(text, pos) == (expect_str, expect_pos), fail_msg From f6846a2d22bccb109d2c693d3d08a29975320f87 Mon Sep 17 00:00:00 2001 From: rocky Date: Mon, 14 Apr 2025 11:25:02 -0400 Subject: [PATCH 06/34] Remove prescanner and .. handle syntax errors in mathics3-tokens. 
--- mathics_scanner/escape_sequences.py | 4 ++-- mathics_scanner/tokeniser.py | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/mathics_scanner/escape_sequences.py b/mathics_scanner/escape_sequences.py index 35f6ab38..8f23b2b6 100644 --- a/mathics_scanner/escape_sequences.py +++ b/mathics_scanner/escape_sequences.py @@ -114,8 +114,8 @@ def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]: # Note that these are a similer to Python, but are different. # In particular, Python defines "\a" to be ^G (control G), # but in WMA, this is invalid. - elif c in "ntbfr": - if c == "n": + elif c in "ntbfr\n": + if c in "n\n": result += "\n" elif c == "t": result += "\t" diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 977f97a2..3f442556 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -517,9 +517,7 @@ def __init__(self, feeder): ) self.pos: int = 0 self.feeder = feeder - self.source_text = "" - - # FIXME: remove this. + self.source_text = self.feeder.feed() self.mode: str = "invalid" # Set to True when inside box parsing. @@ -579,7 +577,7 @@ def next(self) -> Token: "Returns the next token from self.source_text." self._skip_blank() source_text = self.source_text - if self.pos >= len(self.source_text): + if self.pos >= len(source_text): return Token("END", "", len(source_text)) # Look for a matching pattern. From ccfe94357320fd22672002d0ca59e98edf662ace Mon Sep 17 00:00:00 2001 From: rocky Date: Mon, 14 Apr 2025 11:57:51 -0400 Subject: [PATCH 07/34] Rename some variables Tokenizer.code -> Tokenizer.source_text Tokenizer.incomplete -> Tokenizer.get_more_input --- mathics_scanner/tokeniser.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 3f442556..52efc21b 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -453,7 +453,7 @@ def is_symbol_name(text: str) -> bool: class Token: """A representation of a Wolfram-Language token. - Tokens are parsed by parser uses to build M-expressions. + Tokens are parsed by the parser; and are used to build M-expressions. A token has a `tag`, the class or type of the token. For example: a Number, Symbol, String, File, etc. @@ -466,11 +466,6 @@ class Token: """ def __init__(self, tag: str, text: str, pos: int): - """ - :param tag: which type of token this is. - :param text: The actual contents of the token. - :param pos: The position of the token in the input feed. - """ self.tag = tag self.text = text self.pos = pos @@ -518,6 +513,7 @@ def __init__(self, feeder): self.pos: int = 0 self.feeder = feeder self.source_text = self.feeder.feed() + self.mode: str = "invalid" # Set to True when inside box parsing. From 3d0a2f72ef38d55eee9edde159ff42c8dfdf0cf5 Mon Sep 17 00:00:00 2001 From: rocky Date: Mon, 14 Apr 2025 13:00:58 -0400 Subject: [PATCH 08/34] Bang more on mathics3-tokens Start to show syntax errors. --- mathics_scanner/mathics3_tokens.py | 1 + mathics_scanner/tokeniser.py | 39 ++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/mathics_scanner/mathics3_tokens.py b/mathics_scanner/mathics3_tokens.py index db6a163c..8f06db74 100644 --- a/mathics_scanner/mathics3_tokens.py +++ b/mathics_scanner/mathics3_tokens.py @@ -181,6 +181,7 @@ def interactive_eval_loop(shell: TerminalShell, code_tokenize_format: bool): "sntufn", "Unknown unicode longname", ) + except KeyboardInterrupt: print("\nKeyboardInterrupt. 
Type Ctrl-D (EOF) to exit.")
         except EOFError:
diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py
index 52efc21b..8f8a0e7a 100644
--- a/mathics_scanner/tokeniser.py
+++ b/mathics_scanner/tokeniser.py
@@ -602,6 +602,8 @@ def next(self) -> Token:
             if override is not None:
                 return override(pattern_match)
 
+        # Failing a custom tokenization rule, we use the regular expression
+        # pattern match.
         text = pattern_match.group(0)
         self.pos = pattern_match.end(0)
         return Token(tag, text, pattern_match.start(0))
@@ -672,6 +674,43 @@ def t_PutAppend(self, pattern_match: re.Match) -> Token:
         "Scan for a ``PutAppend`` token and return that"
         return self._token_mode(pattern_match, "PutAppend", "filename")
 
+    def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token:
+        """Break out from ``pattern_match`` tokens which start with \\"""
+        source_text = self.source_text
+        start_pos = self.pos + 1
+        if start_pos == len(source_text):
+            # We have reached end of the input line before seeing a terminating
+            # quote ("). Fetch another line.
+            self.get_more_input()
+            self.pos += 1
+            source_text += self.source_text
+        escape_str, self.pos = parse_escape_sequence(source_text, start_pos)
+
+        # DRY with "next()"
+        # look for a matching pattern
+        indices = self.token_indices.get(escape_str[0], ())
+        pattern_match = None
+        tag = "??invalid"
+        if indices:
+            for index in indices:
+                tag, pattern = self.tokens[index]
+                pattern_match = pattern.match(escape_str, 0)
+                if pattern_match is not None:
+                    break
+        else:
+            for tag, pattern in self.tokens:
+                pattern_match = pattern.match(escape_str, start_pos)
+                if pattern_match is not None:
+                    break
+
+        # no matching pattern found
+        if pattern_match is None:
+            tag, pre, post = self.sntx_message()
+            raise ScanError(tag, pre, post)
+
+        text = pattern_match.group(0)
+        return Token(tag, text, pattern_match.start(0))
+
     def t_String(self, _: re.Match) -> Token:

From 1c03e8bbd84506dd73c5b252a2c96950bfc4cc61 Mon Sep 17 00:00:00 2001
From: rocky
Date: Tue, 15 Apr 2025 11:10:18 -0400
Subject: [PATCH 09/34] Start going over error messages...

In particular, errors with octal digits and incomplete named-character
errors.

Go over docstrings in escape_sequences.py
---
 mathics_scanner/escape_sequences.py | 56 +++++++++++++++--------------
 mathics_scanner/mathics3_tokens.py  |  1 -
 mathics_scanner/tokeniser.py        |  6 +++-
 test/test_escape_sequences.py       |  9 +++++
 4 files changed, 44 insertions(+), 28 deletions(-)

diff --git a/mathics_scanner/escape_sequences.py b/mathics_scanner/escape_sequences.py
index 8f23b2b6..913acaf0 100644
--- a/mathics_scanner/escape_sequences.py
+++ b/mathics_scanner/escape_sequences.py
@@ -17,24 +17,20 @@ def parse_base(source_text: str, start_shift: int, end_shift: int, base: int) ->
     See if characters start_shift .. end shift
     can be converted to an integer in base ``base``.
 
-    If so, chr(integer value converted from base).
+    If so, chr(integer value converted from base) is returned.
 
-    However, if the conversion fails, then error messages are
-    issued and nothing is updated
+    However, if the conversion fails, ScanError is raised.
""" - start, end = start_shift, end_shift - result = None - if end <= len(source_text): - text = source_text[start:end] - try: - result = int(text, base) - except ValueError: - pass # result remains None - if result is None: - last = end - start + assert start_shift <= end_shift <= len(source_text) + text = source_text[start_shift:end_shift] + try: + result = int(text, base) + except ValueError: + last = start_shift - end_shift if last == 2: tag = "sntoct2" elif last == 3: + assert base == 8, "Only octal requires 3 digits" tag = "sntoct1" elif last == 4: tag = "snthex" @@ -46,12 +42,18 @@ def parse_base(source_text: str, start_shift: int, end_shift: int, base: int) -> def parse_named_character(source_text: str, start: int, finish: int) -> Optional[str]: - r"""Before calling we have matched "\[". Scan to the remaining "]" and - try to match what is found in-between with a known named - character, e.g. "Theta". If we can match this, we store - the unicode character equivalent in ``line_fragments``. - If we can't find a named character, error messages are - issued and we leave ``line_fragments`` untouched. + r""" + Find the unicode-equivalent symbol for a string named character. + + Before calling we have matched the text between "\[" and "]" of the input. + + The name character is thus in source_text[start:finish]. + + Match this string with the known named characters, + e.g. "Theta". If we can match this, then we return the unicode equivalent from the + `named_characters` map (which is read in from JSON but stored in a YAML file). + + If we can't find the named character, rasie NamedCharacterSyntaxError. """ named_character = source_text[start:finish] if named_character.isalpha(): @@ -63,9 +65,9 @@ def parse_named_character(source_text: str, start: int, finish: int) -> Optional def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]: - """ - Given some source text `source_text` at position `pos`, return the escape sequence and the - follow-on position. + """Given some source text in `source_text` starting at offset + `pos`, return the escape-sequence value for this text and the + follow-on offset position. """ result = "" c = source_text[pos] @@ -98,12 +100,14 @@ def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]: if i == len(source_text): # Note: named characters do not have \n's in them. (Is this right)? # FIXME: decide what to do here. - raise EscapeSyntaxError("Syntax", "stresc" rf"\{c}.") + raise NamedCharacterSyntaxError("Syntax", "sntufn", source_text[pos:]) named_character = parse_named_character(source_text, pos, i) - if named_character is not None: - result += named_character - pos = i + 1 + if named_character is None: + raise NamedCharacterSyntaxError("Syntax", "sntufn", source_text[pos:i]) + + result += named_character + pos = i + 1 elif c in "01234567": # See if we have a 3-digit octal number. # For example \065 = "5" diff --git a/mathics_scanner/mathics3_tokens.py b/mathics_scanner/mathics3_tokens.py index 8f06db74..db6a163c 100644 --- a/mathics_scanner/mathics3_tokens.py +++ b/mathics_scanner/mathics3_tokens.py @@ -181,7 +181,6 @@ def interactive_eval_loop(shell: TerminalShell, code_tokenize_format: bool): "sntufn", "Unknown unicode longname", ) - except KeyboardInterrupt: print("\nKeyboardInterrupt. 
Type Ctrl-D (EOF) to exit.") except EOFError: diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 8f8a0e7a..9543b19d 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -684,7 +684,11 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token: self.get_more_input() self.pos += 1 source_text += self.source_text - escape_str, self.pos = parse_escape_sequence(source_text, start_pos) + try: + escape_str, self.pos = parse_escape_sequence(source_text, start_pos) + except ScanError as scan_error: + self.feeder.message("Syntax", scan_error.tag, scan_error.args[0]) + raise # DRY with "next()" # look for a matching pattern diff --git a/test/test_escape_sequences.py b/test/test_escape_sequences.py index 86ef583b..b5b94201 100644 --- a/test/test_escape_sequences.py +++ b/test/test_escape_sequences.py @@ -1,4 +1,7 @@ # -*- coding: utf-8 -*- +import pytest + +from mathics_scanner.errors import NamedCharacterSyntaxError from mathics_scanner.escape_sequences import parse_escape_sequence @@ -35,3 +38,9 @@ def test_escape_sequences(): ("[Integral]", 0, 10, "\u222b", "Another full-string named-character"), ): assert parse_escape_sequence(text, pos) == (expect_str, expect_pos), fail_msg + + +def test_incomplete_named_character_sequences(): + for text in (r"\[", r"\[Theta", r"\[Fake]", r"\[abc]"): + with pytest.raises(NamedCharacterSyntaxError): + parse_escape_sequence(text, 1) From 3c1b9770d11862cdd9d65f606300ab1352769aa5 Mon Sep 17 00:00:00 2001 From: rocky Date: Wed, 16 Apr 2025 22:00:58 -0400 Subject: [PATCH 10/34] Improve error handling... and add more tests. --- mathics_scanner/escape_sequences.py | 26 +++++++++++++++----------- test/test_escape_sequences.py | 28 ++++++++++++++++++++++++++-- 2 files changed, 41 insertions(+), 13 deletions(-) diff --git a/mathics_scanner/escape_sequences.py b/mathics_scanner/escape_sequences.py index 913acaf0..2e8c8fed 100644 --- a/mathics_scanner/escape_sequences.py +++ b/mathics_scanner/escape_sequences.py @@ -21,21 +21,25 @@ def parse_base(source_text: str, start_shift: int, end_shift: int, base: int) -> However, if the conversion fails, ScanError is raised. 
""" - assert start_shift <= end_shift <= len(source_text) + last = end_shift - start_shift + if last == 2: + tag = "sntoct2" + elif last == 3: + assert base == 8, "Only octal requires 3 digits" + tag = "sntoct1" + elif last in (4, 6): + tag = "snthex" + else: + raise ValueError() + + if end_shift > len(source_text): + raise ScanError("Syntax", tag) + + assert start_shift <= end_shift text = source_text[start_shift:end_shift] try: result = int(text, base) except ValueError: - last = start_shift - end_shift - if last == 2: - tag = "sntoct2" - elif last == 3: - assert base == 8, "Only octal requires 3 digits" - tag = "sntoct1" - elif last == 4: - tag = "snthex" - else: - raise ValueError() raise ScanError(tag, source_text[start_shift:].rstrip("\n")) return chr(result) diff --git a/test/test_escape_sequences.py b/test/test_escape_sequences.py index b5b94201..1547d066 100644 --- a/test/test_escape_sequences.py +++ b/test/test_escape_sequences.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import pytest -from mathics_scanner.errors import NamedCharacterSyntaxError +from mathics_scanner.errors import NamedCharacterSyntaxError, ScanError from mathics_scanner.escape_sequences import parse_escape_sequence @@ -40,7 +40,31 @@ def test_escape_sequences(): assert parse_escape_sequence(text, pos) == (expect_str, expect_pos), fail_msg -def test_incomplete_named_character_sequences(): +def test_invalid_named_character_sequences(): for text in (r"\[", r"\[Theta", r"\[Fake]", r"\[abc]"): with pytest.raises(NamedCharacterSyntaxError): parse_escape_sequence(text, 1) + + +def test_invalid_number_encoding(): + for text in ( + # Octal + "093", # 9 is not in 0-7 + "01", # need 3 characters + "01", # need 3 characters + # 2-character hex + ".", + ".0", + ".0i", # i is not in 0-f + # 4-character hex + ":", + ":A", + ":A1", + ":ak", + ":A10", + ":a1g", + ":A1g9", + ":01-2", + ): + with pytest.raises(ScanError): + parse_escape_sequence(text, 0) From ded888503971256e531e1b56fa8753cccdbf6b39 Mon Sep 17 00:00:00 2001 From: rocky Date: Wed, 14 May 2025 12:27:02 -0400 Subject: [PATCH 11/34] Improve scanner... 
named-characters.yml: \[Mu] is letterlike
tokeniser.py: Correct the identifier pattern for those having letterlike
escape sequences
---
 mathics_scanner/tokeniser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py
index 9543b19d..e6f82b5c 100644
--- a/mathics_scanner/tokeniser.py
+++ b/mathics_scanner/tokeniser.py
@@ -703,7 +703,7 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token:
                     break
         else:
             for tag, pattern in self.tokens:
-                pattern_match = pattern.match(escape_str, start_pos)
+                pattern_match = pattern.match(escape_str, 0)
                 if pattern_match is not None:
                     break
 
From 41fdc74745bcaab3623e88b7f6da1fe0379da2cd Mon Sep 17 00:00:00 2001
From: rocky
Date: Fri, 16 May 2025 13:07:47 -0400
Subject: [PATCH 12/34] Handle EscapeSequence errors better

---
 mathics_scanner/escape_sequences.py |  2 +-
 mathics_scanner/tokeniser.py        | 11 ++++++++---
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/mathics_scanner/escape_sequences.py b/mathics_scanner/escape_sequences.py
index 2e8c8fed..dd420e2e 100644
--- a/mathics_scanner/escape_sequences.py
+++ b/mathics_scanner/escape_sequences.py
@@ -139,5 +139,5 @@ def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]:
         result += c
         pos += 1
     else:
-        raise EscapeSyntaxError("Syntax", "stresc" rf"\{c}.")
+        raise EscapeSyntaxError("Syntax", "stresc", rf"\{c}")
     return result, pos
diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py
index e6f82b5c..481a2520 100644
--- a/mathics_scanner/tokeniser.py
+++ b/mathics_scanner/tokeniser.py
@@ -11,7 +11,7 @@
 from typing import Dict, List, Optional, Tuple
 
 from mathics_scanner.characters import _letterlikes, _letters
-from mathics_scanner.errors import IncompleteSyntaxError, ScanError
+from mathics_scanner.errors import EscapeSyntaxError, IncompleteSyntaxError, ScanError
 from mathics_scanner.escape_sequences import parse_escape_sequence
 
 try:
@@ -743,10 +743,15 @@ def t_String(self, _: re.Match) -> Token:
             if char == "\\":
                 if self.pos + 1 == len(source_text):
                     # We have reached end of the input line before seeing a terminating
-                    # quote ("). Fetch aanother line.
+                    # quote ("). Fetch another line.
                     self.get_more_input()
                 self.pos += 1
-                escape_str, self.pos = parse_escape_sequence(source_text, self.pos)
+                try:
+                    escape_str, self.pos = parse_escape_sequence(source_text, self.pos)
+                except EscapeSyntaxError as e:
+                    self.feeder.message(e.name, *e.args)
+                    raise
 
                 result += escape_str
             else:
                 result += self.source_text[self.pos]

From fa9b1a9c4a7ac6a865964e50fd161d79640b90bc Mon Sep 17 00:00:00 2001
From: rocky
Date: Fri, 16 May 2025 22:32:46 -0400
Subject: [PATCH 13/34] Handle embedded escape sequences in Symbols...

and also add Theta to the list of letterlike symbols
---
 mathics_scanner/tokeniser.py | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py
index 481a2520..14c6d46f 100644
--- a/mathics_scanner/tokeniser.py
+++ b/mathics_scanner/tokeniser.py
@@ -606,6 +606,27 @@ def next(self) -> Token:
         # pattern match.
         text = pattern_match.group(0)
         self.pos = pattern_match.end(0)
+
+        if tag == "Symbol":
+            # We have to keep searching for the end of the Symbol if
+            # the next symbol is a backslash, "\", because it might be a
+            # named-letterlike character such as \[Mu] or an escape representation of a number or
+            # character.
+            # abc\[Mu] is a valid 4-character symbol.
+            while self.pos < len(source_text) and source_text[self.pos] == "\\":
+                try:
+                    escape_str, next_pos = parse_escape_sequence(
+                        self.source_text, self.pos + 1
+                    )
+                except ScanError as scan_error:
+                    self.feeder.message("Syntax", scan_error.tag, scan_error.args[0])
+                    raise
+                if escape_str in _letterlikes + "0123456789":
+                    text += escape_str
+                    self.pos = next_pos
+                else:
+                    break
+
         return Token(tag, text, pattern_match.start(0))

     def _skip_blank(self):

From 8c582f5d288c2d5fbc9e3b2889e4d50378fae73f Mon Sep 17 00:00:00 2001
From: rocky
Date: Sat, 17 May 2025 20:00:48 -0400
Subject: [PATCH 14/34] WIP - bang on Symbol tokenization with backslash

Replace .format() with f-strings.
Add comments around Symbol pattern.
sntx_message() Exception now saves name, tag, and args
---
 mathics_scanner/tokeniser.py | 32 ++++++++++++++++++++++++++++----
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py
index 14c6d46f..06cf42ae 100644
--- a/mathics_scanner/tokeniser.py
+++ b/mathics_scanner/tokeniser.py
@@ -612,7 +612,8 @@ def next(self) -> Token:
             # the next symbol is a backslash, "\", because it might be a
             # named-letterlike character such as \[Mu] or a escape representation of number or
             # character.
-            # abc\[Mu] is a valid 4-character Symbol.
+            # abc\[Mu] is a valid 4-character Symbol. And we can have things like
+            # abc\[Mu]\[Mu]def\[Mu]1
             while self.pos < len(source_text) and source_text[self.pos] == "\\":
                 try:
                     escape_str, next_pos = parse_escape_sequence(
                         self.source_text, self.pos + 1
@@ -711,8 +712,9 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token:
             self.feeder.message("Syntax", scan_error.tag, scan_error.args[0])
             raise

-        # DRY with "next()"
+        # Is there a way to DRY with "next()?"
+
-        # look for a matching pattern
+        # Look for a matching pattern.
         indices = self.token_indices.get(escape_str[0], ())
         pattern_match = None
         tag = "??invalid"
@@ -728,12 +730,34 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token:
             if pattern_match is not None:
                 break

-        # no matching pattern found
+        # No matching pattern found.
         if pattern_match is None:
             tag, pre, post = self.sntx_message()
             raise ScanError(tag, pre, post)

         text = pattern_match.group(0)
+
+        if tag == "Symbol":
+            # We have to keep searching for the end of the Symbol if
+            # the next symbol is a backslash, "\", because it might be a
+            # named-letterlike character such as \[Mu] or a escape representation of number or
+            # character.
+            # \[Mu]2 is a valid 2-character Symbol, and we can have things like
+            # \[Mu]\[Mu]def\[Mu]1.
+ while self.pos < len(source_text) and source_text[self.pos] == "\\": + try: + escape_str, next_pos = parse_escape_sequence( + self.source_text, self.pos + 1 + ) + except ScanError as scan_error: + self.feeder.message("Syntax", scan_error.tag, scan_error.args[0]) + raise + if escape_str in _letterlikes + "0123456789": + text += escape_str + self.pos = next_pos + else: + break + return Token(tag, text, pattern_match.start(0)) def t_String(self, _: re.Match) -> Token: From c1c015ce7d08413f1bce611f9f6d09911aba7915 Mon Sep 17 00:00:00 2001 From: rocky Date: Sun, 18 May 2025 12:05:39 -0400 Subject: [PATCH 15/34] Be able to whether we are in a RowBox --- mathics_scanner/tokeniser.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 06cf42ae..e00bb551 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -619,10 +619,16 @@ def next(self) -> Token: escape_str, next_pos = parse_escape_sequence( self.source_text, self.pos + 1 ) - except ScanError as scan_error: - self.feeder.message("Syntax", scan_error.tag, scan_error.args[0]) + except EscapeSyntaxError as escape_error: + if self.is_inside_box: + # Follow-on symbol may be a escape character that can + # appear only in box constructs, e.g. \%. + break + self.feeder.message( + "Syntax", escape_error.tag, escape_error.args[0] + ) raise - if escape_str in _letterlikes + "0123456789": + if escape_str in _letterlikes: text += escape_str self.pos = next_pos else: From 68346c05e6910c9daea3ae3ef2589e1308ee4aed Mon Sep 17 00:00:00 2001 From: rocky Date: Sun, 18 May 2025 22:34:43 -0400 Subject: [PATCH 16/34] Handle no-meaning operators Not sure how this worked before, but it did. --- mathics_scanner/tokeniser.py | 69 ++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 19 deletions(-) diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index e00bb551..773be5e4 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -625,7 +625,7 @@ def next(self) -> Token: # appear only in box constructs, e.g. \%. break self.feeder.message( - "Syntax", escape_error.tag, escape_error.args[0] + escape_error.name, escape_error.tag, escape_error.args ) raise if escape_str in _letterlikes: @@ -706,21 +706,29 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token: """Break out from ``pattern_match`` tokens which start with \\""" source_text = self.source_text start_pos = self.pos + 1 + named_character = "" if start_pos == len(source_text): - # We have reached end of the input line before seeing a terminating - # quote ("). Fetch another line. + # We have reached end of the input line before seeing a termination + # of backslash. Fetch another line. self.get_more_input() self.pos += 1 source_text += self.source_text try: escape_str, self.pos = parse_escape_sequence(source_text, start_pos) - except ScanError as scan_error: - self.feeder.message("Syntax", scan_error.tag, scan_error.args[0]) + if source_text[start_pos] == "[" and source_text[self.pos - 1] == "]": + named_character = source_text[start_pos + 1 : self.pos - 1] + except EscapeSyntaxError as escape_error: + self.feeder.message(escape_error.name, escape_error.tag, escape_error.args) raise # Is there a way to DRY with "next()?" - # Look for a matching pattern. 
+ if named_character != "": + if named_character in NO_MEANING_OPERATORS: + return Token(named_character, escape_str, start_pos - 1) + + # Look for a pattern matching leading context \. + indices = self.token_indices.get(escape_str[0], ()) pattern_match = None tag = "??invalid" @@ -736,7 +744,7 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token: if pattern_match is not None: break - # No matching pattern found. + # No matching found. if pattern_match is None: tag, pre, post = self.sntx_message() raise ScanError(tag, pre, post) @@ -744,23 +752,44 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token: text = pattern_match.group(0) if tag == "Symbol": - # We have to keep searching for the end of the Symbol if - # the next symbol is a backslash, "\", because it might be a - # named-letterlike character such as \[Mu] or a escape representation of number or - # character. - # \[Mu]2 is a valid 2-character Symbol, and we can have things like - # \[Mu]\[Mu]def\[Mu]1. - while self.pos < len(source_text) and source_text[self.pos] == "\\": + # We have to keep searching for the end of the Symbol + # after an escaped letterlike-symbol. For example, \[Mu] + # is a valid Symbol. But we can also have symbols for + # \[Mu]\[Theta], \[Mu]1, \[Mu]1a, \[Mu]\.42, \[Mu]\061, or \[Mu]\061abc + while True: + if self.pos >= len(source_text): + break + + # Try to extend symbol with non-escaped alphanumeric + # (and letterlike) symbols. + + # TODO: Do we need to add context breaks? And if so, + # do we need to check for consecutive ``'s? + alphanumeric_match = re.match( + f"[0-9${symbol_first_letter}]+", self.source_text[self.pos :] + ) + if alphanumeric_match is not None: + extension_str = alphanumeric_match.group(0) + text += extension_str + self.pos += len(extension_str) + + if source_text[self.pos] != "\\": + break + + # Now try to extend symbol with *escaped* alphanumeric (and letterlike) symbols. try: escape_str, next_pos = parse_escape_sequence( self.source_text, self.pos + 1 ) - except ScanError as scan_error: - self.feeder.message("Syntax", scan_error.tag, scan_error.args[0]) + except EscapeSyntaxError as escape_error: + self.feeder.message( + escape_error.name, escape_error.tag, escape_error.args + ) raise - if escape_str in _letterlikes + "0123456789": + if escape_str in _letterlikes + _letters + "0123456789$": text += escape_str self.pos = next_pos + # Look to extend the symbol for further else: break @@ -799,8 +828,10 @@ def t_String(self, _: re.Match) -> Token: self.pos += 1 try: escape_str, self.pos = parse_escape_sequence(source_text, self.pos) - except EscapeSyntaxError as e: - self.feeder.message(e.name, *e.args) + except EscapeSyntaxError as escape_error: + self.feeder.message( + escape_error.name, escape_error.tag, escape_error.args + ) raise result += escape_str From 3fe6a2beb3b74e97e50456d3ff8fdaccbaf074cb Mon Sep 17 00:00:00 2001 From: rocky Date: Mon, 19 May 2025 11:25:34 -0400 Subject: [PATCH 17/34] WIP misc fixes... * "$\" is a thing * Correct EscapeSyntaxError error message * Better Symbol tokenization for things like a\[Mu]1. More in next commit though. 
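For reference, the behavior this commit is after can be exercised directly through
parse_escape_sequence(); the sketch below is illustrative only, and the exact
(decoded-string, next-position) return values assume the contract exercised in
test_escape_sequences.py, where pos points just past the backslash:

    from mathics_scanner.escape_sequences import parse_escape_sequence

    # Named character: decode "\[Theta]" starting after the backslash.
    assert parse_escape_sequence(r"\[Theta]", 1) == ("\u03b8", 8)
    # Octal and 2-digit hexadecimal escapes.
    assert parse_escape_sequence(r"a\051", 2) == (")", 5)
    assert parse_escape_sequence(r"\.42", 1) == ("B", 4)
    # Per this commit, "\$" is kept as a two-character sequence.
    assert parse_escape_sequence(r"\$", 1) == (r"\$", 2)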
--- mathics_scanner/escape_sequences.py | 7 ++++-- mathics_scanner/tokeniser.py | 35 ++++++++++++++++++++++++----- 2 files changed, 34 insertions(+), 8 deletions(-) diff --git a/mathics_scanner/escape_sequences.py b/mathics_scanner/escape_sequences.py index dd420e2e..61c56b28 100644 --- a/mathics_scanner/escape_sequences.py +++ b/mathics_scanner/escape_sequences.py @@ -122,7 +122,7 @@ def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]: # Note that these are a similer to Python, but are different. # In particular, Python defines "\a" to be ^G (control G), # but in WMA, this is invalid. - elif c in "ntbfr\n": + elif c in "ntbfr$\n": if c in "n\n": result += "\n" elif c == "t": @@ -131,6 +131,9 @@ def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]: result += "\b" elif c == "f": result += "\f" + elif c == "$": + # I don't know why \$ is defined, but it is! + result += r"\$" else: assert c == "r" result += "\r" @@ -139,5 +142,5 @@ def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]: result += c pos += 1 else: - raise EscapeSyntaxError("Syntax", "stresc", rf"\{c}") + raise EscapeSyntaxError("stresc", rf"\{c}") return result, pos diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 773be5e4..06fe9545 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -607,6 +607,7 @@ def next(self) -> Token: text = pattern_match.group(0) self.pos = pattern_match.end(0) + # FIXME: DRY with code in RawBackslash if tag == "Symbol": # We have to keep searching for the end of the Symbol if # the next symbol is a backslash, "\", because it might be a @@ -614,7 +615,26 @@ def next(self) -> Token: # character. # abc\[Mu] is a valid 4-character Symbol. And we can have things like # abc\[Mu]\[Mu]def\[Mu]1 - while self.pos < len(source_text) and source_text[self.pos] == "\\": + while True: + if self.pos >= len(source_text): + break + + # Try to extend symbol with non-escaped alphanumeric + # (and letterlike) symbols. + + # TODO: Do we need to add context breaks? And if so, + # do we need to check for consecutive ``'s? + alphanumeric_match = re.match( + f"[0-9${symbol_first_letter}]+", self.source_text[self.pos :] + ) + if alphanumeric_match is not None: + extension_str = alphanumeric_match.group(0) + text += extension_str + self.pos += len(extension_str) + + if source_text[self.pos] != "\\": + break + try: escape_str, next_pos = parse_escape_sequence( self.source_text, self.pos + 1 @@ -625,7 +645,7 @@ def next(self) -> Token: # appear only in box constructs, e.g. \%. break self.feeder.message( - escape_error.name, escape_error.tag, escape_error.args + escape_error.name, escape_error.tag, *escape_error.args ) raise if escape_str in _letterlikes: @@ -718,7 +738,7 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token: if source_text[start_pos] == "[" and source_text[self.pos - 1] == "]": named_character = source_text[start_pos + 1 : self.pos - 1] except EscapeSyntaxError as escape_error: - self.feeder.message(escape_error.name, escape_error.tag, escape_error.args) + self.feeder.message(escape_error.name, escape_error.tag, *escape_error.args) raise # Is there a way to DRY with "next()?" @@ -752,6 +772,7 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token: text = pattern_match.group(0) if tag == "Symbol": + # FIXME: DRY with code in next() # We have to keep searching for the end of the Symbol # after an escaped letterlike-symbol. For example, \[Mu] # is a valid Symbol. 
But we can also have symbols for
@@ -776,12 +797,15 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token:
             if source_text[self.pos] != "\\":
                 break

-            # Now try to extend symbol with *escaped* alphanumeric (and letterlike) symbols.
             try:
                 escape_str, next_pos = parse_escape_sequence(
                     self.source_text, self.pos + 1
                 )
             except EscapeSyntaxError as escape_error:
+                if self.is_inside_box:
+                    # Follow-on symbol may be a escape character that can
+                    # appear only in box constructs, e.g. \%.
+                    break
                 self.feeder.message(
                     escape_error.name, escape_error.tag, escape_error.args
                 )
                 raise
             if escape_str in _letterlikes + _letters + "0123456789$":
                 text += escape_str
                 self.pos = next_pos
-                # Look to extend the symbol for further
             else:
                 break

From 17192927523c7d938b61a67f092361e2a9f0365e Mon Sep 17 00:00:00 2001
From: rocky
Date: Mon, 19 May 2025 12:02:03 -0400
Subject: [PATCH 18/34] Better Symbol-name extension test...

for things like \.78\.79

Improve comments around DRYing identifier/symbol_name extension
---
 mathics_scanner/tokeniser.py | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py
index 06fe9545..1cee58ab 100644
--- a/mathics_scanner/tokeniser.py
+++ b/mathics_scanner/tokeniser.py
@@ -607,7 +607,13 @@ def next(self) -> Token:
         text = pattern_match.group(0)
         self.pos = pattern_match.end(0)

-        # FIXME: DRY with code in RawBackslash
+        # The below similar to what we do in t_RawBackslash, but is is
+        # different. First, we need to look for a closing quote
+        # ("). Also, after parsing escape sequences, we can
+        # unconditionallhy add them on to the string. That is, we
+        # don't have to check whether the returned string can be valid
+        # in a Symbol name.
+
         if tag == "Symbol":
             # We have to keep searching for the end of the Symbol if
             # the next symbol is a backslash, "\", because it might be a
@@ -741,7 +747,7 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token:
             self.feeder.message(escape_error.name, escape_error.tag, *escape_error.args)
             raise

-        # Is there a way to DRY with "next()?"
+        # Is there a way to DRY with "next()?

         if named_character != "":
             if named_character in NO_MEANING_OPERATORS:
@@ -771,6 +777,9 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token:

         text = pattern_match.group(0)

+        # Is there a way to DRY with t_String?"
+        # See t_String for differences.
+
         if tag == "Symbol":
             # FIXME: DRY with code in next()
             # We have to keep searching for the end of the Symbol
@@ -787,7 +796,7 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token:
             # TODO: Do we need to add context breaks? And if so,
             # do we need to check for consecutive ``'s?
alphanumeric_match = re.match( - f"[0-9${symbol_first_letter}]+", self.source_text[self.pos :] + f"[0-9${symbol_first_letter}]+", source_text[self.pos :] ) if alphanumeric_match is not None: extension_str = alphanumeric_match.group(0) @@ -810,7 +819,7 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token: escape_error.name, escape_error.tag, escape_error.args ) raise - if escape_str in _letterlikes + _letters + "0123456789$": + if re.match(base_symbol_pattern, escape_str): text += escape_str self.pos = next_pos else: @@ -828,8 +837,16 @@ def t_String(self, _: re.Match) -> Token: newlines = [] source_text = self.source_text result = "" + + # The below similar to what we do in t_RawBackslash, but is is + # different. First, we need to look for a closing quote + # ("). Also, after parsing escape sequences, we can + # unconditionallhy add them on to the string. That is, we + # don't have to check whether the returned string can be valid + # in a Symbol name. + while True: - if self.pos >= len(self.source_text): + if self.pos >= len(source_text): if end is None: # reached end while still inside string self.get_more_input() From 42a3e8d1e025d7f5dcbadc466b13ba8fe6a473cd Mon Sep 17 00:00:00 2001 From: rocky Date: Tue, 20 May 2025 17:30:22 -0400 Subject: [PATCH 19/34] WIP - small tweaks before moving master forward This PR has gotten out of hand in size, we'll break it up into smaller chunks. --- mathics_scanner/escape_sequences.py | 4 +- mathics_scanner/tokeniser.py | 9 ++- test/test_prescanner.py | 96 ----------------------------- 3 files changed, 9 insertions(+), 100 deletions(-) delete mode 100644 test/test_prescanner.py diff --git a/mathics_scanner/escape_sequences.py b/mathics_scanner/escape_sequences.py index 61c56b28..2f86aed0 100644 --- a/mathics_scanner/escape_sequences.py +++ b/mathics_scanner/escape_sequences.py @@ -131,9 +131,9 @@ def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]: result += "\b" elif c == "f": result += "\f" - elif c == "$": + elif c in '$"': # I don't know why \$ is defined, but it is! - result += r"\$" + result += rf"\{c}" else: assert c == "r" result += "\r" diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 1cee58ab..3930ca84 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -776,6 +776,7 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token: raise ScanError(tag, pre, post) text = pattern_match.group(0) + start_pos = pattern_match.start(0) # Is there a way to DRY with t_String?" # See t_String for differences. @@ -819,13 +820,17 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token: escape_error.name, escape_error.tag, escape_error.args ) raise - if re.match(base_symbol_pattern, escape_str): + if re.match(interior_symbol_pattern, escape_str): text += escape_str self.pos = next_pos else: break - return Token(tag, text, pattern_match.start(0)) + elif tag == "String": + self.feeder.message("Syntax", "sntxi", text) + raise IncompleteSyntaxError("Syntax", "sntxi", text) + + return Token(tag, text, start_pos) def t_String(self, _: re.Match) -> Token: """Break out from self.source_text the next token which is expected to be a String. 
diff --git a/test/test_prescanner.py b/test/test_prescanner.py
deleted file mode 100644
index 25163d26..00000000
--- a/test/test_prescanner.py
+++ /dev/null
@@ -1,96 +0,0 @@
-# -*- coding: utf-8 -*-
-import pytest
-
-from mathics_scanner import IncompleteSyntaxError, ScanError
-from mathics_scanner.feed import SingleLineFeeder
-from mathics_scanner.prescanner import Prescanner
-
-
-def replace_escape_sequences(mathics_text: str):
-    prescanner = Prescanner(SingleLineFeeder(mathics_text))
-    return prescanner.replace_escape_sequences()
-
-
-def assert_invalid(mathics_text: str):
-    with pytest.raises(ScanError):
-        replace_escape_sequences(mathics_text)
-
-
-def assert_incomplete(mathics_text: str):
-    with pytest.raises(IncompleteSyntaxError):
-        replace_escape_sequences(mathics_text)
-
-
-def assert_equal(mathics_text: str, result: str):
-    assert replace_escape_sequences(mathics_text) == result
-
-
-def assert_equal_length(mathics_text: str, length):
-    assert len(replace_escape_sequences(mathics_text)) == length
-
-
-@pytest.mark.skip("Prescanner tests need to be integrated outside of prescanner")
-def test_named_characters():
-    assert_equal(r"\[Theta]", "\u03B8")
-    assert_equal(r"\[CapitalPi]", "\u03A0")
-    assert_equal(r"\[Fake]", r"\[Fake]")
-    assert_equal("z \\[Conjugate]", "z \uF3C8")
-    assert_equal("z \\[Integral]", "z \u222b")
-    assert_equal("z \\\\[Integral]", "z \\\\[Integral]")
-    assert_equal("z \\\\\\[Integral]", "z \\\\\u222b")
-    assert_equal("abc\\\\", "abc\\\\")
-
-
-@pytest.mark.skip("Prescanner tests need to be integrated outside of prescanner")
-def test_text_lengths():
-    assert_equal_length(r'"\[Integral]"', 3)
-    # Prescanner keep both slashes and quotes.
-    # The tokenizer brings \\ into \ if it appears
-    # inside a string.
-    assert_equal_length(r'"\\[Integral]"', 14)
-
-
-@pytest.mark.skip("Prescanner tests need to be integrated outside of prescanner")
-def test_incomplete():
-    assert_incomplete(r"\[")
-    assert_incomplete(r"\[Theta")
-
-
-@pytest.mark.skip("Prescanner tests need to be integrated outside of prescanner")
-def test_invalid_octal():
-    assert_invalid(r"\093")
-    assert_invalid(r"\01")
-
-
-@pytest.mark.skip("Prescanner tests need to be integrated outside of prescanner")
-def test_invalid_colon():
-    assert_invalid(r"\:")
-    assert_invalid(r"\:A")
-    assert_invalid(r"\:01")
-    assert_invalid(r"\:A1")
-    assert_invalid(r"\:ak")
-    assert_invalid(r"\:A10")
-    assert_invalid(r"\:a1g")
-    assert_invalid(r"\:A1g9")
-    assert_invalid(r"\:01-2")
-
-
-@pytest.mark.skip("Prescanner tests need to be integrated outside of prescanner")
-def test_invalid_dot():
-    assert_invalid(r"\.")
-    assert_invalid(r"\.0")
-
-
-@pytest.mark.skip("Prescanner tests need to be integrated outside of prescanner")
-def test_combined():
-    assert_equal(r"\:03B8\[Theta]\.30\052", "\u03B8\u03B80*")
-
-
-@pytest.mark.skip("Prescanner tests need to be integrated outside of prescanner")
-def test_nested():
-    assert_equal(r"\[Thet\141]", r"\[Thet\141]")
-
-
-@pytest.mark.skip("Prescanner tests need to be integrated outside of prescanner")
-def test_trailing_backslash():
-    assert_incomplete("x \\")

From 9c596becdcca0cb2639c395decd5d4c1a5196dd0 Mon Sep 17 00:00:00 2001
From: rocky
Date: Thu, 29 May 2025 12:19:30 -0400
Subject: [PATCH 20/34] Small bugs related to escape-character handling

NamedCharacterSyntaxError should be a new-style TranslateError

self.code -> self.source_text

misc sntx_message() fixes.

Document better.
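As a sketch of what "new-style" means here (this assumes the constructor
signature given to TranslateErrorNew earlier in this series), the exception
instance itself now carries the message parts that used to be passed around
separately:

    from mathics_scanner.errors import NamedCharacterSyntaxError

    try:
        raise NamedCharacterSyntaxError("sntufn", "Fake")
    except NamedCharacterSyntaxError as err:
        # A new-style error stores name, tag, and args on the instance, so
        # callers can forward them: feeder.message(err.name, err.tag, *err.args)
        assert (err.name, err.tag, err.args) == ("Syntax", "sntufn", ("Fake",))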
--- mathics_scanner/errors.py | 2 +- mathics_scanner/feed.py | 6 ++-- mathics_scanner/tokeniser.py | 69 ++++++++++++++++++++++-------------- 3 files changed, 46 insertions(+), 31 deletions(-) diff --git a/mathics_scanner/errors.py b/mathics_scanner/errors.py index 212a82e3..9210878c 100644 --- a/mathics_scanner/errors.py +++ b/mathics_scanner/errors.py @@ -34,7 +34,7 @@ class InvalidSyntaxError(TranslateErrorNew): pass -class NamedCharacterSyntaxError(TranslateError): +class NamedCharacterSyntaxError(TranslateErrorNew): """Named character syntax error""" pass diff --git a/mathics_scanner/feed.py b/mathics_scanner/feed.py index 8ae0717f..9674ae53 100644 --- a/mathics_scanner/feed.py +++ b/mathics_scanner/feed.py @@ -130,14 +130,14 @@ def empty(self) -> bool: class SingleLineFeeder(LineFeeder): "A feeder that feeds all the code as a single line." - def __init__(self, code: str, filename=""): + def __init__(self, source_text: str, filename=""): """ :param code: The source of the feeder (a string). :param filename: A string that describes the source of the feeder, i.e. the filename that is being feed. """ super().__init__(filename) - self.code = code + self.source_text = source_text self._empty = False def feed(self) -> str: @@ -145,7 +145,7 @@ def feed(self) -> str: return "" self._empty = True self.lineno += 1 - return self.code + return self.source_text def empty(self) -> bool: return self._empty diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 3930ca84..c37ca71c 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -11,7 +11,13 @@ from typing import Dict, List, Optional, Tuple from mathics_scanner.characters import _letterlikes, _letters -from mathics_scanner.errors import EscapeSyntaxError, IncompleteSyntaxError, ScanError +from mathics_scanner.errors import ( + EscapeSyntaxError, + IncompleteSyntaxError, + InvalidSyntaxError, + NamedCharacterSyntaxError, + ScanError, +) from mathics_scanner.escape_sequences import parse_escape_sequence try: @@ -553,19 +559,29 @@ def is_inside_box(self) -> bool: def is_inside_box(self, value: bool) -> None: self._is_inside_box = value - def sntx_message(self, pos: Optional[int] = None) -> Tuple[str, str, str]: - """ - Send a "sntx{b,f} error message to the input-reading feeder. + def sntx_message(self, start_pos: Optional[int] = None) -> Tuple[str, int, int]: + """Send a "sntx{b,f} error message to the input-reading + feeder. + + The tag ("sntxb" or "sntxf"), position of the error, and blank-stripped + position to the end line are returned. """ - if pos is None: - pos = self.pos - pre, post = self.source_text[:pos], self.source_text[pos:].rstrip("\n") - if pos == 0: - self.feeder.message("Syntax", "sntxb", pre, post) - return "sntxb", pre, post + if start_pos is None: + start_pos = self.pos + trailing_fragment = self.source_text[start_pos:].strip() + end_pos = start_pos + len(trailing_fragment) + if start_pos == 0: + self.feeder.message("Syntax", "sntxb", trailing_fragment) + tag = "sntxb" else: - self.feeder.message("Syntax", "sntxf", pre, post) - return "sntxf", pre, post + self.feeder.message( + "Syntax", + "sntxf", + self.source_text[:start_pos].strip(), + trailing_fragment, + ) + tag = "syntx" + return tag, start_pos, end_pos # TODO: If this is converted this to __next__, then # a tokeniser object is iterable. @@ -573,6 +589,7 @@ def next(self) -> Token: "Returns the next token from self.source_text." 
self._skip_blank() source_text = self.source_text + if self.pos >= len(source_text): return Token("END", "", len(source_text)) @@ -645,7 +662,7 @@ def next(self) -> Token: escape_str, next_pos = parse_escape_sequence( self.source_text, self.pos + 1 ) - except EscapeSyntaxError as escape_error: + except (EscapeSyntaxError, NamedCharacterSyntaxError) as escape_error: if self.is_inside_box: # Follow-on symbol may be a escape character that can # appear only in box constructs, e.g. \%. @@ -739,16 +756,16 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token: self.get_more_input() self.pos += 1 source_text += self.source_text + try: escape_str, self.pos = parse_escape_sequence(source_text, start_pos) if source_text[start_pos] == "[" and source_text[self.pos - 1] == "]": named_character = source_text[start_pos + 1 : self.pos - 1] - except EscapeSyntaxError as escape_error: + except (EscapeSyntaxError, NamedCharacterSyntaxError) as escape_error: self.feeder.message(escape_error.name, escape_error.tag, *escape_error.args) raise # Is there a way to DRY with "next()? - if named_character != "": if named_character in NO_MEANING_OPERATORS: return Token(named_character, escape_str, start_pos - 1) @@ -811,7 +828,7 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token: escape_str, next_pos = parse_escape_sequence( self.source_text, self.pos + 1 ) - except EscapeSyntaxError as escape_error: + except (EscapeSyntaxError, NamedCharacterSyntaxError) as escape_error: if self.is_inside_box: # Follow-on symbol may be a escape character that can # appear only in box constructs, e.g. \%. @@ -828,16 +845,16 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token: elif tag == "String": self.feeder.message("Syntax", "sntxi", text) - raise IncompleteSyntaxError("Syntax", "sntxi", text) + raise InvalidSyntaxError("Syntax", "sntxi", text) return Token(tag, text, start_pos) - def t_String(self, _: re.Match) -> Token: + def t_String(self, _: Optional[re.Match]) -> Token: """Break out from self.source_text the next token which is expected to be a String. The string value of the returned token will have double quote (") in the first and last postions of the returned string. """ - start, end = self.pos, None + end = None self.pos += 1 # skip opening '"' newlines = [] source_text = self.source_text @@ -846,7 +863,7 @@ def t_String(self, _: re.Match) -> Token: # The below similar to what we do in t_RawBackslash, but is is # different. First, we need to look for a closing quote # ("). Also, after parsing escape sequences, we can - # unconditionallhy add them on to the string. That is, we + # unconditionally add them on to the string. That is, we # don't have to check whether the returned string can be valid # in a Symbol name. @@ -873,7 +890,7 @@ def t_String(self, _: re.Match) -> Token: self.pos += 1 try: escape_str, self.pos = parse_escape_sequence(source_text, self.pos) - except EscapeSyntaxError as escape_error: + except (EscapeSyntaxError, NamedCharacterSyntaxError) as escape_error: self.feeder.message( escape_error.name, escape_error.tag, *escape_error.args ) @@ -884,12 +901,10 @@ def t_String(self, _: re.Match) -> Token: result += self.source_text[self.pos] self.pos += 1 - indices = [start] + newlines + [end] - result = "".join( - self.source_text[indices[i] : indices[i + 1]] - for i in range(len(indices) - 1) - ) - return Token("String", result, start) + # FIXME: rethink whether we really need quotes at the beginning and + # and of a string and redo. 
This will include revising whatever calls + # parser.unescape string(). + return Token("String", f'"{result}"', self.pos) # Call the function that initializes the dictionaries. From 74587ccde9ec19640e84aa02c563b1c4e2a3d42c Mon Sep 17 00:00:00 2001 From: rocky Date: Thu, 29 May 2025 12:47:57 -0400 Subject: [PATCH 21/34] Use git branch for testing Mathics --- .github/workflows/mathics.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mathics.yml b/.github/workflows/mathics.yml index 899075c6..cca93e00 100644 --- a/.github/workflows/mathics.yml +++ b/.github/workflows/mathics.yml @@ -33,7 +33,7 @@ jobs: git clone --depth 1 https://github.com/Mathics3/mathics-scanner.git (cd mathics-scanner && pip install -e .) # Until next Mathics3/mathics-core release is out... - git clone --depth 1 https://github.com/Mathics3/mathics-core.git + git clone --depth 1 --branch revise-escape-sequence-scanning https://github.com/Mathics3/mathics-core.git cd mathics-core/ make PIP_INSTALL_OPTS='[full]' # pip install Mathics3[full] From 25f56720f8ac88278d83523962f9993b45c3bbd4 Mon Sep 17 00:00:00 2001 From: rocky Date: Thu, 29 May 2025 17:09:55 -0400 Subject: [PATCH 22/34] Revise Scanner error exception class TranslateError, TranslateErrorNew, ScanError now become ScannerError --- docs/source/api.rst | 2 +- mathics_scanner/__init__.py | 8 ++------ mathics_scanner/errors.py | 29 +++++++++++------------------ mathics_scanner/escape_sequences.py | 8 ++++---- mathics_scanner/mathics3_tokens.py | 23 ++++++++++++++--------- mathics_scanner/tokeniser.py | 6 +++--- test/test_escape_sequences.py | 4 ++-- test/test_tokeniser.py | 12 ++++++++---- 8 files changed, 45 insertions(+), 47 deletions(-) diff --git a/docs/source/api.rst b/docs/source/api.rst index 7368a54b..c651554d 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -10,7 +10,7 @@ Tokenization Tokenization is performed by the ``Tokeniser`` class. The ``next`` method consumes characters from a feeder and returns a token if the tokenization -succeeds. If the tokenization fails an instance of ``TranslateError`` is +succeeds. If the tokenization fails an instance of ``ScannerError`` is raised. .. autoclass:: Tokeniser(object) diff --git a/mathics_scanner/__init__.py b/mathics_scanner/__init__.py index 5d9def81..af14255f 100644 --- a/mathics_scanner/__init__.py +++ b/mathics_scanner/__init__.py @@ -15,9 +15,7 @@ from mathics_scanner.errors import ( IncompleteSyntaxError, InvalidSyntaxError, - ScanError, - TranslateError, - TranslateErrorNew, + ScannerError, ) from mathics_scanner.feed import ( FileLineFeeder, @@ -36,12 +34,10 @@ "InvalidSyntaxError", "LineFeeder", "MultiLineFeeder", - "ScanError", + "ScannerError", "SingleLineFeeder", # "Token", # "Tokeniser", - "TranslateError", - "TranslateErrorNew", "__version__", "aliased_characters", # "is_symbol_name", diff --git a/mathics_scanner/errors.py b/mathics_scanner/errors.py index 9210878c..13e8e1eb 100644 --- a/mathics_scanner/errors.py +++ b/mathics_scanner/errors.py @@ -1,7 +1,13 @@ # -*- coding: utf-8 -*- -class TranslateErrorNew(Exception): +class ScannerError(Exception): + """Some sort of error in the scanning or tokenization phase parsing Mathics3. + + There are more specific kinds of exceptions subclassed from this + exception class. 
+ """ + def __init__(self, tag: str, *args): super().__init__() self.name = "Syntax" @@ -9,38 +15,25 @@ def __init__(self, tag: str, *args): self.args = args -class TranslateError(Exception): - """ - A generic class of tokenization errors. This exception is subclassed by other - tokenization errors - """ - - -class EscapeSyntaxError(TranslateErrorNew): +class EscapeSyntaxError(ScannerError): """Escape sequence syntax error""" pass -class IncompleteSyntaxError(TranslateErrorNew): +class IncompleteSyntaxError(ScannerError): """More characters were expected to form a valid token""" pass -class InvalidSyntaxError(TranslateErrorNew): +class InvalidSyntaxError(ScannerError): """Invalid syntax""" pass -class NamedCharacterSyntaxError(TranslateErrorNew): +class NamedCharacterSyntaxError(EscapeSyntaxError): """Named character syntax error""" pass - - -class ScanError(TranslateErrorNew): - """A generic scanning error""" - - pass diff --git a/mathics_scanner/escape_sequences.py b/mathics_scanner/escape_sequences.py index 2f86aed0..bfd8f2f9 100644 --- a/mathics_scanner/escape_sequences.py +++ b/mathics_scanner/escape_sequences.py @@ -8,7 +8,7 @@ from mathics_scanner.errors import ( EscapeSyntaxError, NamedCharacterSyntaxError, - ScanError, + ScannerError, ) @@ -19,7 +19,7 @@ def parse_base(source_text: str, start_shift: int, end_shift: int, base: int) -> If so, chr(integer value converted from base) is returnd. - However, if the conversion fails, ScanError is raised. + However, if the conversion fails, ScannerError is raised. """ last = end_shift - start_shift if last == 2: @@ -33,14 +33,14 @@ def parse_base(source_text: str, start_shift: int, end_shift: int, base: int) -> raise ValueError() if end_shift > len(source_text): - raise ScanError("Syntax", tag) + raise ScannerError("Syntax", tag) assert start_shift <= end_shift text = source_text[start_shift:end_shift] try: result = int(text, base) except ValueError: - raise ScanError(tag, source_text[start_shift:].rstrip("\n")) + raise ScannerError(tag, source_text[start_shift:].rstrip("\n")) return chr(result) diff --git a/mathics_scanner/mathics3_tokens.py b/mathics_scanner/mathics3_tokens.py index db6a163c..8403e95e 100644 --- a/mathics_scanner/mathics3_tokens.py +++ b/mathics_scanner/mathics3_tokens.py @@ -10,7 +10,7 @@ from mathics_scanner.errors import ( EscapeSyntaxError, NamedCharacterSyntaxError, - ScanError, + ScannerError, ) from mathics_scanner.feed import FileLineFeeder, LineFeeder, SingleLineFeeder from mathics_scanner.tokeniser import Tokeniser @@ -162,25 +162,30 @@ def interactive_eval_loop(shell: TerminalShell, code_tokenize_format: bool): try: source_text = shell.feed() tokens(source_text, code_tokenize_format) - except ScanError: - shell.errmsg( - "Syntax", - "sntxi", - "Expression error", - ) - pass except NamedCharacterSyntaxError: shell.errmsg( "Syntax", "sntufn", "Unknown unicode longname", ) + # This has to come after NamedCharacterSyntaxError + # since that is a subclass EscapeSyntaxError except EscapeSyntaxError: shell.errmsg( "Syntax", "sntufn", "Unknown unicode longname", ) + # This has to come after NamedCharacterSyntaxError and + # EscapeSyntaxError since those are subclasses of + # ScannerError + except ScannerError: + shell.errmsg( + "Syntax", + "sntxi", + "Expression error", + ) + pass except KeyboardInterrupt: print("\nKeyboardInterrupt. 
Type Ctrl-D (EOF) to exit.") except EOFError: @@ -199,7 +204,7 @@ def tokens(code, code_tokenize_format: bool): while True: try: token = tokeniser.next() - except ScanError as scan_error: + except ScannerError as scan_error: mess = "" if scan_error.tag == "sntoct1": mess = r"3 octal digits are required after \ to construct an 8-bit character" diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index c37ca71c..51385708 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -16,7 +16,7 @@ IncompleteSyntaxError, InvalidSyntaxError, NamedCharacterSyntaxError, - ScanError, + ScannerError, ) from mathics_scanner.escape_sequences import parse_escape_sequence @@ -612,7 +612,7 @@ def next(self) -> Token: # No matching pattern found. if pattern_match is None: tag, pre_str, post_str = self.sntx_message() - raise ScanError(tag, pre_str, post_str) + raise ScannerError(tag, pre_str, post_str) # Look for custom tokenization rules; those are defined with t_tag. override = getattr(self, "t_" + tag, None) @@ -790,7 +790,7 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token: # No matching found. if pattern_match is None: tag, pre, post = self.sntx_message() - raise ScanError(tag, pre, post) + raise ScannerError(tag, pre, post) text = pattern_match.group(0) start_pos = pattern_match.start(0) diff --git a/test/test_escape_sequences.py b/test/test_escape_sequences.py index 1547d066..c164b60f 100644 --- a/test/test_escape_sequences.py +++ b/test/test_escape_sequences.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import pytest -from mathics_scanner.errors import NamedCharacterSyntaxError, ScanError +from mathics_scanner.errors import NamedCharacterSyntaxError, ScannerError from mathics_scanner.escape_sequences import parse_escape_sequence @@ -66,5 +66,5 @@ def test_invalid_number_encoding(): ":A1g9", ":01-2", ): - with pytest.raises(ScanError): + with pytest.raises(ScannerError): parse_escape_sequence(text, 0) diff --git a/test/test_tokeniser.py b/test/test_tokeniser.py index 96ceee8d..f118d2bb 100644 --- a/test/test_tokeniser.py +++ b/test/test_tokeniser.py @@ -9,7 +9,11 @@ import pytest -from mathics_scanner.errors import IncompleteSyntaxError, InvalidSyntaxError, ScanError +from mathics_scanner.errors import ( + IncompleteSyntaxError, + InvalidSyntaxError, + ScannerError, +) from mathics_scanner.feed import SingleLineFeeder from mathics_scanner.tokeniser import Token, Tokeniser, is_symbol_name @@ -34,8 +38,8 @@ def invalid_error(error_message: str): tokens(error_message) -def scan_error(error_message): - with pytest.raises(ScanError): +def scanner_error(error_message): + with pytest.raises(ScannerError): tokens(error_message) @@ -137,7 +141,7 @@ def test_is_symbol(): def test_accuracy(): - scan_error("1.5``") + scanner_error("1.5``") check_number("1.0``20") check_number("1.0``0") check_number("1.4``-20") From e503b3aad7cb5b54979841081cc754b782ee9e4d Mon Sep 17 00:00:00 2001 From: rocky Date: Thu, 29 May 2025 17:18:24 -0400 Subject: [PATCH 23/34] Let's use 3.12 in CI testing it should be just a little bit faster (and it is more modern) --- .github/workflows/mathics.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mathics.yml b/.github/workflows/mathics.yml index cca93e00..17c13e6e 100644 --- a/.github/workflows/mathics.yml +++ b/.github/workflows/mathics.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.11'] + python-version: ['3.12'] steps: - uses: actions/checkout@v4 - 
name: Set up Python ${{ matrix.python-version }}

From e1b27faa54f0530dbaa1b74e3898c087f27d287 Mon Sep 17 00:00:00 2001
From: rocky
Date: Thu, 29 May 2025 17:38:51 -0400
Subject: [PATCH 24/34] Small tidying changes to comments

---
 mathics_scanner/tokeniser.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py
index 51385708..f84b9cb8 100644
--- a/mathics_scanner/tokeniser.py
+++ b/mathics_scanner/tokeniser.py
@@ -459,7 +459,9 @@ def is_symbol_name(text: str) -> bool:
 class Token:
     """A representation of a Wolfram-Language token.

-    Tokens are parsed by the parser; and are used to build M-expressions.
+    A Token is the next level of parsing abstraction above a raw input
+    Mathics input string. A sequence of tokens is the input for the
+    Mathics3 parser.

     A token has a `tag`, the class or type of the token. For example:
     a Number, Symbol, String, File, etc.
@@ -746,7 +748,7 @@ def t_PutAppend(self, pattern_match: re.Match) -> Token:
         return self._token_mode(pattern_match, "PutAppend", "filename")

     def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token:
-        """Break out from ``pattern_match`` tokens which start with \\"""
+        r"""Break out from ``pattern_match`` tokens which start with a backslash, '\'."""
         source_text = self.source_text
         start_pos = self.pos + 1
         named_character = ""

From c440e427e7716003cfb7f51227c038cacab58629 Mon Sep 17 00:00:00 2001
From: rocky
Date: Thu, 29 May 2025 18:23:46 -0400
Subject: [PATCH 25/34] ScannerError -> SyntaxError

Use a more direct and simpler error class name that is more like its
other subclassed errors.
---
 docs/source/api.rst                 | 2 +-
 mathics_scanner/__init__.py         | 4 ++--
 mathics_scanner/errors.py           | 8 ++++----
 mathics_scanner/escape_sequences.py | 8 ++++----
 mathics_scanner/mathics3_tokens.py  | 8 ++++----
 mathics_scanner/tokeniser.py        | 6 +++---
 test/test_escape_sequences.py       | 4 ++--
 test/test_tokeniser.py              | 4 ++--
 8 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/docs/source/api.rst b/docs/source/api.rst
index c651554d..cb02ec89 100644
--- a/docs/source/api.rst
+++ b/docs/source/api.rst
@@ -10,7 +10,7 @@ Tokenization

 Tokenization is performed by the ``Tokeniser`` class. The ``next`` method
 consumes characters from a feeder and returns a token if the tokenization
-succeeds. If the tokenization fails an instance of ``ScannerError`` is
+succeeds. If the tokenization fails an instance of ``SyntaxError`` is
 raised.

 .. autoclass:: Tokeniser(object)
diff --git a/mathics_scanner/__init__.py b/mathics_scanner/__init__.py
index af14255f..ecfec276 100644
--- a/mathics_scanner/__init__.py
+++ b/mathics_scanner/__init__.py
@@ -15,7 +15,7 @@
 from mathics_scanner.errors import (
     IncompleteSyntaxError,
     InvalidSyntaxError,
-    ScannerError,
+    SyntaxError,
 )
 from mathics_scanner.feed import (
     FileLineFeeder,
@@ -34,7 +34,7 @@
     "InvalidSyntaxError",
     "LineFeeder",
     "MultiLineFeeder",
-    "ScannerError",
+    "SyntaxError",
     "SingleLineFeeder",
     # "Token",
     # "Tokeniser",
diff --git a/mathics_scanner/errors.py b/mathics_scanner/errors.py
index 13e8e1eb..98b9c169 100644
--- a/mathics_scanner/errors.py
+++ b/mathics_scanner/errors.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-


-class ScannerError(Exception):
+class SyntaxError(Exception):
     """Some sort of error in the scanning or tokenization phase parsing Mathics3.
There are more specific kinds of exceptions subclassed from this @@ -15,19 +15,19 @@ def __init__(self, tag: str, *args): self.args = args -class EscapeSyntaxError(ScannerError): +class EscapeSyntaxError(SyntaxError): """Escape sequence syntax error""" pass -class IncompleteSyntaxError(ScannerError): +class IncompleteSyntaxError(SyntaxError): """More characters were expected to form a valid token""" pass -class InvalidSyntaxError(ScannerError): +class InvalidSyntaxError(SyntaxError): """Invalid syntax""" pass diff --git a/mathics_scanner/escape_sequences.py b/mathics_scanner/escape_sequences.py index bfd8f2f9..44295027 100644 --- a/mathics_scanner/escape_sequences.py +++ b/mathics_scanner/escape_sequences.py @@ -8,7 +8,7 @@ from mathics_scanner.errors import ( EscapeSyntaxError, NamedCharacterSyntaxError, - ScannerError, + SyntaxError, ) @@ -19,7 +19,7 @@ def parse_base(source_text: str, start_shift: int, end_shift: int, base: int) -> If so, chr(integer value converted from base) is returnd. - However, if the conversion fails, ScannerError is raised. + However, if the conversion fails, SyntaxError is raised. """ last = end_shift - start_shift if last == 2: @@ -33,14 +33,14 @@ def parse_base(source_text: str, start_shift: int, end_shift: int, base: int) -> raise ValueError() if end_shift > len(source_text): - raise ScannerError("Syntax", tag) + raise SyntaxError("Syntax", tag) assert start_shift <= end_shift text = source_text[start_shift:end_shift] try: result = int(text, base) except ValueError: - raise ScannerError(tag, source_text[start_shift:].rstrip("\n")) + raise SyntaxError(tag, source_text[start_shift:].rstrip("\n")) return chr(result) diff --git a/mathics_scanner/mathics3_tokens.py b/mathics_scanner/mathics3_tokens.py index 8403e95e..d6b0467a 100644 --- a/mathics_scanner/mathics3_tokens.py +++ b/mathics_scanner/mathics3_tokens.py @@ -10,7 +10,7 @@ from mathics_scanner.errors import ( EscapeSyntaxError, NamedCharacterSyntaxError, - ScannerError, + SyntaxError, ) from mathics_scanner.feed import FileLineFeeder, LineFeeder, SingleLineFeeder from mathics_scanner.tokeniser import Tokeniser @@ -178,8 +178,8 @@ def interactive_eval_loop(shell: TerminalShell, code_tokenize_format: bool): ) # This has to come after NamedCharacterSyntaxError and # EscapeSyntaxError since those are subclasses of - # ScannerError - except ScannerError: + # SyntaxError + except SyntaxError: shell.errmsg( "Syntax", "sntxi", @@ -204,7 +204,7 @@ def tokens(code, code_tokenize_format: bool): while True: try: token = tokeniser.next() - except ScannerError as scan_error: + except SyntaxError as scan_error: mess = "" if scan_error.tag == "sntoct1": mess = r"3 octal digits are required after \ to construct an 8-bit character" diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index f84b9cb8..9d508aef 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -16,7 +16,7 @@ IncompleteSyntaxError, InvalidSyntaxError, NamedCharacterSyntaxError, - ScannerError, + SyntaxError, ) from mathics_scanner.escape_sequences import parse_escape_sequence @@ -614,7 +614,7 @@ def next(self) -> Token: # No matching pattern found. if pattern_match is None: tag, pre_str, post_str = self.sntx_message() - raise ScannerError(tag, pre_str, post_str) + raise SyntaxError(tag, pre_str, post_str) # Look for custom tokenization rules; those are defined with t_tag. 
override = getattr(self, "t_" + tag, None) @@ -792,7 +792,7 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token: # No matching found. if pattern_match is None: tag, pre, post = self.sntx_message() - raise ScannerError(tag, pre, post) + raise SyntaxError(tag, pre, post) text = pattern_match.group(0) start_pos = pattern_match.start(0) diff --git a/test/test_escape_sequences.py b/test/test_escape_sequences.py index c164b60f..5c1f6f9d 100644 --- a/test/test_escape_sequences.py +++ b/test/test_escape_sequences.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import pytest -from mathics_scanner.errors import NamedCharacterSyntaxError, ScannerError +from mathics_scanner.errors import NamedCharacterSyntaxError, SyntaxError from mathics_scanner.escape_sequences import parse_escape_sequence @@ -66,5 +66,5 @@ def test_invalid_number_encoding(): ":A1g9", ":01-2", ): - with pytest.raises(ScannerError): + with pytest.raises(SyntaxError): parse_escape_sequence(text, 0) diff --git a/test/test_tokeniser.py b/test/test_tokeniser.py index f118d2bb..11a6de06 100644 --- a/test/test_tokeniser.py +++ b/test/test_tokeniser.py @@ -12,7 +12,7 @@ from mathics_scanner.errors import ( IncompleteSyntaxError, InvalidSyntaxError, - ScannerError, + SyntaxError, ) from mathics_scanner.feed import SingleLineFeeder from mathics_scanner.tokeniser import Token, Tokeniser, is_symbol_name @@ -39,7 +39,7 @@ def invalid_error(error_message: str): def scanner_error(error_message): - with pytest.raises(ScannerError): + with pytest.raises(SyntaxError): tokens(error_message) From 5fce8a0237b57868e6e88c20281c6ab07f45f125 Mon Sep 17 00:00:00 2001 From: rocky Date: Thu, 29 May 2025 18:32:07 -0400 Subject: [PATCH 26/34] More tests --- test/test_tokeniser.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/test/test_tokeniser.py b/test/test_tokeniser.py index 11a6de06..52537190 100644 --- a/test/test_tokeniser.py +++ b/test/test_tokeniser.py @@ -10,6 +10,7 @@ import pytest from mathics_scanner.errors import ( + EscapeSyntaxError, IncompleteSyntaxError, InvalidSyntaxError, SyntaxError, @@ -28,6 +29,11 @@ def check_symbol(source_code: str): assert token, Token("Symbol", source_code, 0) +def escape_syntax_error(error_message: str): + with pytest.raises(EscapeSyntaxError): + tokens(error_message) + + def incomplete_error(error_message: str): with pytest.raises(IncompleteSyntaxError): tokens(error_message) @@ -94,12 +100,8 @@ def test_association(): ] -@pytest.mark.skip("Backslash needs to be hanndled outside of prescanner") def test_backslash(): - assert tokens("\\[Backslash]") == [Token("Backslash", "\u2216", 0)] - - assert tokens("\\ a") == [Token("RawBackslash", "\\", 0), Token("Symbol", "a", 2)] - + assert tokens(r"\[Backslash]") == [Token("Backslash", "\u2216", 0)] incomplete_error("\\") From a568063244af5b088a5c8a008fb802a27d677aba Mon Sep 17 00:00:00 2001 From: rocky Date: Thu, 29 May 2025 21:57:00 -0400 Subject: [PATCH 27/34] One more escape test --- test/test_escape_sequences.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/test_escape_sequences.py b/test/test_escape_sequences.py index 5c1f6f9d..2c0726c8 100644 --- a/test/test_escape_sequences.py +++ b/test/test_escape_sequences.py @@ -11,6 +11,14 @@ def test_escape_sequences(): ("\\\\", 0, 1, "\\", "backslash"), ("abc \\\\", 5, 6, "\\", "backslash at end"), ("abc \\\\n", 5, 6, "\\", "backslash in middle"), + ( + r"\ +abc", + 1, + 2, + "\n", + "backslashed at end of line", + ), # Octal (r"050", 0, 3, chr(0o50), "character at 
beginning"), (r"a\051", 2, 5, chr(0o51), "Octal character in middle"), From 36d85a7a4b3ce7b5ecf102e62c62d60882d16cd8 Mon Sep 17 00:00:00 2001 From: rocky Date: Sat, 31 May 2025 07:57:49 -0400 Subject: [PATCH 28/34] Allow escape space "\ " + more string tests --- mathics_scanner/escape_sequences.py | 4 +- test/test_string_tokens.py | 68 ++++++++++++++++++----------- 2 files changed, 46 insertions(+), 26 deletions(-) diff --git a/mathics_scanner/escape_sequences.py b/mathics_scanner/escape_sequences.py index 44295027..00b2cb1d 100644 --- a/mathics_scanner/escape_sequences.py +++ b/mathics_scanner/escape_sequences.py @@ -122,9 +122,11 @@ def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]: # Note that these are a similer to Python, but are different. # In particular, Python defines "\a" to be ^G (control G), # but in WMA, this is invalid. - elif c in "ntbfr$\n": + elif c in "ntbfr $\n": if c in "n\n": result += "\n" + elif c == " ": + result += " " elif c == "t": result += "\t" elif c == "b": diff --git a/test/test_string_tokens.py b/test/test_string_tokens.py index e42a5bee..6686ec0c 100644 --- a/test/test_string_tokens.py +++ b/test/test_string_tokens.py @@ -12,10 +12,19 @@ from mathics_scanner.tokeniser import Token, Tokeniser -def check_string(source_text, expected_text: str, message: Optional[str] = ""): +def check_string( + source_text, + expected_text: str, + message: Optional[str] = "", + expected_tag: Optional[str] = None, +): token = single_token(source_text) assert token is not None - assert token.tag == "String" + + if expected_tag is None: + expected_tag = "String" + assert token.tag == expected_tag + if message: assert token.text == expected_text, message else: @@ -36,7 +45,7 @@ def escape_scan_error(s: str, failure_msg: str): assert excinfo, failure_msg -def single_token(source_text) -> Token: +def single_token(source_text: str) -> Token: tokens = get_tokens(source_text) assert len(tokens) == 1 token = tokens[0] @@ -56,23 +65,24 @@ def get_tokens(source_text: str): def test_string(): - # # Number conversions for binary, octal, hexadecimal - # check_string(r'"\\c"', '"\\c"', "escaped backslash at beginning of string") - # check_string(r'"a\\b"', r'"a\b"', "escaped backslash") - # check_string(r'"\102"', '"B"', "Octal number test") - # check_string(r'"q\.b4"', '"q´"') - - # # All valid ASCII-like control escape sequences - # for escape_string in ("\b", "\f", "\n", "\r", "\t"): - # check_string(f'"a{escape_string}"', f'"a{escape_string}"') - - # check_string(r'"abc"', r'"abc"') - # check_string(r'"abc(*def*)"', r'"abc(*def*)"') - # # check_string(r'"a\"b\\c"', r'"a\\"b\c"') - # incomplete_error(r'"abc', "String does not have terminating quote") - # incomplete_error(r'"\"', "Unterminated escape sequence") - escape_scan_error(r'"a\g"', "Unknown string escape \\g") + # Number conversions for binary, octal, hexadecimal + check_string(r'"a\\b"', r'"a\b"', "escaped backslash in a string") + check_string(r'"\102"', '"B"', "Octal number test in a string") + check_string(r'"q\.b4"', '"q´"', "2-digit hexadecimal number in a string") + + check_string(r'"\\c"', '"\\c"', "escaped backslash at beginning of string") + + # All valid ASCII-like control escape sequences + for escape_string in ("\b", "\f", "\n", "\r", "\t"): + check_string(f'"a{escape_string}"', f'"a{escape_string}"') + + check_string(r'"\ abc"', '" abc"', "Escaped space in a string is valid") + check_string(r'"abc(*def*)"', r'"abc(*def*)"') + # check_string(r'"a\"b\\c"', r'"a\\"b\c"') + 
incomplete_error(r'"abc', "String does not have terminating quote") + incomplete_error(r'"\"', "Unterminated escape sequence") + escape_scan_error(r'"a\g"', "Unknown string escape \\g") escape_scan_error(r'"a\X"', '"X" is not a valid escape character') @@ -84,13 +94,17 @@ def test_octal(): check_string(r'"a\050"', r'"a("', "Octal '(' in string") check_string(r'"a\051"', r'"a)"', "Octal ')' in string") check_string(r'"a\052"', r'"a*"', "Octal '*' in string") - # FIXME: add tests ouside of string def test_hexadecimal_dot(): check_string(r'"\.30"', '"0"', "2-digit hexadecimal ASCII number 0") check_string(r'"\.42"', '"B"', "2-digit hexadecimal ASCII capital B") - # FIXME: add tests ouside of string + check_string( + r"\.42\.30", + "B0", + "hexademimal encoding of identifier in expression context", + "Symbol", + ) def test_hexadecimal_colon(): @@ -101,13 +115,17 @@ def test_hexadecimal_colon(): ) check_string( r'"\:03b8"', - '"\u03B8"', + '"\u03b8"', "4-digit hexadecimal number test with lowercase alpha lettter", ) check_string(r'"\:0030"', '"0"') - # FIXME: - # check_string(r"\:03b8", "\u03B8", "4-digit hexadecimal number test with lowercase alpha lettter") + check_string( + r"\:03b8", + "\u03b8", + "4-digit hexadecimal number test with lowercase alpha letter", + "Symbol", + ) def test_hexadecimal_vbar(): - check_string(r'"\|01D451"', '"\U0001D451"') + check_string(r'"\|01D451"', '"\U0001d451"') From 00cbb48930dc0bc338cfe16f2c18a8c19716bb26 Mon Sep 17 00:00:00 2001 From: rocky Date: Sat, 31 May 2025 09:31:06 -0400 Subject: [PATCH 29/34] Start unit test for comments --- test/test_string_tokens.py | 4 +++ test/test_tokeniser.py | 50 +++++++++++++++++++++++++------------- 2 files changed, 37 insertions(+), 17 deletions(-) diff --git a/test/test_string_tokens.py b/test/test_string_tokens.py index 6686ec0c..3e0a6de9 100644 --- a/test/test_string_tokens.py +++ b/test/test_string_tokens.py @@ -65,6 +65,10 @@ def get_tokens(source_text: str): def test_string(): + # Plain strings + check_string('""', '""', "Null string") + check_string('"abc"', '"abc"', "Simple sequence") + # Number conversions for binary, octal, hexadecimal check_string(r'"a\\b"', r'"a\b"', "escaped backslash in a string") check_string(r'"\102"', '"B"', "Octal number test in a string") diff --git a/test/test_tokeniser.py b/test/test_tokeniser.py index 52537190..46a96cb6 100644 --- a/test/test_tokeniser.py +++ b/test/test_tokeniser.py @@ -72,6 +72,13 @@ def tokens(source_code) -> List[Token]: return tokens +def test_accuracy(): + scanner_error("1.5``") + check_number("1.0``20") + check_number("1.0``0") + check_number("1.4``-20") + + def test_apply(): assert tokens("f // x") == [ Token("Symbol", "f", 0), @@ -113,6 +120,30 @@ def test_boxes(): ] +def test_comments(): + assert tokens("(**)") == [], "empty comment" + assert tokens("(**)1") == [ + Token("Number", "1", 4) + ], "empty comment with trailing text" + assert tokens("1(*2*)") == [ + Token("Number", "1", 0) + ], "empty comment with leading text" + assert tokens("1 (*2*)") == [ + Token("Number", "1", 0) + ], "empty comment with leading text and space" + assert tokens("(* A (* nested comment *) *)") == [], "A nested comment" + assert tokens(r"(* A \[theta] *)") == [], "Comment with valid escape sequence" + assert tokens(r"(* A \[unknown] *)") == [], "Comment with invalid escape sequence" + + +def test_function(): + assert tokens("x&") == [Token("Symbol", "x", 0), Token("Function", "&", 1)] + assert tokens("x\uf4a1") == [ + Token("Symbol", "x", 0), + Token("Function", "\uf4a1", 1), + 
+    ]
+
+
 def test_information():
     assert tokens("??Sin") == [Token("Information", "??", 0), Token("Symbol", "Sin", 2)]
@@ -129,8 +160,8 @@ def test_int_repeated():


 def test_integeral():
-    assert tokens("\u222B x \uf74c y") == [
-        Token("Integral", "\u222B", 0),
+    assert tokens("\u222b x \uf74c y") == [
+        Token("Integral", "\u222b", 0),
         Token("Symbol", "x", 2),
         Token("DifferentialD", "\uf74c", 4),
         Token("Symbol", "y", 6),
@@ -142,13 +173,6 @@ def test_is_symbol():
     assert not is_symbol_name("98")  # symbols can't start with numbers


-def test_accuracy():
-    scanner_error("1.5``")
-    check_number("1.0``20")
-    check_number("1.0``0")
-    check_number("1.4``-20")
-
-
 def test_number():
     assert tags("1.5") == ["Number"]
     assert tags("1.5*^10") == ["Number"]
@@ -227,11 +251,3 @@ def test_unset():
     assert tokens("= .") == [Token("Unset", "= .", 0)]
     assert tokens("=.5") == [Token("Set", "=", 0), Token("Number", ".5", 1)]
     assert tokens("= ..") == [Token("Set", "=", 0), Token("Repeated", "..", 2)]
-
-
-def test_function():
-    assert tokens("x&") == [Token("Symbol", "x", 0), Token("Function", "&", 1)]
-    assert tokens("x\uf4a1") == [
-        Token("Symbol", "x", 0),
-        Token("Function", "\uf4a1", 1),
-    ]

From 2422c6076e75f6d2c405d409277a34d0f1dbf8e4 Mon Sep 17 00:00:00 2001
From: rocky
Date: Sat, 31 May 2025 11:31:27 -0400
Subject: [PATCH 30/34] Fix a doc spelling typo + minor doc tweak

---
 mathics_scanner/escape_sequences.py | 2 +-
 mathics_scanner/tokeniser.py        | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/mathics_scanner/escape_sequences.py b/mathics_scanner/escape_sequences.py
index 00b2cb1d..633b9ec9 100644
--- a/mathics_scanner/escape_sequences.py
+++ b/mathics_scanner/escape_sequences.py
@@ -57,7 +57,7 @@ def parse_named_character(source_text: str, start: int, finish: int) -> Optional
     e.g. "Theta". If we can match this, then we return the unicode
     equivalent from the `named_characters` map (which is read in from
     JSON but stored in a YAML file).
-    If we can't find the named character, rasie NamedCharacterSyntaxError.
+    If we can't find the named character, raise NamedCharacterSyntaxError.
     """
     named_character = source_text[start:finish]
     if named_character.isalpha():
diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py
index 9d508aef..54f58cbe 100644
--- a/mathics_scanner/tokeniser.py
+++ b/mathics_scanner/tokeniser.py
@@ -459,8 +459,8 @@ def is_symbol_name(text: str) -> bool:
 class Token:
     """A representation of a Wolfram-Language token.

-    A Token is the next level of parsing abstraction above a raw input
-    Mathics input string. A sequence of tokens is the input for the
+    A Token is the next level of parsing abstraction above a raw
+    Mathics3 input string. A sequence of tokens is the input for the
     Mathics3 parser.

     A token has a `tag`, the class or type of the token. For example:
@@ -469,8 +469,7 @@ class Token:

     The token's `text` is the string contents of the token.

     The token's `pos` is the integer starting offset where
-    `text` can be found inside the input string. The input string
-    is not part of the token though.
+    `text` can be found inside the full input string.
     """

     def __init__(self, tag: str, text: str, pos: int):

From 7582e6b0685f41a0c2bed596130ccd2255cfe2b8 Mon Sep 17 00:00:00 2001
From: rocky
Date: Sun, 1 Jun 2025 07:08:52 -0400
Subject: [PATCH 31/34] invalid escape sequences inside strings...

An invalid escape sequence inside a string, like "\(a \+\)", is not an
error. Instead, the sequence is kept the same, e.g. "\(a \+\)".
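
As a sketch of what this means at the test level, extra cases like the
following could sit in test/test_string_tokens.py. This is illustrative
only: it reuses the check_string() helper with the (input, expected,
message) calling convention seen elsewhere in this series, and these
particular inputs are assumptions, not tests added by this patch:

    def test_invalid_escape_preserved_in_string():
        # Sketch only: boxing-style escapes inside a string are kept
        # verbatim instead of raising an escape-sequence error.
        check_string(r'"\(a \+\)"', r'"\(a \+\)"', "box escapes kept verbatim")
        # The token text round-trips unchanged, backslashes included.
        check_string(r'"\(x\)"', r'"\(x\)"', "box group kept verbatim")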
---
 mathics_scanner/tokeniser.py | 13 +++++++++++--
 test/test_string_tokens.py  | 12 +++++++++---
 2 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py
index 54f58cbe..6c94c140 100644
--- a/mathics_scanner/tokeniser.py
+++ b/mathics_scanner/tokeniser.py
@@ -891,13 +891,22 @@ def t_String(self, _: Optional[re.Match]) -> Token:
                 self.pos += 1
                 try:
                     escape_str, self.pos = parse_escape_sequence(source_text, self.pos)
-                except (EscapeSyntaxError, NamedCharacterSyntaxError) as escape_error:
+                except NamedCharacterSyntaxError as escape_error:
                     self.feeder.message(
                         escape_error.name, escape_error.tag, *escape_error.args
                     )
                     raise
-                result += escape_str
+                # This has to come after NamedCharacterSyntaxError since
+                # that is a subclass of this
+                except EscapeSyntaxError:
+                    # If there is an invalid escape character inside a string,
+                    # we preserve what was given.
+                    result += "\\" + self.source_text[self.pos]
+                    self.pos += 1
+
+                else:
+                    result += escape_str
             else:
                 result += self.source_text[self.pos]
                 self.pos += 1
diff --git a/test/test_string_tokens.py b/test/test_string_tokens.py
index 3e0a6de9..c9132a7e 100644
--- a/test/test_string_tokens.py
+++ b/test/test_string_tokens.py
@@ -82,12 +82,18 @@ def test_string():
     check_string(r'"\ abc"', '" abc"', "Escaped space in a string is valid")
     check_string(r'"abc(*def*)"', r'"abc(*def*)"')
-    # check_string(r'"a\"b\\c"', r'"a\\"b\c"')
+
+    check_string(
+        r'"\(a \+\)"',
+        r'"\(a \+\)"',
+        "Do not interpret, but preserve boxing inside a string",
+    )
+
     incomplete_error(r'"abc', "String does not have terminating quote")
     incomplete_error(r'"\"', "Unterminated escape sequence")
-    escape_scan_error(r'"a\g"', "Unknown string escape \\g")
-    escape_scan_error(r'"a\X"', '"X" is not a valid escape character')
+    # escape_scan_error(r'"a\g"', "Unknown string escape \\g")
+    # escape_scan_error(r'"a\X"', '"X" is not a valid escape character')

     # https://www.wolfram.com/language/12/networking-and-system-operations/use-the-full-range-of-unicode-characters.html

From a49e453398bd6b6985211e042afc94f065230df1 Mon Sep 17 00:00:00 2001
From: rocky
Date: Sun, 1 Jun 2025 09:15:13 -0400
Subject: [PATCH 32/34] Escape sequences in strings, yet again...

If the escape sequence in a string can be a boxing construct, then
this is not an error in the escape sequence. Otherwise, it is.

For example "\(" is not an error in a string while "\g" is.

Yes, this is a bit involved. But that's the way WA works.

---
 mathics_scanner/tokeniser.py | 65 ++++++++++++++++++++++++++++--------
 test/test_string_tokens.py  |  4 +--
 2 files changed, 54 insertions(+), 15 deletions(-)

diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py
index 6c94c140..a3198ad8 100644
--- a/mathics_scanner/tokeniser.py
+++ b/mathics_scanner/tokeniser.py
@@ -8,7 +8,7 @@
 import os.path as osp
 import re
 import string
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional, Set, Tuple

 from mathics_scanner.characters import _letterlikes, _letters
 from mathics_scanner.errors import (
@@ -25,17 +25,38 @@
 except ImportError:
     import json as ujson  # type: ignore[no-redef]

-OPERATOR_DATA = {}
+# Where we get operator data...
 ROOT_DIR = osp.dirname(__file__)
 OPERATORS_TABLE_PATH = osp.join(ROOT_DIR, "data", "operators.json")
+
+##############################################
+# The below are initialized by init_module()
+# from operator data
+##############################################
+OPERATOR_DATA = {}
 NO_MEANING_OPERATORS = {}
+# Set of the final characters of "box-operators" values.
+# This is used in t_String for escape-sequence handling.
+BOXING_CONSTRUCT_SUFFIXES: Set[str] = {
+    "%",
+    "/",
+    "@",
+    "+",
+    "_",
+    "&",
+    "!",
+    "^",
+    "`",
+    "(",
+    ")",
+}
+
 FILENAME_TOKENS: List = []
 TOKENS: List[Tuple] = []
 TOKEN_INDICES: Dict = {}
-
+##############################################
 # special patterns
 NUMBER_PATTERN = r"""
 ( (?# Two possible forms depending on whether base is specified)
@@ -62,7 +83,7 @@
 #
 # This could still be done, but it would need to be integrated more
 # properly into the tokenization phase which takes into account
-# differents states or "modes" indicating the interior of comments,
+# different states or "modes" indicating the interior of comments,
 # strings, files, and Box-like constructs.

 # The leading character of a Symbol:
@@ -172,6 +193,12 @@ def init_module():
     with open(osp.join(OPERATORS_TABLE_PATH), "r", encoding="utf8") as operator_f:
         OPERATOR_DATA.update(ujson.load(operator_f))

+    global BOXING_CONSTRUCT_SUFFIXES
+
+    BOXING_CONSTRUCT_SUFFIXES = set(
+        [op_str[-1] for op_str in OPERATOR_DATA["box-operators"].values()]
+    ) | set([")", "("])
+
     global NO_MEANING_OPERATORS
     NO_MEANING_OPERATORS = (
         set(OPERATOR_DATA["no-meaning-infix-operators"].keys())
@@ -853,7 +880,7 @@ def t_RawBackslash(self, pattern_match: Optional[re.Match]) -> Token:
     def t_String(self, _: Optional[re.Match]) -> Token:
         """Break out from self.source_text the next token which is expected
         to be a String. The string value of the returned token will have
         double quote (") in the first and last
-        postions of the returned string.
+        positions of the returned string.
         """
         end = None
         self.pos += 1  # skip opening '"'
@@ -866,7 +893,7 @@ ...
         # ("). Also, after parsing escape sequences, we can
         # unconditionally add them on to the string. That is, we
         # don't have to check whether the returned string can be valid
-        # in a Symbol name.
+        # in a Symbol name or as a boxing construct

         while True:
             if self.pos >= len(source_text):
@@ -898,12 +925,24 @@ ...
                     raise

                 # This has to come after NamedCharacterSyntaxError since
-                # that is a subclass of this
-                except EscapeSyntaxError:
-                    # If there is an invalid escape character inside a string,
-                    # we preserve what was given.
-                    result += "\\" + self.source_text[self.pos]
-                    self.pos += 1
+                # that is a subclass of this.
+                except EscapeSyntaxError as escape_error:
+                    escaped_char = self.source_text[self.pos]
+                    if escaped_char in BOXING_CONSTRUCT_SUFFIXES:
+                        # If a boxing construct is matched, we
+                        # preserve what was given, but do not tokenize
+                        # the construct. "\(" remains "\(" and is not
+                        # turned into "InterpretBox".
+                        result += "\\" + escaped_char
+                        self.pos += 1
+                    else:
+                        # Not something that can be a boxing construct.
+                        # So here, we'll report an error as we do with
+                        # NamedCharacterSyntaxError.
+                        self.feeder.message(
+                            escape_error.name, escape_error.tag, *escape_error.args
+                        )
+                        raise

                 else:
                     result += escape_str
diff --git a/test/test_string_tokens.py b/test/test_string_tokens.py
index c9132a7e..193be610 100644
--- a/test/test_string_tokens.py
+++ b/test/test_string_tokens.py
@@ -92,8 +92,8 @@ def test_string():
     incomplete_error(r'"abc', "String does not have terminating quote")
     incomplete_error(r'"\"', "Unterminated escape sequence")
-    # escape_scan_error(r'"a\g"', "Unknown string escape \\g")
-    # escape_scan_error(r'"a\X"', '"X" is not a valid escape character')
+    escape_scan_error(r'"a\g"', "Unknown string escape \\g")
+    escape_scan_error(r'"a\X"', '"X" is not a valid escape character')

     # https://www.wolfram.com/language/12/networking-and-system-operations/use-the-full-range-of-unicode-characters.html

From 1d10b185edb6a59909f8af71ed884f212865f19d Mon Sep 17 00:00:00 2001
From: rocky
Date: Sun, 1 Jun 2025 10:24:43 -0400
Subject: [PATCH 33/34] Add LineSeparator, and \*

Also, flatten values in box operators for BOXING_CONSTRUCT_SUFFIXES

---
 mathics_scanner/data/named-characters.yml | 12 +++++++++++-
 mathics_scanner/tokeniser.py              | 13 +++++++++++--
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/mathics_scanner/data/named-characters.yml b/mathics_scanner/data/named-characters.yml
index bcca13c2..e94d6c02 100644
--- a/mathics_scanner/data/named-characters.yml
+++ b/mathics_scanner/data/named-characters.yml
@@ -68,7 +68,7 @@
 #   the named character. If it is the same as unicode-equivalent
 #   it should be omitted
 #
-# wl-unicode-name: The name of the character corresponding to `wl-unicode`, if it exists. If it is the same as unicode-equivalent-name it can be omitted.
+# wl-unicode-name: The name of the character corresponding to `wl-unicode`, if it exists.
 #   It will mentioned in Wolfram Language docs if it exists.
 #
 # Sources:
@@ -6628,6 +6628,16 @@ LightBulb:
   wl-reference: https://reference.wolfram.com/language/ref/character/LightBulb.html
   wl-unicode: "\uF723"

+LineSeparator:
+  has-unicode-inverse: false
+  is-letter-like: false
+  unicode-equivalent: "\u2028"
+  unicode-equivalent-name: LINE SEPARATOR
+  unicode-reference: https://www.compart.com/en/unicode/U+2028
+  wl-reference: https://reference.wolfram.com/language/ref/character/LineSeparator.html
+  wl-unicode: "\u2028"
+  wl-unicode-name: LINE SEPARATOR
+
 LongDash:
   esc-alias: --
   has-unicode-inverse: false
diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py
index a3198ad8..1698ad4c 100644
--- a/mathics_scanner/tokeniser.py
+++ b/mathics_scanner/tokeniser.py
@@ -5,6 +5,7 @@ See classes `Token` and `Tokeniser` .
 """
+import itertools
 import os.path as osp
 import re
 import string
@@ -38,6 +39,8 @@
 # Set of the final characters of "box-operators" values.
 # This is used in t_String for escape-sequence handling.
+# The below is roughly correct, but we overwrite this
+# from operators.json data in init_module()
 BOXING_CONSTRUCT_SUFFIXES: Set[str] = {
     "%",
     "/",
     "@",
     "+",
     "_",
     "&",
     "!",
     "^",
     "`",
+    "*",
     "(",
     ")",
 }
@@ -196,8 +200,13 @@ def init_module():
     with open(osp.join(OPERATORS_TABLE_PATH), "r", encoding="utf8") as operator_f:
         OPERATOR_DATA.update(ujson.load(operator_f))

     global BOXING_CONSTRUCT_SUFFIXES

     BOXING_CONSTRUCT_SUFFIXES = set(
-        [op_str[-1] for op_str in OPERATOR_DATA["box-operators"].values()]
-    ) | set([")", "("])
+        [
+            op_str[-1]
+            for op_str in itertools.chain.from_iterable(
+                OPERATOR_DATA["box-operators"].values()
+            )
+        ]
+    ) | set(["*", ")", "("])

     global NO_MEANING_OPERATORS
     NO_MEANING_OPERATORS = (

From 0f0418d32d7ba4966ec7025ebd87281a133e470d Mon Sep 17 00:00:00 2001
From: rocky
Date: Tue, 3 Jun 2025 08:51:00 -0400
Subject: [PATCH 34/34] Remove duplicate test

---
 test/test_string_tokens.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/test/test_string_tokens.py b/test/test_string_tokens.py
index 193be610..84660148 100644
--- a/test/test_string_tokens.py
+++ b/test/test_string_tokens.py
@@ -123,11 +123,6 @@ def test_hexadecimal_colon():
         '"θ"',
         "4-digit hexadecimal number test with uppercase alpha letter",
     )
-    check_string(
-        r'"\:03b8"',
-        '"\u03b8"',
-        "4-digit hexadecimal number test with lowercase alpha lettter",
-    )
     check_string(r'"\:0030"', '"0"')
     check_string(
         r"\:03b8",