Merged
34 commits
44c6e05  Reduce prescanner use (rocky, Apr 11, 2025)
ae4aa63  Test workarounds.. for now. (rocky, Apr 12, 2025)
1248255  Isolate tokenizing escape sequences (rocky, Apr 12, 2025)
95bd105  Split out escape_sequence parsing. (rocky, Apr 13, 2025)
f1a06e1  Handle escape sequences outside of strings. (rocky, Apr 14, 2025)
f6846a2  Remove prescanner and .. (rocky, Apr 14, 2025)
ccfe943  Rename some variables (rocky, Apr 14, 2025)
3d0a2f7  Bang more on mathics3-tokens (rocky, Apr 14, 2025)
1c03e8b  Start going over error messages... (rocky, Apr 15, 2025)
3c1b977  Improve error handling... (rocky, Apr 17, 2025)
ded8885  Improve scanner... (rocky, May 14, 2025)
41fdc74  Handle EscapSequence errors better (rocky, May 16, 2025)
fa9b1a9  Handle embedded escape sequences in Symbols... (rocky, May 17, 2025)
8c582f5  WIP - bang on Symbol tokenization with backslash (rocky, May 18, 2025)
c1c015c  Be able to whether we are in a RowBox (rocky, May 18, 2025)
68346c0  Handle no-meaning operators (rocky, May 19, 2025)
3fe6a2b  WIP misc fixes... (rocky, May 19, 2025)
1719292  Better Symbol-name extension test... (rocky, May 19, 2025)
42a3e8d  WIP - small tweaks before moving master forward (rocky, May 20, 2025)
9c596be  Small bugs related to escape-character handling (rocky, May 29, 2025)
74587cc  Use git branch for testing Mathics (rocky, May 29, 2025)
25f5672  Revise Scanner error exception class (rocky, May 29, 2025)
e503b3a  Let's use 3.12 in CI testing (rocky, May 29, 2025)
e1b27fa  Small tidying changes to comments (rocky, May 29, 2025)
c440e42  ScannerError -> SyntaxError (rocky, May 29, 2025)
5fce8a0  More tests (rocky, May 29, 2025)
a568063  One more escape test (rocky, May 30, 2025)
36d85a7  Allow escape space "\ " + more string tests (rocky, May 31, 2025)
00cbb48  Start unit test for comments (rocky, May 31, 2025)
2422c60  Fix a doc spelling typo + minor doc tweak (rocky, May 31, 2025)
7582e6b  invalid escape sequences inside strings... (rocky, Jun 1, 2025)
a49e453  Escape sequences in strings, yet again... (rocky, Jun 1, 2025)
1d10b18  Add LineSeparator, and \* (rocky, Jun 1, 2025)
0f0418d  Remove duplicate test (rocky, Jun 3, 2025)
4 changes: 2 additions & 2 deletions .github/workflows/mathics.yml
@@ -11,7 +11,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.11']
python-version: ['3.12']
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
@@ -33,7 +33,7 @@ jobs:
git clone --depth 1 https://github.com/Mathics3/mathics-scanner.git
(cd mathics-scanner && pip install -e .)
# Until next Mathics3/mathics-core release is out...
git clone --depth 1 https://github.com/Mathics3/mathics-core.git
git clone --depth 1 --branch revise-escape-sequence-scanning https://github.com/Mathics3/mathics-core.git
cd mathics-core/
make PIP_INSTALL_OPTS='[full]'
# pip install Mathics3[full]
2 changes: 1 addition & 1 deletion docs/source/api.rst
@@ -10,7 +10,7 @@ Tokenization

Tokenization is performed by the ``Tokeniser`` class. The ``next`` method
consumes characters from a feeder and returns a token if the tokenization
succeeds. If the tokenization fails an instance of ``TranslateError`` is
succeeds. If the tokenization fails an instance of ``SyntaxError`` is
raised.

.. autoclass:: Tokeniser(object)
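
For orientation, here is a minimal sketch of how the documented pieces fit together. Only the ``next`` method and the ``SyntaxError`` behaviour are taken from the text above; the ``Tokeniser(feeder)`` constructor call is an assumption for illustration.

```python
# Minimal sketch, assuming Tokeniser is constructed from a LineFeeder.
from mathics_scanner import SingleLineFeeder
from mathics_scanner.errors import SyntaxError as MathicsSyntaxError
from mathics_scanner.tokeniser import Tokeniser

feeder = SingleLineFeeder(r"1 + \[Theta]", filename="<example>")
tokeniser = Tokeniser(feeder)

try:
    token = tokeniser.next()  # returns the next token on success
    print(token)
except MathicsSyntaxError as err:
    # On failure, the SyntaxError described above carries a message tag,
    # e.g. "sntufn" for an unknown named character.
    print("tokenization failed:", err.tag)
```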
8 changes: 2 additions & 6 deletions mathics_scanner/__init__.py
@@ -15,9 +15,7 @@
from mathics_scanner.errors import (
IncompleteSyntaxError,
InvalidSyntaxError,
ScanError,
TranslateError,
TranslateErrorNew,
SyntaxError,
)
from mathics_scanner.feed import (
FileLineFeeder,
@@ -36,12 +34,10 @@
"InvalidSyntaxError",
"LineFeeder",
"MultiLineFeeder",
"ScanError",
"SyntaxError",
"SingleLineFeeder",
# "Token",
# "Tokeniser",
"TranslateError",
"TranslateErrorNew",
"__version__",
"aliased_characters",
# "is_symbol_name",
12 changes: 11 additions & 1 deletion mathics_scanner/data/named-characters.yml
@@ -68,7 +68,7 @@
# the named character. If it is the same as unicode-equivalent
# it should be omitted
#
# wl-unicode-name: The name of the character corresponding to `wl-unicode`, if it exists. If it is the same as unicode-equivalent-name it can be omitted.
# wl-unicode-name: The name of the character corresponding to `wl-unicode`, if it exists.
# It will be mentioned in Wolfram Language docs if it exists.
#
# Sources:
@@ -6628,6 +6628,16 @@ LightBulb:
wl-reference: https://reference.wolfram.com/language/ref/character/LightBulb.html
wl-unicode: "\uF723"

LineSeparator:
has-unicode-inverse: false
is-letter-like: false
unicode-equivalent: "\u2028"
unicode-equivalent-name: LINE SEPARATOR
unicode-reference: https://www.compart.com/en/unicode/U+2028
wl-reference: https://reference.wolfram.com/language/ref/character/LineSeparator.html
wl-unicode: "\u2028"
wl-unicode-name: LINE SEPARATOR

LongDash:
esc-alias: --
has-unicode-inverse: false
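
A small Python sketch of what the new ``LineSeparator`` entry should make possible once the generated character tables include it; the ``named_characters`` lookup and the call shape of ``parse_escape_sequence`` follow the code added later in this PR.

```python
# Illustrative only; assumes the regenerated JSON tables pick up this entry.
from mathics_scanner.characters import named_characters
from mathics_scanner.escape_sequences import parse_escape_sequence

# \[LineSeparator] should resolve to U+2028 (LINE SEPARATOR).
assert named_characters.get("LineSeparator") == "\u2028"

# pos is the offset just after the backslash, so it points at "[".
value, next_pos = parse_escape_sequence("[LineSeparator]", 0)
assert value == "\u2028" and next_pos == 15
```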
29 changes: 11 additions & 18 deletions mathics_scanner/errors.py
@@ -1,46 +1,39 @@
# -*- coding: utf-8 -*-


class TranslateErrorNew(Exception):
class SyntaxError(Exception):
"""Some sort of error in the scanning or tokenization phase parsing Mathics3.

There are more specific kinds of exceptions subclassed from this
exception class.
"""

def __init__(self, tag: str, *args):
super().__init__()
self.name = "Syntax"
self.tag = tag
self.args = args


class TranslateError(Exception):
"""
A generic class of tokenization errors. This exception is subclassed by other
tokenization errors
"""


class EscapeSyntaxError(TranslateErrorNew):
class EscapeSyntaxError(SyntaxError):
"""Escape sequence syntax error"""

pass


class IncompleteSyntaxError(TranslateErrorNew):
class IncompleteSyntaxError(SyntaxError):
"""More characters were expected to form a valid token"""

pass


class InvalidSyntaxError(TranslateErrorNew):
class InvalidSyntaxError(SyntaxError):
"""Invalid syntax"""

pass


class NamedCharacterSyntaxError(TranslateError):
class NamedCharacterSyntaxError(EscapeSyntaxError):
"""Named character syntax error"""

pass


class ScanError(TranslateErrorNew):
"""A generic scanning error"""

pass
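
The net effect of this rewrite is a single root exception, ``SyntaxError``, carrying a Wolfram-style message ``tag``, with more specific subclasses beneath it; ``except`` clauses therefore have to be ordered from most to least specific. A compact sketch (the alias only avoids shadowing Python's builtin ``SyntaxError``; the message arguments are illustrative):

```python
from mathics_scanner.errors import (
    EscapeSyntaxError,
    NamedCharacterSyntaxError,
)
from mathics_scanner.errors import SyntaxError as MathicsSyntaxError

# Hierarchy introduced by this change.
assert issubclass(NamedCharacterSyntaxError, EscapeSyntaxError)
assert issubclass(EscapeSyntaxError, MathicsSyntaxError)

try:
    raise NamedCharacterSyntaxError("sntufn", "NoSuchCharacterName")
except NamedCharacterSyntaxError as err:
    # This clause must come before any EscapeSyntaxError / SyntaxError
    # handler, otherwise the more general clause would swallow it.
    print(err.name, err.tag, err.args)  # Syntax sntufn ('NoSuchCharacterName',)
```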
148 changes: 148 additions & 0 deletions mathics_scanner/escape_sequences.py
@@ -0,0 +1,148 @@
"""
Helper Module for tokenizing character escape sequences.
"""

from typing import Optional, Tuple

from mathics_scanner.characters import named_characters
from mathics_scanner.errors import (
EscapeSyntaxError,
NamedCharacterSyntaxError,
SyntaxError,
)


def parse_base(source_text: str, start_shift: int, end_shift: int, base: int) -> str:
r"""
See if characters start_shift .. end_shift
can be converted to an integer in base ``base``.

If so, chr(integer value converted from base) is returned.

However, if the conversion fails, SyntaxError is raised.
"""
last = end_shift - start_shift
if last == 2:
tag = "sntoct2"
elif last == 3:
assert base == 8, "Only octal requires 3 digits"
tag = "sntoct1"
elif last in (4, 6):
tag = "snthex"
else:
raise ValueError()

if end_shift > len(source_text):
raise SyntaxError("Syntax", tag)

assert start_shift <= end_shift
text = source_text[start_shift:end_shift]
try:
result = int(text, base)
except ValueError:
raise SyntaxError(tag, source_text[start_shift:].rstrip("\n"))

return chr(result)


def parse_named_character(source_text: str, start: int, finish: int) -> Optional[str]:
r"""
Find the unicode-equivalent symbol for a string named character.

Before calling we have matched the text between "\[" and "]" of the input.

The named character is thus in source_text[start:finish].

Match this string with the known named characters,
e.g. "Theta". If we can match this, then we return the unicode equivalent from the
`named_characters` map (which is read in from JSON but stored in a YAML file).

If we can't find the named character, raise NamedCharacterSyntaxError.
"""
named_character = source_text[start:finish]
if named_character.isalpha():
char = named_characters.get(named_character)
if char is None:
raise NamedCharacterSyntaxError("sntufn", named_character)
else:
return char


def parse_escape_sequence(source_text: str, pos: int) -> Tuple[str, int]:
"""Given some source text in `source_text` starting at offset
`pos`, return the escape-sequence value for this text and the
follow-on offset position.
"""
result = ""
c = source_text[pos]
if c == "\\":
return "\\", pos + 1

# https://www.wolfram.com/language/12/networking-and-system-operations/use-the-full-range-of-unicode-characters.html
# describes hex encoding.
if c == ".":
# see if we have a 2-digit hexadecimal number.
# for example, \.42 is "B"
result += parse_base(source_text, pos + 1, pos + 3, 16)
pos += 3
elif c == ":":
# see if we have a 4-digit hexadecimal number.
# for example, \:03b8 is Unicode small letter theta: θ.
result += parse_base(source_text, pos + 1, pos + 5, 16)
pos += 5
elif c == "|":
# see if we have a 6-digit hexadecimal number.
result += parse_base(source_text, pos + 1, pos + 7, 16)
pos += 7
elif c == "[":
pos += 1
i = pos + 1
while i < len(source_text):
if source_text[i] == "]":
break
i += 1
if i == len(source_text):
# Note: named characters do not have \n's in them. (Is this right)?
# FIXME: decide what to do here.
raise NamedCharacterSyntaxError("Syntax", "sntufn", source_text[pos:])

named_character = parse_named_character(source_text, pos, i)
if named_character is None:
raise NamedCharacterSyntaxError("Syntax", "sntufn", source_text[pos:i])

result += named_character
pos = i + 1
elif c in "01234567":
# See if we have a 3-digit octal number.
# For example \065 = "5"
result += parse_base(source_text, pos, pos + 3, 8)
pos += 3

# WMA escape characters \n, \t, \b, \r.
# Note that these are similar to Python's, but are different.
# In particular, Python defines "\a" to be ^G (control G),
# but in WMA, this is invalid.
elif c in "ntbfr $\n":
if c in "n\n":
result += "\n"
elif c == " ":
result += " "
elif c == "t":
result += "\t"
elif c == "b":
result += "\b"
elif c == "f":
result += "\f"
elif c in '$"':
# I don't know why \$ is defined, but it is!
result += rf"\{c}"
else:
assert c == "r"
result += "\r"
pos += 1
elif c in '!"':
result += c
pos += 1
else:
raise EscapeSyntaxError("stresc", rf"\{c}")
return result, pos
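
A short usage sketch of the new helper, based on the code above. Per the backslash branch, ``pos`` is the offset of the character that follows the backslash; the ``Theta`` result assumes the bundled character data maps it to U+03B8.

```python
# Illustrative usage; offsets follow from the code above.
from mathics_scanner.escape_sequences import parse_escape_sequence

# Named character: pos points just past the backslash, at "[".
value, next_pos = parse_escape_sequence("[Theta]+1", 0)
assert value == "\u03b8" and next_pos == 7  # next_pos is the offset of "+"

# Two-digit hexadecimal escape, as in \.42
value, next_pos = parse_escape_sequence(".42 rest", 0)
assert value == "B" and next_pos == 3
```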
6 changes: 3 additions & 3 deletions mathics_scanner/feed.py
@@ -130,22 +130,22 @@ def empty(self) -> bool:
class SingleLineFeeder(LineFeeder):
"A feeder that feeds all the code as a single line."

def __init__(self, code: str, filename=""):
def __init__(self, source_text: str, filename=""):
"""
:param source_text: The source of the feeder (a string).
:param filename: A string that describes the source of the feeder, i.e.
the filename that is being fed.
"""
super().__init__(filename)
self.code = code
self.source_text = source_text
self._empty = False

def feed(self) -> str:
if self._empty:
return ""
self._empty = True
self.lineno += 1
return self.code
return self.source_text

def empty(self) -> bool:
return self._empty
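
A small sketch of the renamed interface, following the code above (the ``filename`` keyword is shown only for clarity):

```python
from mathics_scanner.feed import SingleLineFeeder

feeder = SingleLineFeeder("x + y", filename="<example>")
assert not feeder.empty()
line = feeder.feed()      # the whole source text comes back as one "line"
assert line == "x + y"
assert feeder.empty()     # a second feed() would return ""
```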
23 changes: 14 additions & 9 deletions mathics_scanner/mathics3_tokens.py
@@ -10,7 +10,7 @@
from mathics_scanner.errors import (
EscapeSyntaxError,
NamedCharacterSyntaxError,
ScanError,
SyntaxError,
)
from mathics_scanner.feed import FileLineFeeder, LineFeeder, SingleLineFeeder
from mathics_scanner.tokeniser import Tokeniser
@@ -162,25 +162,30 @@ def interactive_eval_loop(shell: TerminalShell, code_tokenize_format: bool):
try:
source_text = shell.feed()
tokens(source_text, code_tokenize_format)
except ScanError:
shell.errmsg(
"Syntax",
"sntxi",
"Expression error",
)
pass
except NamedCharacterSyntaxError:
shell.errmsg(
"Syntax",
"sntufn",
"Unknown unicode longname",
)
# This has to come after NamedCharacterSyntaxError
# since that is a subclass of EscapeSyntaxError
except EscapeSyntaxError:
shell.errmsg(
"Syntax",
"sntufn",
"Unknown unicode longname",
)
# This has to come after NamedCharacterSyntaxError and
# EscapeSyntaxError since those are subclasses of
# SyntaxError
except SyntaxError:
shell.errmsg(
"Syntax",
"sntxi",
"Expression error",
)
pass
except KeyboardInterrupt:
print("\nKeyboardInterrupt. Type Ctrl-D (EOF) to exit.")
except EOFError:
@@ -199,7 +204,7 @@ def tokens(code, code_tokenize_format: bool):
while True:
try:
token = tokeniser.next()
except ScanError as scan_error:
except SyntaxError as scan_error:
mess = ""
if scan_error.tag == "sntoct1":
mess = r"3 octal digits are required after \ to construct an 8-bit character"