use unicode character categories to classify identifier characters

fixes #6797, fixes #5936
JuliaLang · May 13, 2014 · 82e34b6 · 82e34b6 · staticfloat · May 14, 2014
1 parent f4d1e94
commit 82e34b6
Show file tree

Hide file tree

Showing 2 changed files with 63 additions and 13 deletions.
diff --git a/src/flisp/julia_extensions.c b/src/flisp/julia_extensions.c
@@ -40,11 +40,67 @@ value_t fl_skipws(value_t *args, u_int32_t nargs)
     return skipped;
 }
 
+static int is_wc_cat_id_start(uint32_t wc, utf8proc_propval_t cat)
+{
+    return (cat == UTF8PROC_CATEGORY_LU || cat == UTF8PROC_CATEGORY_LL ||
+            cat == UTF8PROC_CATEGORY_LT || cat == UTF8PROC_CATEGORY_LM ||
+            cat == UTF8PROC_CATEGORY_LO || cat == UTF8PROC_CATEGORY_NL ||
+            // allow currency symbols
+            cat == UTF8PROC_CATEGORY_SC ||
+            // allow all latin-1 characters except math symbols and quotes
+            (wc <= 0xff && cat != UTF8PROC_CATEGORY_SM &&
+             cat != UTF8PROC_CATEGORY_PF && cat != UTF8PROC_CATEGORY_PI) ||
+            // Other_ID_Start
+            wc == 0x2118 || wc == 0x212E || (wc >= 0x309B && wc <= 0x309C));
+}
+
+static int jl_id_start_char(uint32_t wc)
+{
+    if ((wc >= 'A' && wc <= 'Z') || (wc >= 'a' && wc <= 'z') || wc == '_')
+        return 1;
+    if (wc < 0xA1 || wc > 0x10ffff)
+        return 0;
+    const utf8proc_property_t *prop = utf8proc_get_property(wc);
+    return is_wc_cat_id_start(wc, prop->category);
+}
+
 static int jl_id_char(uint32_t wc)
 {
-    return ((wc >= 'A' && wc <= 'Z') || (wc >= 'a' && wc <= 'z') ||
-            (wc >= '0' && wc <= '9') || (wc >= 0xA1) ||
-            wc == '!' || wc == '_');
+    if ((wc >= 'A' && wc <= 'Z') || (wc >= 'a' && wc <= 'z') || wc == '_' ||
+        (wc >= '0' && wc <= '9') || wc == '!')
+        return 1;
+    if (wc < 0xA1 || wc > 0x10ffff)
+        return 0;
+    const utf8proc_property_t *prop = utf8proc_get_property(wc);
+    utf8proc_propval_t cat = prop->category;
+    if (is_wc_cat_id_start(wc, cat)) return 1;
+    if (cat == UTF8PROC_CATEGORY_MN || cat == UTF8PROC_CATEGORY_MC ||
+        cat == UTF8PROC_CATEGORY_ND || cat == UTF8PROC_CATEGORY_PC ||
+        cat == UTF8PROC_CATEGORY_SK ||
+        // primes
+        (wc >= 0x2032 && wc <= 0x2034) ||
+        // Other_ID_Continue
+        wc == 0x0387 || wc == 0x19da || (wc >= 0x1369 && wc <= 0x1371))
+        return 1;
+    return 0;
+}
+
+value_t fl_julia_identifier_char(value_t *args, u_int32_t nargs)
+{
+    argcount("identifier-char?", nargs, 1);
+    if (!iscprim(args[0]) || ((cprim_t*)ptr(args[0]))->type != wchartype)
+        type_error("identifier-char?", "wchar", args[0]);
+    uint32_t wc = *(uint32_t*)cp_data((cprim_t*)ptr(args[0]));
+    return jl_id_char(wc);
+}
+
+value_t fl_julia_identifier_start_char(value_t *args, u_int32_t nargs)
+{
+    argcount("identifier-start-char?", nargs, 1);
+    if (!iscprim(args[0]) || ((cprim_t*)ptr(args[0]))->type != wchartype)
+        type_error("identifier-start-char?", "wchar", args[0]);
+    uint32_t wc = *(uint32_t*)cp_data((cprim_t*)ptr(args[0]));
+    return jl_id_start_char(wc);
 }
 
 // return NFC-normalized UTF8-encoded version of s
@@ -105,6 +161,8 @@ value_t fl_accum_julia_symbol(value_t *args, u_int32_t nargs)
 static builtinspec_t julia_flisp_func_info[] = {
     { "skip-ws", fl_skipws },
     { "accum-julia-symbol", fl_accum_julia_symbol },
+    { "identifier-char?", fl_julia_identifier_char },
+    { "identifier-start-char?", fl_julia_identifier_start_char },
     { NULL, NULL }
 };
 

diff --git a/src/julia-parser.scm b/src/julia-parser.scm
@@ -117,14 +117,6 @@
   (let ((chrs (string->list "()[]{},;\"`@")))
     (lambda (c) (memv c chrs))))
 (define (newline? c) (eqv? c #\newline))
-(define (identifier-char? c) (or (and (char>=? c #\A)
-				      (char<=? c #\Z))
-				 (and (char>=? c #\a)
-				      (char<=? c #\z))
-				 (and (char>=? c #\0)
-				      (char<=? c #\9))
-				 (char>=? c #\uA1)
-				 (eqv? c #\_)))
 ;; characters that can be in an operator
 (define (opchar? c) (and (char? c) (string.find op-chars c)))
 ;; characters that can follow . in an operator
@@ -418,7 +410,7 @@
 
 	  ((opchar? c)  (read-operator port (read-char port)))
 
-	  ((identifier-char? c) (accum-julia-symbol c port))
+	  ((identifier-start-char? c) (accum-julia-symbol c port))
 
 	  (else (error (string "invalid character \"" (read-char port) "\""))))))
 
@@ -1523,7 +1515,7 @@
 (define (parse-interpolate s)
   (let* ((p (ts:port s))
          (c (peek-char p)))
-    (cond ((identifier-char? c)
+    (cond ((identifier-start-char? c)
            (parse-atom s))
           ((eqv? c #\()
            (read-char p)