diff --git a/src/flisp/julia_extensions.c b/src/flisp/julia_extensions.c index 091a0799edb5f..11ed8147e3005 100644 --- a/src/flisp/julia_extensions.c +++ b/src/flisp/julia_extensions.c @@ -40,11 +40,67 @@ value_t fl_skipws(value_t *args, u_int32_t nargs) return skipped; } +static int is_wc_cat_id_start(uint32_t wc, utf8proc_propval_t cat) +{ + return (cat == UTF8PROC_CATEGORY_LU || cat == UTF8PROC_CATEGORY_LL || + cat == UTF8PROC_CATEGORY_LT || cat == UTF8PROC_CATEGORY_LM || + cat == UTF8PROC_CATEGORY_LO || cat == UTF8PROC_CATEGORY_NL || + // allow currency symbols + cat == UTF8PROC_CATEGORY_SC || + // allow all latin-1 characters except math symbols and quotes + (wc <= 0xff && cat != UTF8PROC_CATEGORY_SM && + cat != UTF8PROC_CATEGORY_PF && cat != UTF8PROC_CATEGORY_PI) || + // Other_ID_Start + wc == 0x2118 || wc == 0x212E || (wc >= 0x309B && wc <= 0x309C)); +} + +static int jl_id_start_char(uint32_t wc) +{ + if ((wc >= 'A' && wc <= 'Z') || (wc >= 'a' && wc <= 'z') || wc == '_') + return 1; + if (wc < 0xA1 || wc > 0x10ffff) + return 0; + const utf8proc_property_t *prop = utf8proc_get_property(wc); + return is_wc_cat_id_start(wc, prop->category); +} + static int jl_id_char(uint32_t wc) { - return ((wc >= 'A' && wc <= 'Z') || (wc >= 'a' && wc <= 'z') || - (wc >= '0' && wc <= '9') || (wc >= 0xA1) || - wc == '!' || wc == '_'); + if ((wc >= 'A' && wc <= 'Z') || (wc >= 'a' && wc <= 'z') || wc == '_' || + (wc >= '0' && wc <= '9') || wc == '!') + return 1; + if (wc < 0xA1 || wc > 0x10ffff) + return 0; + const utf8proc_property_t *prop = utf8proc_get_property(wc); + utf8proc_propval_t cat = prop->category; + if (is_wc_cat_id_start(wc, cat)) return 1; + if (cat == UTF8PROC_CATEGORY_MN || cat == UTF8PROC_CATEGORY_MC || + cat == UTF8PROC_CATEGORY_ND || cat == UTF8PROC_CATEGORY_PC || + cat == UTF8PROC_CATEGORY_SK || + // primes + (wc >= 0x2032 && wc <= 0x2034) || + // Other_ID_Continue + wc == 0x0387 || wc == 0x19da || (wc >= 0x1369 && wc <= 0x1371)) + return 1; + return 0; +} + +value_t fl_julia_identifier_char(value_t *args, u_int32_t nargs) +{ + argcount("identifier-char?", nargs, 1); + if (!iscprim(args[0]) || ((cprim_t*)ptr(args[0]))->type != wchartype) + type_error("identifier-char?", "wchar", args[0]); + uint32_t wc = *(uint32_t*)cp_data((cprim_t*)ptr(args[0])); + return jl_id_char(wc); +} + +value_t fl_julia_identifier_start_char(value_t *args, u_int32_t nargs) +{ + argcount("identifier-start-char?", nargs, 1); + if (!iscprim(args[0]) || ((cprim_t*)ptr(args[0]))->type != wchartype) + type_error("identifier-start-char?", "wchar", args[0]); + uint32_t wc = *(uint32_t*)cp_data((cprim_t*)ptr(args[0])); + return jl_id_start_char(wc); } // return NFC-normalized UTF8-encoded version of s @@ -105,6 +161,8 @@ value_t fl_accum_julia_symbol(value_t *args, u_int32_t nargs) static builtinspec_t julia_flisp_func_info[] = { { "skip-ws", fl_skipws }, { "accum-julia-symbol", fl_accum_julia_symbol }, + { "identifier-char?", fl_julia_identifier_char }, + { "identifier-start-char?", fl_julia_identifier_start_char }, { NULL, NULL } }; diff --git a/src/julia-parser.scm b/src/julia-parser.scm index 67a145217b373..481c427fa5cdb 100644 --- a/src/julia-parser.scm +++ b/src/julia-parser.scm @@ -117,14 +117,6 @@ (let ((chrs (string->list "()[]{},;\"`@"))) (lambda (c) (memv c chrs)))) (define (newline? c) (eqv? c #\newline)) -(define (identifier-char? c) (or (and (char>=? c #\A) - (char<=? c #\Z)) - (and (char>=? c #\a) - (char<=? c #\z)) - (and (char>=? c #\0) - (char<=? c #\9)) - (char>=? c #\uA1) - (eqv? c #\_))) ;; characters that can be in an operator (define (opchar? c) (and (char? c) (string.find op-chars c))) ;; characters that can follow . in an operator @@ -418,7 +410,7 @@ ((opchar? c) (read-operator port (read-char port))) - ((identifier-char? c) (accum-julia-symbol c port)) + ((identifier-start-char? c) (accum-julia-symbol c port)) (else (error (string "invalid character \"" (read-char port) "\"")))))) @@ -1523,7 +1515,7 @@ (define (parse-interpolate s) (let* ((p (ts:port s)) (c (peek-char p))) - (cond ((identifier-char? c) + (cond ((identifier-start-char? c) (parse-atom s)) ((eqv? c #\() (read-char p)