Skip to content

Commit

Permalink
use unicode character categories to classify identifier characters
Browse files Browse the repository at this point in the history
fixes #6797, fixes #5936
  • Loading branch information
JeffBezanson committed May 13, 2014
1 parent f4d1e94 commit 82e34b6
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 13 deletions.
64 changes: 61 additions & 3 deletions src/flisp/julia_extensions.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,67 @@ value_t fl_skipws(value_t *args, u_int32_t nargs)
return skipped;
}

static int is_wc_cat_id_start(uint32_t wc, utf8proc_propval_t cat)
{
return (cat == UTF8PROC_CATEGORY_LU || cat == UTF8PROC_CATEGORY_LL ||
cat == UTF8PROC_CATEGORY_LT || cat == UTF8PROC_CATEGORY_LM ||
cat == UTF8PROC_CATEGORY_LO || cat == UTF8PROC_CATEGORY_NL ||
// allow currency symbols
cat == UTF8PROC_CATEGORY_SC ||
// allow all latin-1 characters except math symbols and quotes
(wc <= 0xff && cat != UTF8PROC_CATEGORY_SM &&
cat != UTF8PROC_CATEGORY_PF && cat != UTF8PROC_CATEGORY_PI) ||
// Other_ID_Start
wc == 0x2118 || wc == 0x212E || (wc >= 0x309B && wc <= 0x309C));
}

static int jl_id_start_char(uint32_t wc)
{
if ((wc >= 'A' && wc <= 'Z') || (wc >= 'a' && wc <= 'z') || wc == '_')
return 1;
if (wc < 0xA1 || wc > 0x10ffff)
return 0;
const utf8proc_property_t *prop = utf8proc_get_property(wc);
return is_wc_cat_id_start(wc, prop->category);
}

static int jl_id_char(uint32_t wc)
{
return ((wc >= 'A' && wc <= 'Z') || (wc >= 'a' && wc <= 'z') ||
(wc >= '0' && wc <= '9') || (wc >= 0xA1) ||
wc == '!' || wc == '_');
if ((wc >= 'A' && wc <= 'Z') || (wc >= 'a' && wc <= 'z') || wc == '_' ||
(wc >= '0' && wc <= '9') || wc == '!')
return 1;
if (wc < 0xA1 || wc > 0x10ffff)
return 0;
const utf8proc_property_t *prop = utf8proc_get_property(wc);
utf8proc_propval_t cat = prop->category;
if (is_wc_cat_id_start(wc, cat)) return 1;
if (cat == UTF8PROC_CATEGORY_MN || cat == UTF8PROC_CATEGORY_MC ||
cat == UTF8PROC_CATEGORY_ND || cat == UTF8PROC_CATEGORY_PC ||
cat == UTF8PROC_CATEGORY_SK ||
// primes
(wc >= 0x2032 && wc <= 0x2034) ||
// Other_ID_Continue
wc == 0x0387 || wc == 0x19da || (wc >= 0x1369 && wc <= 0x1371))
return 1;
return 0;
}

value_t fl_julia_identifier_char(value_t *args, u_int32_t nargs)
{
argcount("identifier-char?", nargs, 1);
if (!iscprim(args[0]) || ((cprim_t*)ptr(args[0]))->type != wchartype)
type_error("identifier-char?", "wchar", args[0]);
uint32_t wc = *(uint32_t*)cp_data((cprim_t*)ptr(args[0]));
return jl_id_char(wc);
}

value_t fl_julia_identifier_start_char(value_t *args, u_int32_t nargs)
{
argcount("identifier-start-char?", nargs, 1);
if (!iscprim(args[0]) || ((cprim_t*)ptr(args[0]))->type != wchartype)
type_error("identifier-start-char?", "wchar", args[0]);
uint32_t wc = *(uint32_t*)cp_data((cprim_t*)ptr(args[0]));
return jl_id_start_char(wc);
}

// return NFC-normalized UTF8-encoded version of s
Expand Down Expand Up @@ -105,6 +161,8 @@ value_t fl_accum_julia_symbol(value_t *args, u_int32_t nargs)
static builtinspec_t julia_flisp_func_info[] = {
{ "skip-ws", fl_skipws },
{ "accum-julia-symbol", fl_accum_julia_symbol },
{ "identifier-char?", fl_julia_identifier_char },
{ "identifier-start-char?", fl_julia_identifier_start_char },
{ NULL, NULL }
};

Expand Down
12 changes: 2 additions & 10 deletions src/julia-parser.scm
Original file line number Diff line number Diff line change
Expand Up @@ -117,14 +117,6 @@
(let ((chrs (string->list "()[]{},;\"`@")))
(lambda (c) (memv c chrs))))
(define (newline? c) (eqv? c #\newline))
(define (identifier-char? c) (or (and (char>=? c #\A)
(char<=? c #\Z))
(and (char>=? c #\a)
(char<=? c #\z))
(and (char>=? c #\0)
(char<=? c #\9))
(char>=? c #\uA1)
(eqv? c #\_)))
;; characters that can be in an operator
(define (opchar? c) (and (char? c) (string.find op-chars c)))
;; characters that can follow . in an operator
Expand Down Expand Up @@ -418,7 +410,7 @@

((opchar? c) (read-operator port (read-char port)))

((identifier-char? c) (accum-julia-symbol c port))
((identifier-start-char? c) (accum-julia-symbol c port))

(else (error (string "invalid character \"" (read-char port) "\""))))))

Expand Down Expand Up @@ -1523,7 +1515,7 @@
(define (parse-interpolate s)
(let* ((p (ts:port s))
(c (peek-char p)))
(cond ((identifier-char? c)
(cond ((identifier-start-char? c)
(parse-atom s))
((eqv? c #\()
(read-char p)
Expand Down

23 comments on commit 82e34b6

@staticfloat
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JeffBezanson Ever since this commit, I can no longer using AWS successfully.

On 82e34b6, we seem to have a memory leak, as calling using AWS will freeze, then eat up all available memory until I forcibly kill it.

On f5b5b63 (which is master at the time of writing) I get an error message:

$ ./julia
               _
   _       _ _(_)_     |  A fresh approach to technical computing
  (_)     | (_) (_)    |  Documentation: http://docs.julialang.org
   _ _   _| |_  __ _   |  Type "help()" to list help topics
  | | | | | | |/ _` |  |
  | | |_| | | | (_| |  |  Version 0.3.0-prerelease+3055 (2014-05-14 22:47 UTC)
 _/ |\__'_|_|_|\__'_|  |  Commit f5b5b63 (0 days old master)
|__/                   |  x86_64-apple-darwin12.5.0

julia> using AWS
ERROR: syntax: invalid character "​"
 in include at boot.jl:244 (repeats 2 times)
 in reload_path at loading.jl:152
 in _require at loading.jl:67
 in require at loading.jl:51
while loading /Users/sabae/.julia/AWS/src/S3.jl, in expression starting on line 1
while loading /Users/sabae/.julia/AWS/src/AWS.jl, in expression starting on line 84

@JeffBezanson
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

82e34b6 itself is no good; it was fixed by subsequent commits.

In s3_types.jl there is a zero-width space on line 652. I suppose that could be considered an identifier-continue character?

@staticfloat
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JeffBezanson
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another possibility is that we should strip "default-ignorable" characters entirely. Any opinions @stevengj @jiahao ?

@StefanKarpinski
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about printing a clear error message? Having zero-width spaces in code seems like asking for trouble.

@jiahao
Copy link
Member

@jiahao jiahao commented on 82e34b6 May 15, 2014

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm in favor of the error message option. Some of the 'default-ignorable' characters like soft hyphen, \u00ad, have rather complex behaviors that may be semantically meaningful in context. (Others, like zero-width joiner, \u200d, seem like pure trouble.)

@JeffBezanson
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why use the zero-width joiner when you can use the zero-width non-joiner? Or my favorite, the zero-width non-breaking space.

@jiahao
Copy link
Member

@jiahao jiahao commented on 82e34b6 May 15, 2014

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This discussion lacks breadth, period.

@amitmurthy
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW, I don't know how those characters got in in the first place. I use Kate as an editor. And that particular file may have had quite a bit of Copy-Paste of variable names from either Amazon's documentation on the web or from a pdf.

Thus, these characters may creep in again in regular use in other similar circumstances. So, I would prefer "stripping 'default-ignorable' characters entirely"

@staticfloat
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jiahao you deserve some kind of award. Also, "breadth."

@jiahao
Copy link
Member

@jiahao jiahao commented on 82e34b6 May 15, 2014

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TIL that it is possible to deprecate the meaning of characters in the Unicode standard. Somehow I feel that this would make for excellent Existential Comics fodder.

@mlubin
Copy link
Member

@mlubin mlubin commented on 82e34b6 May 15, 2014

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@MichaelHatherly reported that ∑ (summation) is no longer accepted as a valid symbol name. Σ (sigma) is still valid. Is this intended?

@stevengj
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mlubin, I think we didn't want to include the entire category Sm as valid identifier characters, because many of these should be infix operators instead.

However, we should probably whitelist more of the category Sm characters (currently just ℘ is allowed from Sm) as allowed identifiers. I would suggest at least those in https://gist.github.com/stevengj/0dd4927f019bf504df47

(I can't paste them in here as Github doesn't allow some of those unicode characters in comments, grrr.)

@stevengj
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One possibility would be to make everything in category Sm an infix operator except a small whitelist of identifier chars (plus a small list of prefix operators).

(Alternatively, make everything in Sm a valid identifier except a list of infix operators. However, it seems like there are a lot more infix operators than identifier chars in Sm. See my comment in #6582.)

@mlubin
Copy link
Member

@mlubin mlubin commented on 82e34b6 May 15, 2014

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In JuMP we recently supported ∑{} and Σ{} for sum{} and ∏{} for prod{}. I didn't realize the product character was no longer valid either.

@IainNZ @joehuchette

@JeffBezanson
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes I think we should whitelist those.

@IainNZ
Copy link
Member

@IainNZ IainNZ commented on 82e34b6 May 15, 2014

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@stevengj
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, because of the precedence issue (we need to manually pick what precedence each infix operator has), it might actually be better to:

This would also be easier to implement for 0.3 (adding Sm is a one-liner) and less traumatic in terms of backwards compatibility with JuMP etc.

@Keno
Copy link
Member

@Keno Keno commented on 82e34b6 May 20, 2014

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

May I also request the super and subscript +-= 207A-208C as valid identifiers?

@stevengj
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's please just allow anything in Sm as an identifier unless it is is specifically listed as an infix/prefix operator. @loladiro, super/subscript +/– are in Sm, so this would fix that.

@Keno
Copy link
Member

@Keno Keno commented on 82e34b6 May 20, 2014

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, that seems like the correct solution to me.

@JeffBezanson
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The safest thing is to leave most characters invalid, and then gradually classify them as either identifier characters or correctly-parsed operators. We already have an identifier character whitelist and an operator list, and we will keep growing both of those. I think it is equally difficult to enumerate either the operators in Sm or the non-operators in Sm.

@stevengj
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JeffBezanson, I've come to the conclusion that you're probably right in this, since switching a character from an identifier-char (if Sm were allowed by default) to an operator-char would cause unexpected code breakage.

Please sign in to comment.