diff --git a/src/lexer.mll b/src/lexer.mll index b64030fe1a6..2f5836c36a5 100644 --- a/src/lexer.mll +++ b/src/lexer.mll @@ -22,9 +22,23 @@ let error_nest start lexbuf msg = lexbuf.Lexing.lex_start_p <- start; error lexbuf msg -let unicode lexbuf s i = +let classify_utf8_leader lexbuf = Int32.(function + | ch when logand ch (lognot 0b01111111l) = 0b00000000l -> 0 + | ch when logand ch (lognot 0b00011111l) = 0b11000000l -> 1 + | ch when logand ch (lognot 0b00001111l) = 0b11100000l -> 2 + | ch when logand ch (lognot 0b00000111l) = 0b11110000l -> 3 + | ch -> error lexbuf (Printf.sprintf "invalid utf-8 character: 0x%x" (Int32.to_int ch))) + +let utf8_decoder l lexbuf s i = + let leading = classify_utf8_leader lexbuf (Int32.of_int (Char.code s.[!i])) + in if leading = 0 then Char.code s.[!i] + else match Utf8.decode (String.sub s !i (1 + leading)) with + | [code] -> i := !i + leading; code + | _ -> error lexbuf "can not interpret unicode character" + +let unicode lexbuf s i decoder = let u = - if s.[!i] <> '\\' then Char.code s.[!i] else + if s.[!i] <> '\\' then decoder lexbuf s i else match (incr i; s.[!i]) with | 'n' -> Char.code '\n' | 'r' -> Char.code '\r' @@ -44,14 +58,17 @@ let unicode lexbuf s i = int_of_string ("0x" ^ String.make 1 h ^ String.make 1 s.[!i]) in incr i; u -let char lexbuf s = - unicode lexbuf s (ref 1) +let char lexbuf s = unicode lexbuf s (ref 1) (fun _ _ _ -> + match Utf8.decode s with + | [39; code; 39] -> code (* surrounded by apostrophes *) + | _ -> error lexbuf "can not interpret unicode character") let text lexbuf s = - let b = Buffer.create (String.length s) in + let l = String.length s in + let b = Buffer.create l in let i = ref 1 in - while !i < String.length s - 1 do - let bs = Utf8.encode [unicode lexbuf s i] in + while !i < l - 1 do + let bs = Utf8.encode [unicode lexbuf s i (utf8_decoder l)] in Buffer.add_substring b bs 0 (String.length bs) done; Buffer.contents b diff --git a/test/run/conversions.as b/test/run/conversions.as index cd1cf3c2021..60afd31f484 100644 --- a/test/run/conversions.as +++ b/test/run/conversions.as @@ -143,6 +143,7 @@ println(word32ToInt 4294967295); // == (-1) // 2**32 - 1 assert(charToWord32 '\u{00}' == (0 : Word32)); assert(charToWord32 '*' == (42 : Word32)); +assert(charToWord32 'ะŸ' == (1055 : Word32)); assert(charToWord32 '\u{ffff}' == (65535 : Word32)); // 2**16 - 1 assert(charToWord32 '\u{10ffff}' == (0x10FFFF : Word32)); diff --git a/test/run/literals.as b/test/run/literals.as index 42060c4348f..b87c333bc96 100644 --- a/test/run/literals.as +++ b/test/run/literals.as @@ -6,4 +6,7 @@ let byte : Word8 = 0xFF : Word8; let short : Word16 = 0xFFFF : Word16; let word : Word32 = 0xFFFF_FFFF : Word32; let u = '\u{a34}'; +let gu = '๐ŸŽธ'; +let ru = "ะŸั€ะธะฒะตั‚ัั‚ะฒัƒัŽ, ะผะธั€!\n"; let s = "a \t\22\00bb\'bc\\de \74xx\\x\"\u{000_234_42}\n"; +let emojis = "๐Ÿ™ˆ๐ŸŽธ๐Ÿ˜‹";