diff --git a/src/lexer.mll b/src/lexer.mll index 2f5836c36a5..b8aa9bbb8f5 100644 --- a/src/lexer.mll +++ b/src/lexer.mll @@ -22,23 +22,19 @@ let error_nest start lexbuf msg = lexbuf.Lexing.lex_start_p <- start; error lexbuf msg -let classify_utf8_leader lexbuf = Int32.(function - | ch when logand ch (lognot 0b01111111l) = 0b00000000l -> 0 - | ch when logand ch (lognot 0b00011111l) = 0b11000000l -> 1 - | ch when logand ch (lognot 0b00001111l) = 0b11100000l -> 2 - | ch when logand ch (lognot 0b00000111l) = 0b11110000l -> 3 - | ch -> error lexbuf (Printf.sprintf "invalid utf-8 character: 0x%x" (Int32.to_int ch))) - -let utf8_decoder l lexbuf s i = - let leading = classify_utf8_leader lexbuf (Int32.of_int (Char.code s.[!i])) - in if leading = 0 then Char.code s.[!i] - else match Utf8.decode (String.sub s !i (1 + leading)) with - | [code] -> i := !i + leading; code - | _ -> error lexbuf "can not interpret unicode character" - -let unicode lexbuf s i decoder = + +let utf8 lexbuf s i = + let len = + if s.[!i] < '\x80' then 0 else + if s.[!i] < '\xe0' then 1 else + if s.[!i] < '\xf0' then 2 else 3 + in + i := !i + len; + List.hd (Utf8.decode (String.sub s (!i - len) (1 + len))) + +let unicode lexbuf s i = let u = - if s.[!i] <> '\\' then decoder lexbuf s i else + if s.[!i] <> '\\' then utf8 lexbuf s i else match (incr i; s.[!i]) with | 'n' -> Char.code '\n' | 'r' -> Char.code '\r' @@ -58,17 +54,14 @@ let unicode lexbuf s i decoder = int_of_string ("0x" ^ String.make 1 h ^ String.make 1 s.[!i]) in incr i; u -let char lexbuf s = unicode lexbuf s (ref 1) (fun _ _ _ -> - match Utf8.decode s with - | [39; code; 39] -> code (* surrounded by apostrophes *) - | _ -> error lexbuf "can not interpret unicode character") +let char lexbuf s = + unicode lexbuf s (ref 1) let text lexbuf s = - let l = String.length s in - let b = Buffer.create l in + let b = Buffer.create (String.length s) in let i = ref 1 in - while !i < l - 1 do - let bs = Utf8.encode [unicode lexbuf s i (utf8_decoder l)] in + while !i < String.length s - 1 do + let bs = Utf8.encode [unicode lexbuf s i] in Buffer.add_substring b bs 0 (String.length bs) done; Buffer.contents b