Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 17 additions & 24 deletions src/lexer.mll
Original file line number Diff line number Diff line change
Expand Up @@ -22,23 +22,19 @@ let error_nest start lexbuf msg =
lexbuf.Lexing.lex_start_p <- start;
error lexbuf msg

let classify_utf8_leader lexbuf = Int32.(function
| ch when logand ch (lognot 0b01111111l) = 0b00000000l -> 0
| ch when logand ch (lognot 0b00011111l) = 0b11000000l -> 1
| ch when logand ch (lognot 0b00001111l) = 0b11100000l -> 2
| ch when logand ch (lognot 0b00000111l) = 0b11110000l -> 3
| ch -> error lexbuf (Printf.sprintf "invalid utf-8 character: 0x%x" (Int32.to_int ch)))

let utf8_decoder l lexbuf s i =
let leading = classify_utf8_leader lexbuf (Int32.of_int (Char.code s.[!i]))
in if leading = 0 then Char.code s.[!i]
else match Utf8.decode (String.sub s !i (1 + leading)) with
| [code] -> i := !i + leading; code
| _ -> error lexbuf "can not interpret unicode character"

let unicode lexbuf s i decoder =

let utf8 lexbuf s i =
let len =
if s.[!i] < '\x80' then 0 else
if s.[!i] < '\xe0' then 1 else
if s.[!i] < '\xf0' then 2 else 3
in
i := !i + len;
List.hd (Utf8.decode (String.sub s (!i - len) (1 + len)))

let unicode lexbuf s i =
let u =
if s.[!i] <> '\\' then decoder lexbuf s i else
if s.[!i] <> '\\' then utf8 lexbuf s i else
match (incr i; s.[!i]) with
| 'n' -> Char.code '\n'
| 'r' -> Char.code '\r'
Expand All @@ -58,17 +54,14 @@ let unicode lexbuf s i decoder =
int_of_string ("0x" ^ String.make 1 h ^ String.make 1 s.[!i])
in incr i; u

let char lexbuf s = unicode lexbuf s (ref 1) (fun _ _ _ ->
match Utf8.decode s with
| [39; code; 39] -> code (* surrounded by apostrophes *)
| _ -> error lexbuf "can not interpret unicode character")
let char lexbuf s =
unicode lexbuf s (ref 1)

let text lexbuf s =
let l = String.length s in
let b = Buffer.create l in
let b = Buffer.create (String.length s) in
let i = ref 1 in
while !i < l - 1 do
let bs = Utf8.encode [unicode lexbuf s i (utf8_decoder l)] in
while !i < String.length s - 1 do
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you really want to get the string's length in every iteration? Or is there some guaranteed CVE at work here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

String length is a constant time operation in every language that isn't Haskell. ;)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you know C? ;-)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe not considered a language worth considering.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or not a language that has an actual string type. :)

let bs = Utf8.encode [unicode lexbuf s i] in
Buffer.add_substring b bs 0 (String.length bs)
done;
Buffer.contents b
Expand Down