From b04a923a5a81aec27e5e8e0960ae348c7bbd8ccf Mon Sep 17 00:00:00 2001 From: Sachin Beniwal Date: Fri, 5 Dec 2025 19:24:19 +0530 Subject: [PATCH 1/5] fix: greedy version location in lang declarations Signed-off-by: Sachin Beniwal --- src/dune_sexp/versioned_file_first_line.mll | 10 ++++++- .../test-cases/lang-invalid-version.t | 26 +++++++------------ 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/src/dune_sexp/versioned_file_first_line.mll b/src/dune_sexp/versioned_file_first_line.mll index e5459b3d079..9a8574a236c 100644 --- a/src/dune_sexp/versioned_file_first_line.mll +++ b/src/dune_sexp/versioned_file_first_line.mll @@ -32,7 +32,15 @@ and atom start = parse { atom start lexbuf } | atom_char+ as s - { (Loc.of_lexbuf lexbuf, s) + { let atom_start = Lexing.lexeme_start_p lexbuf in + let has_non_ascii = String.exists s ~f:(fun c -> Char.code c >= 128) in + let length = if has_non_ascii then 1 else String.length s in + let atom_end = + { atom_start with + pos_cnum = atom_start.pos_cnum + length + } + in + (Loc.create ~start:atom_start ~stop:atom_end, s) } | _ | eof { to_eol lexbuf; diff --git a/test/blackbox-tests/test-cases/lang-invalid-version.t b/test/blackbox-tests/test-cases/lang-invalid-version.t index d53a20c8e7c..a17d88df7df 100644 --- a/test/blackbox-tests/test-cases/lang-invalid-version.t +++ b/test/blackbox-tests/test-cases/lang-invalid-version.t @@ -24,48 +24,42 @@ atoms without needing validation in every decoder. Test with various non-ASCII characters: -CR-someday benodiwal: The version_loc is greedy and captures the closing -parenthesis. - $ test_invalid_version "è" - File "dune-project", line 1, characters 11-13: + File "dune-project", line 1, characters 11-12: 1 | (lang dune è) - ^^ + ^ Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain ASCII characters. [1] $ test_invalid_version "π3.14" - File "dune-project", line 1, characters 11-17: + File "dune-project", line 1, characters 11-12: 1 | (lang dune π3.14) - ^^^^^^ + ^ Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain ASCII characters. [1] $ test_invalid_version "α" - File "dune-project", line 1, characters 11-13: + File "dune-project", line 1, characters 11-12: 1 | (lang dune α) - ^^ + ^ Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain ASCII characters. [1] $ test_invalid_version "😀" - File "dune-project", line 1, characters 11-15: + File "dune-project", line 1, characters 11-12: 1 | (lang dune 😀) - ^^^^ + ^ Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain ASCII characters. [1] -CR-someday benodiwal: Unicode string lengths are miscomputed in location -excerpts for East Asian characters. - $ test_invalid_version "中3.16文" - File "dune-project", line 1, characters 11-21: + File "dune-project", line 1, characters 11-12: 1 | (lang dune 中3.16文) - ^^^^^^^^^^ + ^ Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain ASCII characters. [1] From 7d7ed5a3d545be2d306f2cf697b2328dcd23225c Mon Sep 17 00:00:00 2001 From: Sachin Beniwal Date: Fri, 5 Dec 2025 19:35:10 +0530 Subject: [PATCH 2/5] chore: added entry for CHANGES.md Signed-off-by: Sachin Beniwal --- doc/changes/fixed/12869.md | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 doc/changes/fixed/12869.md diff --git a/doc/changes/fixed/12869.md b/doc/changes/fixed/12869.md new file mode 100644 index 00000000000..cb27912341b --- /dev/null +++ b/doc/changes/fixed/12869.md @@ -0,0 +1,7 @@ +- Fix greedy version location in lang declarations. Previously, error locations for + invalid lang versions would span multiple bytes for multi-byte UTF-8 characters, + causing carets to appear misaligned and seemingly include the closing + parenthesis. Now, error locations for ASCII strings show the full length (e.g., + "Ali" shows `^^^`), while non-ASCII strings show only the first byte (e.g., "è" + shows `^`) to avoid multi-byte character display issues. (#12869, fixes #12806, + @benodiwal) From 7a369057d8627e71d2c20f31ae44ef19fcd5193d Mon Sep 17 00:00:00 2001 From: Sachin Beniwal Date: Fri, 5 Dec 2025 21:01:09 +0530 Subject: [PATCH 3/5] chore: added CR someday Signed-off-by: Sachin Beniwal --- src/dune_sexp/versioned_file_first_line.mll | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/dune_sexp/versioned_file_first_line.mll b/src/dune_sexp/versioned_file_first_line.mll index 9a8574a236c..7ab2026be95 100644 --- a/src/dune_sexp/versioned_file_first_line.mll +++ b/src/dune_sexp/versioned_file_first_line.mll @@ -33,6 +33,10 @@ and atom start = parse } | atom_char+ as s { let atom_start = Lexing.lexeme_start_p lexbuf in + (* CR-someday benodiwal: Consider using Uuseg for more accurate grapheme + cluster detection instead of the simple byte-based check. This would + better handle complex Unicode like emoji sequences, though terminal + rendering inconsistencies would still cause display issues. *) let has_non_ascii = String.exists s ~f:(fun c -> Char.code c >= 128) in let length = if has_non_ascii then 1 else String.length s in let atom_end = From 55281c21025bafe85abfe5daa6a73fb6b031ac36 Mon Sep 17 00:00:00 2001 From: Sachin Beniwal Date: Fri, 5 Dec 2025 21:30:39 +0530 Subject: [PATCH 4/5] tests: added more robust tests Signed-off-by: Sachin Beniwal --- src/dune_sexp/versioned_file_first_line.mll | 22 ++++++++---- .../test-cases/lang-invalid-version.t | 36 +++++++++++++++++++ 2 files changed, 51 insertions(+), 7 deletions(-) diff --git a/src/dune_sexp/versioned_file_first_line.mll b/src/dune_sexp/versioned_file_first_line.mll index 7ab2026be95..7ba8dd9521e 100644 --- a/src/dune_sexp/versioned_file_first_line.mll +++ b/src/dune_sexp/versioned_file_first_line.mll @@ -37,14 +37,22 @@ and atom start = parse cluster detection instead of the simple byte-based check. This would better handle complex Unicode like emoji sequences, though terminal rendering inconsistencies would still cause display issues. *) - let has_non_ascii = String.exists s ~f:(fun c -> Char.code c >= 128) in - let length = if has_non_ascii then 1 else String.length s in - let atom_end = - { atom_start with - pos_cnum = atom_start.pos_cnum + length - } + let rec find_first_non_ascii i = + if i >= String.length s then None + else if Char.code s.[i] >= 128 then Some i + else find_first_non_ascii (i + 1) in - (Loc.create ~start:atom_start ~stop:atom_end, s) + let loc_start, loc_end = + match find_first_non_ascii 0 with + | Some offset -> + let error_start = { atom_start with pos_cnum = atom_start.pos_cnum + offset } in + let error_end = { error_start with pos_cnum = error_start.pos_cnum + 1 } in + error_start, error_end + | None -> + let atom_end = { atom_start with pos_cnum = atom_start.pos_cnum + String.length s } in + atom_start, atom_end + in + (Loc.create ~start:loc_start ~stop:loc_end, s) } | _ | eof { to_eol lexbuf; diff --git a/test/blackbox-tests/test-cases/lang-invalid-version.t b/test/blackbox-tests/test-cases/lang-invalid-version.t index a17d88df7df..2bdd6af3b84 100644 --- a/test/blackbox-tests/test-cases/lang-invalid-version.t +++ b/test/blackbox-tests/test-cases/lang-invalid-version.t @@ -63,3 +63,39 @@ Test with various non-ASCII characters: Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain ASCII characters. [1] + +Test where non-ASCII character is not the first character: + + $ test_invalid_version "3.14è" + File "dune-project", line 1, characters 15-16: + 1 | (lang dune 3.14è) + ^ + Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain + ASCII characters. + [1] + + $ test_invalid_version "2è3" + File "dune-project", line 1, characters 12-13: + 1 | (lang dune 2è3) + ^ + Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain + ASCII characters. + [1] + + $ test_invalid_version "abc😀def" + File "dune-project", line 1, characters 14-15: + 1 | (lang dune abc😀def) + ^ + Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain + ASCII characters. + [1] + +Test with multiple non-ASCII characters (caret points to first one): + + $ test_invalid_version "aè😀" + File "dune-project", line 1, characters 12-13: + 1 | (lang dune aè😀) + ^ + Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain + ASCII characters. + [1] From b319dc2af2948fb42cd0c53ad20ec6d21c22e7a6 Mon Sep 17 00:00:00 2001 From: Sachin Beniwal Date: Sat, 6 Dec 2025 03:30:34 +0530 Subject: [PATCH 5/5] fix: refactor Signed-off-by: Sachin Beniwal --- src/dune_sexp/versioned_file_first_line.mll | 23 ++++++++++++--------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/dune_sexp/versioned_file_first_line.mll b/src/dune_sexp/versioned_file_first_line.mll index 7ba8dd9521e..2e5bac4a96c 100644 --- a/src/dune_sexp/versioned_file_first_line.mll +++ b/src/dune_sexp/versioned_file_first_line.mll @@ -9,11 +9,18 @@ let invalid_lang_line start lexbuf = lexbuf.Lexing.lex_start_p <- start; User_error.raise ~loc:(Loc.of_lexbuf lexbuf) [ Pp.text "Invalid first line, expected: (lang )" ] + +let rec find_first_non_ascii s i = + if i >= String.length s then None + else if Char.code s.[i] >= 128 then Some i + else find_first_non_ascii s (i + 1) } let newline = '\r'? '\n' let blank = [' ' '\t'] -let atom_char = [^';' '(' ')' '"' '#' '|' '\000'-'\032'] +let ascii = [ '\000' - '\127' ] +let atom_char = [^ ';' '(' ')' '"' '\000'-'\032' '\127'-'\255'] +let atom_char_with_junk = [^';' '(' ')' '"' '#' '|' '\000'-'\032'] | [ '\127'-'\255' ] rule lex_opt = parse | '(' blank* "lang" @@ -32,25 +39,21 @@ and atom start = parse { atom start lexbuf } | atom_char+ as s + { (Loc.of_lexbuf lexbuf, s) + } + | atom_char_with_junk+ as s { let atom_start = Lexing.lexeme_start_p lexbuf in (* CR-someday benodiwal: Consider using Uuseg for more accurate grapheme cluster detection instead of the simple byte-based check. This would better handle complex Unicode like emoji sequences, though terminal rendering inconsistencies would still cause display issues. *) - let rec find_first_non_ascii i = - if i >= String.length s then None - else if Char.code s.[i] >= 128 then Some i - else find_first_non_ascii (i + 1) - in let loc_start, loc_end = - match find_first_non_ascii 0 with + match find_first_non_ascii s 0 with + | None -> assert false | Some offset -> let error_start = { atom_start with pos_cnum = atom_start.pos_cnum + offset } in let error_end = { error_start with pos_cnum = error_start.pos_cnum + 1 } in error_start, error_end - | None -> - let atom_end = { atom_start with pos_cnum = atom_start.pos_cnum + String.length s } in - atom_start, atom_end in (Loc.create ~start:loc_start ~stop:loc_end, s) }