diff --git a/doc/changes/fixed/12869.md b/doc/changes/fixed/12869.md new file mode 100644 index 00000000000..cb27912341b --- /dev/null +++ b/doc/changes/fixed/12869.md @@ -0,0 +1,7 @@ +- Fix greedy version location in lang declarations. Previously, error locations for + invalid lang versions would span multiple bytes for multi-byte UTF-8 characters, + causing carets to appear misaligned and seemingly include the closing + parenthesis. Now, error locations for ASCII strings show the full length (e.g., + "Ali" shows `^^^`), while non-ASCII strings show only the first byte (e.g., "è" + shows `^`) to avoid multi-byte character display issues. (#12869, fixes #12806, + @benodiwal) diff --git a/src/dune_sexp/versioned_file_first_line.mll b/src/dune_sexp/versioned_file_first_line.mll index e5459b3d079..2e5bac4a96c 100644 --- a/src/dune_sexp/versioned_file_first_line.mll +++ b/src/dune_sexp/versioned_file_first_line.mll @@ -9,11 +9,18 @@ let invalid_lang_line start lexbuf = lexbuf.Lexing.lex_start_p <- start; User_error.raise ~loc:(Loc.of_lexbuf lexbuf) [ Pp.text "Invalid first line, expected: (lang )" ] + +let rec find_first_non_ascii s i = + if i >= String.length s then None + else if Char.code s.[i] >= 128 then Some i + else find_first_non_ascii s (i + 1) } let newline = '\r'? '\n' let blank = [' ' '\t'] -let atom_char = [^';' '(' ')' '"' '#' '|' '\000'-'\032'] +let ascii = [ '\000' - '\127' ] +let atom_char = [^ ';' '(' ')' '"' '\000'-'\032' '\127'-'\255'] +let atom_char_with_junk = [^';' '(' ')' '"' '#' '|' '\000'-'\032'] | [ '\127'-'\255' ] rule lex_opt = parse | '(' blank* "lang" @@ -34,6 +41,22 @@ and atom start = parse | atom_char+ as s { (Loc.of_lexbuf lexbuf, s) } + | atom_char_with_junk+ as s + { let atom_start = Lexing.lexeme_start_p lexbuf in + (* CR-someday benodiwal: Consider using Uuseg for more accurate grapheme + cluster detection instead of the simple byte-based check. This would + better handle complex Unicode like emoji sequences, though terminal + rendering inconsistencies would still cause display issues. *) + let loc_start, loc_end = + match find_first_non_ascii s 0 with + | None -> assert false + | Some offset -> + let error_start = { atom_start with pos_cnum = atom_start.pos_cnum + offset } in + let error_end = { error_start with pos_cnum = error_start.pos_cnum + 1 } in + error_start, error_end + in + (Loc.create ~start:loc_start ~stop:loc_end, s) + } | _ | eof { to_eol lexbuf; invalid_lang_line start lexbuf diff --git a/test/blackbox-tests/test-cases/lang-invalid-version.t b/test/blackbox-tests/test-cases/lang-invalid-version.t index d53a20c8e7c..2bdd6af3b84 100644 --- a/test/blackbox-tests/test-cases/lang-invalid-version.t +++ b/test/blackbox-tests/test-cases/lang-invalid-version.t @@ -24,48 +24,78 @@ atoms without needing validation in every decoder. Test with various non-ASCII characters: -CR-someday benodiwal: The version_loc is greedy and captures the closing -parenthesis. - $ test_invalid_version "è" - File "dune-project", line 1, characters 11-13: + File "dune-project", line 1, characters 11-12: 1 | (lang dune è) - ^^ + ^ Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain ASCII characters. [1] $ test_invalid_version "π3.14" - File "dune-project", line 1, characters 11-17: + File "dune-project", line 1, characters 11-12: 1 | (lang dune π3.14) - ^^^^^^ + ^ Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain ASCII characters. [1] $ test_invalid_version "α" - File "dune-project", line 1, characters 11-13: + File "dune-project", line 1, characters 11-12: 1 | (lang dune α) - ^^ + ^ Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain ASCII characters. [1] $ test_invalid_version "😀" - File "dune-project", line 1, characters 11-15: + File "dune-project", line 1, characters 11-12: 1 | (lang dune 😀) - ^^^^ + ^ Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain ASCII characters. [1] -CR-someday benodiwal: Unicode string lengths are miscomputed in location -excerpts for East Asian characters. - $ test_invalid_version "中3.16文" - File "dune-project", line 1, characters 11-21: + File "dune-project", line 1, characters 11-12: 1 | (lang dune 中3.16文) - ^^^^^^^^^^ + ^ + Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain + ASCII characters. + [1] + +Test where non-ASCII character is not the first character: + + $ test_invalid_version "3.14è" + File "dune-project", line 1, characters 15-16: + 1 | (lang dune 3.14è) + ^ + Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain + ASCII characters. + [1] + + $ test_invalid_version "2è3" + File "dune-project", line 1, characters 12-13: + 1 | (lang dune 2è3) + ^ + Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain + ASCII characters. + [1] + + $ test_invalid_version "abc😀def" + File "dune-project", line 1, characters 14-15: + 1 | (lang dune abc😀def) + ^ + Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain + ASCII characters. + [1] + +Test with multiple non-ASCII characters (caret points to first one): + + $ test_invalid_version "aè😀" + File "dune-project", line 1, characters 12-13: + 1 | (lang dune aè😀) + ^ Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain ASCII characters. [1]