diff --git a/doc/changes/fixed/12844.md b/doc/changes/fixed/12844.md new file mode 100644 index 00000000000..056d09f16a7 --- /dev/null +++ b/doc/changes/fixed/12844.md @@ -0,0 +1,7 @@ +- Improve error messages for invalid version formats containing non-ASCII + characters. Previously, non-ASCII characters in version strings (e.g., `(lang + dune è)` or `(using menhir π3.14)`) would fail with a generic "Invalid file" + error. Now they display a clear message: "Invalid atom: contains non-ASCII + character(s). Atoms must only contain ASCII characters." The fix is + implemented at the lexer level, providing consistent error handling across all + s-expression parsing. (#12844, fixes #12836, @benodiwal) diff --git a/src/dune_sexp/atom.ml b/src/dune_sexp/atom.ml index aae42b69b7a..8b2c45a9e59 100644 --- a/src/dune_sexp/atom.ml +++ b/src/dune_sexp/atom.ml @@ -9,6 +9,11 @@ let to_dyn (A s) = let equal (A a) (A b) = String.equal a b +let non_ascii_error_message = + "Invalid atom: contains non-ASCII character(s). Atoms must only contain ASCII \ + characters." +;; + let is_valid = let rec loop s i len = i = len diff --git a/src/dune_sexp/atom.mli b/src/dune_sexp/atom.mli index 982a4f224a6..19547746f0e 100644 --- a/src/dune_sexp/atom.mli +++ b/src/dune_sexp/atom.mli @@ -1,6 +1,10 @@ type t = private A of string [@@unboxed] val equal : t -> t -> bool + +(** Error message for atoms containing non-ASCII characters *) +val non_ascii_error_message : string + val is_valid : string -> bool val of_string : string -> t val to_string : t -> string diff --git a/src/dune_sexp/lexer.mll b/src/dune_sexp/lexer.mll index 0bc4db1909e..d2c91ea6088 100644 --- a/src/dune_sexp/lexer.mll +++ b/src/dune_sexp/lexer.mll @@ -182,6 +182,8 @@ and atom acc start = parse { atom ((template_variable lexbuf) :: acc) start lexbuf } | "%" { atom (Template.add_text acc "%") start lexbuf } + | ['\128'-'\255'] + { error lexbuf Atom.non_ascii_error_message } | "" { Template.token acc ~quoted:false ~start lexbuf } diff --git a/src/dune_sexp/versioned_file.ml b/src/dune_sexp/versioned_file.ml index 29a1da42fd1..8505da59c27 100644 --- a/src/dune_sexp/versioned_file.ml +++ b/src/dune_sexp/versioned_file.ml @@ -61,17 +61,14 @@ struct match Atom.parse ver with | Some atom -> atom | None -> - let hints = - match Table.find langs name with - | None -> [] - | Some lang -> - let latest = Syntax.greatest_supported_version_exn lang.syntax in - [ Pp.textf "lang dune %s" (Syntax.Version.to_string latest) ] - in - User_error.raise - ~loc:ver_loc - ~hints - [ Pp.text "Invalid version. Version must be two numbers separated by a dot." ] + let has_non_ascii = String.exists ver ~f:(fun c -> Char.code c >= 128) in + if has_non_ascii + then User_error.raise ~loc:ver_loc [ Pp.text Atom.non_ascii_error_message ] + else + User_error.raise + ~loc:ver_loc + [ Pp.text "Invalid version. Version must be two numbers separated by a dot." + ] in let dune_lang_ver = Decoder.parse Syntax.Version.decode Univ_map.empty (Atom (ver_loc, ver_atom)) diff --git a/test/blackbox-tests/test-cases/extensions-invalid-version.t b/test/blackbox-tests/test-cases/extensions-invalid-version.t index 75816f47c9e..f7fff45a314 100644 --- a/test/blackbox-tests/test-cases/extensions-invalid-version.t +++ b/test/blackbox-tests/test-cases/extensions-invalid-version.t @@ -11,6 +11,11 @@ such situations provide a clear error. Invalid version number: +CR-someday benodiwal: Consider adding context-specific hints for ASCII invalid +versions (e.g., "Hint: using menhir 3.0"). The current lexer-level approach +trades hints for simplicity and robustness - it works uniformly across all +atoms without needing validation in every decoder. + $ test_invalid_version "Ali" File "dune-project", line 2, characters 14-17: 2 | (using menhir Ali) @@ -20,47 +25,111 @@ Invalid version number: Test with various non-ASCII characters: -CR-someday benodiwal: Non-ASCII characters in extension versions fail at the -s-expression parsing level, showing a generic "Invalid dune-project file" error -instead of the specific version validation error with hints. This would require -changes to the s-expression parser to handle properly. - $ test_invalid_version "è" - File "dune-project", line 2, characters 14-14: + File "dune-project", line 2, characters 14-15: 2 | (using menhir è) - - Error: Invalid dune-project file + ^ + Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain + ASCII characters. [1] $ test_invalid_version "π3.14" - File "dune-project", line 2, characters 14-14: + File "dune-project", line 2, characters 14-15: 2 | (using menhir π3.14) - - Error: Invalid dune-project file + ^ + Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain + ASCII characters. [1] $ test_invalid_version "α" - File "dune-project", line 2, characters 14-14: + File "dune-project", line 2, characters 14-15: 2 | (using menhir α) - - Error: Invalid dune-project file + ^ + Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain + ASCII characters. [1] $ test_invalid_version "😀" - File "dune-project", line 2, characters 14-14: + File "dune-project", line 2, characters 14-15: 2 | (using menhir 😀) - - Error: Invalid dune-project file + ^ + Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain + ASCII characters. [1] - $ test_invalid_version "中3.16文" - File "dune-project", line 2, characters 14-14: + File "dune-project", line 2, characters 14-15: 2 | (using menhir 中3.16文) - - Error: Invalid dune-project file + ^ + Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain + ASCII characters. + [1] + +Test with multiple extensions: + + $ cat > dune-project < (lang dune 3.21) + > (using menhir 3.0) + > (using melange 0.1) + > EOF + $ dune build + +Multiple extensions with one invalid (ASCII): + + $ cat > dune-project < (lang dune 3.21) + > (using menhir 3.0) + > (using melange invalid) + > EOF + $ dune build + File "dune-project", line 3, characters 15-22: + 3 | (using melange invalid) + ^^^^^^^ + Error: Invalid version. Version must be two numbers separated by a dot. [1] +Multiple extensions with one invalid (non-ASCII): + + $ cat > dune-project < (lang dune 3.21) + > (using menhir 3.0) + > (using melange 😀) + > EOF + $ dune build + File "dune-project", line 3, characters 15-16: + 3 | (using melange 😀) + ^ + Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain + ASCII characters. + [1] + +Multiple extensions with first one invalid: + + $ cat > dune-project < (lang dune 3.21) + > (using menhir bad) + > (using melange 0.1) + > EOF + $ dune build + File "dune-project", line 2, characters 14-17: + 2 | (using menhir bad) + ^^^ + Error: Invalid version. Version must be two numbers separated by a dot. + [1] + +Multiple extensions both invalid: + + $ cat > dune-project < (lang dune 3.21) + > (using menhir abc) + > (using melange xyz) + > EOF + $ dune build + File "dune-project", line 3, characters 15-18: + 3 | (using melange xyz) + ^^^ + Error: Invalid version. Version must be two numbers separated by a dot. + [1] diff --git a/test/blackbox-tests/test-cases/formatting/non-ascii-characters.t b/test/blackbox-tests/test-cases/formatting/non-ascii-characters.t index 7b444ba7d75..b1dcaab44e2 100644 --- a/test/blackbox-tests/test-cases/formatting/non-ascii-characters.t +++ b/test/blackbox-tests/test-cases/formatting/non-ascii-characters.t @@ -25,8 +25,9 @@ Utf8 characters are handled for now, this is also related to the issue #9728 $ dune format-dune-file < (Écho "hello") > EOF - File "", line 1, characters 1-1: - Error: Invalid . file + File "", line 1, characters 1-2: + Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain + ASCII characters. [1] $ bash -c "printf '(echo \"%b\")' '\xc0'"| dune format-dune-file diff --git a/test/blackbox-tests/test-cases/lang-invalid-version.t b/test/blackbox-tests/test-cases/lang-invalid-version.t index bc282f752b9..d53a20c8e7c 100644 --- a/test/blackbox-tests/test-cases/lang-invalid-version.t +++ b/test/blackbox-tests/test-cases/lang-invalid-version.t @@ -10,6 +10,11 @@ such situations provide a clear error. Invalid version number: +CR-someday benodiwal: Consider adding context-specific hints for ASCII invalid +versions (e.g., "Hint: lang dune 3.21"). The current lexer-level approach +trades hints for simplicity and robustness - it works uniformly across all +atoms without needing validation in every decoder. + $ test_invalid_version "Ali" File "dune-project", line 1, characters 11-14: 1 | (lang dune Ali) @@ -26,32 +31,32 @@ parenthesis. File "dune-project", line 1, characters 11-13: 1 | (lang dune è) ^^ - Error: Invalid version. Version must be two numbers separated by a dot. - Hint: lang dune 3.21 + Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain + ASCII characters. [1] $ test_invalid_version "π3.14" File "dune-project", line 1, characters 11-17: 1 | (lang dune π3.14) ^^^^^^ - Error: Invalid version. Version must be two numbers separated by a dot. - Hint: lang dune 3.21 + Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain + ASCII characters. [1] $ test_invalid_version "α" File "dune-project", line 1, characters 11-13: 1 | (lang dune α) ^^ - Error: Invalid version. Version must be two numbers separated by a dot. - Hint: lang dune 3.21 + Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain + ASCII characters. [1] $ test_invalid_version "😀" File "dune-project", line 1, characters 11-15: 1 | (lang dune 😀) ^^^^ - Error: Invalid version. Version must be two numbers separated by a dot. - Hint: lang dune 3.21 + Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain + ASCII characters. [1] CR-someday benodiwal: Unicode string lengths are miscomputed in location @@ -61,6 +66,6 @@ excerpts for East Asian characters. File "dune-project", line 1, characters 11-21: 1 | (lang dune 中3.16文) ^^^^^^^^^^ - Error: Invalid version. Version must be two numbers separated by a dot. - Hint: lang dune 3.21 + Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain + ASCII characters. [1]