From 1c632b1610f431b22ed6097b64af190d59914c1a Mon Sep 17 00:00:00 2001 From: Sachin Beniwal Date: Mon, 1 Dec 2025 23:57:21 +0530 Subject: [PATCH 01/14] feat: added logic for handling non ascii cases for invalid extensions versions Signed-off-by: Sachin Beniwal --- src/dune_lang/dune | 2 +- src/dune_lang/dune_project.ml | 44 +++++++++- src/dune_lang/using_declaration_parser.mll | 86 +++++++++++++++++++ .../test-cases/extensions-invalid-version.t | 41 +++++---- 4 files changed, 147 insertions(+), 26 deletions(-) create mode 100644 src/dune_lang/using_declaration_parser.mll diff --git a/src/dune_lang/dune b/src/dune_lang/dune index 5639fe8ba10..9ee614a2ebb 100644 --- a/src/dune_lang/dune +++ b/src/dune_lang/dune @@ -22,4 +22,4 @@ dune_section (re_export dune_sexp))) -(ocamllex dune_file_script) +(ocamllex dune_file_script using_declaration_parser) diff --git a/src/dune_lang/dune_project.ml b/src/dune_lang/dune_project.ml index e7653c646c4..b913e6629d9 100644 --- a/src/dune_lang/dune_project.ml +++ b/src/dune_lang/dune_project.ml @@ -288,6 +288,14 @@ module Extension = struct () ;; + let find name = Table.find extensions name + + let get_syntax = function + | Extension (Packed e) -> e.syntax + | Deleted_in _ -> + Code_error.raise "Extension.get_syntax called on deleted extension" [] + ;; + let instantiate ~dune_lang_ver ~loc ~parse_args (name_loc, name) (ver_loc, ver) = match Table.find extensions name with | None -> @@ -1078,13 +1086,41 @@ let parse ~dir ~(lang : Lang.Instance.t) ~file = } ;; +let validate_using_declarations contents fname = + (* Scan and validate (using ...) declarations before s-expression parsing *) + let using_decls = Using_declaration_parser.scan contents fname in + List.iter using_decls ~f:(fun { Using_declaration_parser.name; version } -> + let _name_loc, name_str = name in + let ver_loc, ver_str = version in + (* Check if version contains non-ASCII or is invalid format *) + let has_non_ascii = String.exists ver_str ~f:(fun c -> Char.code c > 127) in + let is_valid_format = + match Scanf.sscanf ver_str "%u.%u%!" (fun a b -> a, b) with + | _ -> true + | exception _ -> false + in + if has_non_ascii || not is_valid_format + then ( + let hints = + match Extension.find name_str with + | None -> [] + | Some ext -> + let latest = Syntax.greatest_supported_version_exn (Extension.get_syntax ext) in + [ Pp.textf "using %s %s" name_str (Syntax.Version.to_string latest) ] + in + User_error.raise + ~loc:ver_loc + ~hints + [ Pp.text "Invalid version. Version must be two numbers separated by a dot." ])) +;; + let load_dune_project ~read ~dir opam_packages : t Memo.t = let file = Path.Source.relative dir filename in let open Memo.O in - let* lexbuf = - let+ contents = read file in - Lexbuf.from_string contents ~fname:(Path.Source.to_string file) - in + let* contents = read file in + let fname = Path.Source.to_string file in + validate_using_declarations contents fname; + let lexbuf = Lexbuf.from_string contents ~fname in parse_contents lexbuf ~f:(fun lang -> parse ~dir ~lang ~file) opam_packages ;; diff --git a/src/dune_lang/using_declaration_parser.mll b/src/dune_lang/using_declaration_parser.mll new file mode 100644 index 00000000000..5342cab25ae --- /dev/null +++ b/src/dune_lang/using_declaration_parser.mll @@ -0,0 +1,86 @@ +{ +open Stdune + +type using_decl = + { name : Loc.t * string + ; version : Loc.t * string + } +} + +let newline = '\r'? '\n' +let blank = [' ' '\t'] +(* Accept any non-whitespace, non-paren characters for version - including non-ASCII *) +let version_char = [^ ' ' '\t' '\r' '\n' '(' ')'] + +rule scan_all acc = parse + | '(' blank* "using" + { let start_pos = Lexing.lexeme_start_p lexbuf in + match parse_using_decl start_pos lexbuf with + | Some decl -> scan_all (decl :: acc) lexbuf + | None -> scan_all acc lexbuf + } + | newline + { Lexing.new_line lexbuf; + scan_all acc lexbuf + } + | _ + { scan_all acc lexbuf + } + | eof + { acc + } + +and parse_using_decl start_pos = parse + | blank+ + { parse_using_decl start_pos lexbuf + } + | ([^ ' ' '\t' '\r' '\n' '(' ')']+ as name) blank+ + { let name_loc = Loc.of_lexbuf lexbuf in + match parse_version start_pos lexbuf with + | Some (ver_loc, ver) -> + Some { name = (name_loc, name); version = (ver_loc, ver) } + | None -> None + } + | _ + { skip_to_closing_paren lexbuf; + None + } + +and parse_version start_pos = parse + | blank+ + { parse_version start_pos lexbuf + } + | (version_char+ as ver) blank* ')' + { let ver_loc = Loc.create + ~start:lexbuf.Lexing.lex_start_p + ~stop:{ lexbuf.Lexing.lex_curr_p with + pos_cnum = lexbuf.Lexing.lex_curr_p.pos_cnum - 1 } in + Some (ver_loc, ver) + } + | _ + { skip_to_closing_paren lexbuf; + None + } + +and skip_to_closing_paren = parse + | ')' + { () + } + | newline + { Lexing.new_line lexbuf; + skip_to_closing_paren lexbuf + } + | _ + { skip_to_closing_paren lexbuf + } + | eof + { () + } + +{ + (* Entry point: scan file and return all (using ...) declarations *) + let scan contents fname = + let lexbuf = Lexing.from_string contents in + lexbuf.Lexing.lex_curr_p <- { lexbuf.Lexing.lex_curr_p with pos_fname = fname }; + scan_all [] lexbuf |> List.rev +} diff --git a/test/blackbox-tests/test-cases/extensions-invalid-version.t b/test/blackbox-tests/test-cases/extensions-invalid-version.t index 75816f47c9e..2e014ffd783 100644 --- a/test/blackbox-tests/test-cases/extensions-invalid-version.t +++ b/test/blackbox-tests/test-cases/extensions-invalid-version.t @@ -20,47 +20,46 @@ Invalid version number: Test with various non-ASCII characters: -CR-someday benodiwal: Non-ASCII characters in extension versions fail at the -s-expression parsing level, showing a generic "Invalid dune-project file" error -instead of the specific version validation error with hints. This would require -changes to the s-expression parser to handle properly. - $ test_invalid_version "è" - File "dune-project", line 2, characters 14-14: + File "dune-project", line 2, characters 14-16: 2 | (using menhir è) - - Error: Invalid dune-project file + ^^ + Error: Invalid version. Version must be two numbers separated by a dot. + Hint: using menhir 3.0 [1] $ test_invalid_version "π3.14" - File "dune-project", line 2, characters 14-14: + File "dune-project", line 2, characters 14-20: 2 | (using menhir π3.14) - - Error: Invalid dune-project file + ^^^^^^ + Error: Invalid version. Version must be two numbers separated by a dot. + Hint: using menhir 3.0 [1] $ test_invalid_version "α" - File "dune-project", line 2, characters 14-14: + File "dune-project", line 2, characters 14-16: 2 | (using menhir α) - - Error: Invalid dune-project file + ^^ + Error: Invalid version. Version must be two numbers separated by a dot. + Hint: using menhir 3.0 [1] $ test_invalid_version "😀" - File "dune-project", line 2, characters 14-14: + File "dune-project", line 2, characters 14-18: 2 | (using menhir 😀) - - Error: Invalid dune-project file + ^^^^ + Error: Invalid version. Version must be two numbers separated by a dot. + Hint: using menhir 3.0 [1] $ test_invalid_version "中3.16文" - File "dune-project", line 2, characters 14-14: + File "dune-project", line 2, characters 14-24: 2 | (using menhir 中3.16文) - - Error: Invalid dune-project file + ^^^^^^^^^^ + Error: Invalid version. Version must be two numbers separated by a dot. + Hint: using menhir 3.0 [1] - From 2ccf121c3dce4a3363c0bc76332002f973764b38 Mon Sep 17 00:00:00 2001 From: Sachin Beniwal Date: Tue, 2 Dec 2025 13:51:20 +0530 Subject: [PATCH 02/14] fix: valid format logic Signed-off-by: Sachin Beniwal --- src/dune_lang/dune_project.ml | 8 +++++--- .../test-cases/extensions-invalid-version.t | 1 + 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/dune_lang/dune_project.ml b/src/dune_lang/dune_project.ml index b913e6629d9..0b28a8f4dde 100644 --- a/src/dune_lang/dune_project.ml +++ b/src/dune_lang/dune_project.ml @@ -1095,9 +1095,11 @@ let validate_using_declarations contents fname = (* Check if version contains non-ASCII or is invalid format *) let has_non_ascii = String.exists ver_str ~f:(fun c -> Char.code c > 127) in let is_valid_format = - match Scanf.sscanf ver_str "%u.%u%!" (fun a b -> a, b) with - | _ -> true - | exception _ -> false + (* Use the same validation logic as Syntax.Version.decode *) + match Scanf.sscanf ver_str "%u.%u%s" (fun a b s -> (a, b), s) with + | Ok (_, "") -> true + | Ok (_, _) -> true (* has trailing chars, but base format is valid *) + | Error () -> false in if has_non_ascii || not is_valid_format then ( diff --git a/test/blackbox-tests/test-cases/extensions-invalid-version.t b/test/blackbox-tests/test-cases/extensions-invalid-version.t index 2e014ffd783..a3dec25d801 100644 --- a/test/blackbox-tests/test-cases/extensions-invalid-version.t +++ b/test/blackbox-tests/test-cases/extensions-invalid-version.t @@ -16,6 +16,7 @@ Invalid version number: 2 | (using menhir Ali) ^^^ Error: Invalid version. Version must be two numbers separated by a dot. + Hint: using menhir 3.0 [1] Test with various non-ASCII characters: From 025b55ab452745461df61f176d24eaaec91a6384 Mon Sep 17 00:00:00 2001 From: Sachin Beniwal Date: Tue, 2 Dec 2025 14:32:17 +0530 Subject: [PATCH 03/14] fix: refactored code Signed-off-by: Sachin Beniwal --- src/dune_lang/dune_project.ml | 15 +++------------ src/dune_lang/using_declaration.ml | 26 ++++++++++++++++++++++++++ src/dune_lang/using_declaration.mli | 18 ++++++++++++++++++ 3 files changed, 47 insertions(+), 12 deletions(-) create mode 100644 src/dune_lang/using_declaration.ml create mode 100644 src/dune_lang/using_declaration.mli diff --git a/src/dune_lang/dune_project.ml b/src/dune_lang/dune_project.ml index 0b28a8f4dde..0f5c6b60680 100644 --- a/src/dune_lang/dune_project.ml +++ b/src/dune_lang/dune_project.ml @@ -1088,20 +1088,11 @@ let parse ~dir ~(lang : Lang.Instance.t) ~file = let validate_using_declarations contents fname = (* Scan and validate (using ...) declarations before s-expression parsing *) - let using_decls = Using_declaration_parser.scan contents fname in - List.iter using_decls ~f:(fun { Using_declaration_parser.name; version } -> + let using_decls = Using_declaration.scan contents fname in + List.iter using_decls ~f:(fun { Using_declaration.name; version } -> let _name_loc, name_str = name in let ver_loc, ver_str = version in - (* Check if version contains non-ASCII or is invalid format *) - let has_non_ascii = String.exists ver_str ~f:(fun c -> Char.code c > 127) in - let is_valid_format = - (* Use the same validation logic as Syntax.Version.decode *) - match Scanf.sscanf ver_str "%u.%u%s" (fun a b s -> (a, b), s) with - | Ok (_, "") -> true - | Ok (_, _) -> true (* has trailing chars, but base format is valid *) - | Error () -> false - in - if has_non_ascii || not is_valid_format + if Using_declaration.is_invalid_version ver_str then ( let hints = match Extension.find name_str with diff --git a/src/dune_lang/using_declaration.ml b/src/dune_lang/using_declaration.ml new file mode 100644 index 00000000000..d9d89682298 --- /dev/null +++ b/src/dune_lang/using_declaration.ml @@ -0,0 +1,26 @@ +open Import + +(* Re-export the type from the parser *) +type using_decl = Using_declaration_parser.using_decl = + { name : Loc.t * string + ; version : Loc.t * string + } + +(* Scan file contents for (using ...) declarations *) +let scan contents fname = Using_declaration_parser.scan contents fname + +(* Check if a string contains non-ASCII characters *) +let has_non_ascii s = String.exists s ~f:(fun c -> Char.code c > 127) + +(* Check if a version string has valid format (X.Y optionally followed by extra chars) *) +let is_valid_version_format ver_str = + match Scanf.sscanf ver_str "%u.%u%s" (fun a b s -> (a, b), s) with + | Ok (_, "") -> true + | Ok (_, _) -> true (* has trailing chars, but base format is valid *) + | Error () -> false +;; + +(* Validate a version string - returns true if invalid (needs error) *) +let is_invalid_version ver_str = + has_non_ascii ver_str || not (is_valid_version_format ver_str) +;; diff --git a/src/dune_lang/using_declaration.mli b/src/dune_lang/using_declaration.mli new file mode 100644 index 00000000000..9be2110330b --- /dev/null +++ b/src/dune_lang/using_declaration.mli @@ -0,0 +1,18 @@ +open Stdune + +type using_decl = + { name : Loc.t * string + ; version : Loc.t * string + } + +(** Scan file contents for (using ...) declarations *) +val scan : string -> string -> using_decl list + +(** Check if a string contains non-ASCII characters *) +val has_non_ascii : string -> bool + +(** Check if a version string has valid format (X.Y optionally followed by extra chars) *) +val is_valid_version_format : string -> bool + +(** Validate a version string - returns true if invalid (contains non-ASCII or wrong format) *) +val is_invalid_version : string -> bool From 2c6da6c982122f4002726aaecc40e4a80ab4748b Mon Sep 17 00:00:00 2001 From: Sachin Beniwal Date: Tue, 2 Dec 2025 14:35:59 +0530 Subject: [PATCH 04/14] tests: added tests for multiple extensions cases Signed-off-by: Sachin Beniwal --- .../test-cases/extensions-invalid-version.t | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/test/blackbox-tests/test-cases/extensions-invalid-version.t b/test/blackbox-tests/test-cases/extensions-invalid-version.t index a3dec25d801..71be8818008 100644 --- a/test/blackbox-tests/test-cases/extensions-invalid-version.t +++ b/test/blackbox-tests/test-cases/extensions-invalid-version.t @@ -64,3 +64,72 @@ Test with various non-ASCII characters: Error: Invalid version. Version must be two numbers separated by a dot. Hint: using menhir 3.0 [1] + +Test with multiple extensions: + + $ cat > dune-project < (lang dune 3.21) + > (using menhir 3.0) + > (using melange 0.1) + > EOF + $ dune build + +Multiple extensions with one invalid (ASCII): + + $ cat > dune-project < (lang dune 3.21) + > (using menhir 3.0) + > (using melange invalid) + > EOF + $ dune build + File "dune-project", line 3, characters 15-22: + 3 | (using melange invalid) + ^^^^^^^ + Error: Invalid version. Version must be two numbers separated by a dot. + Hint: using melange 1.0 + [1] + +Multiple extensions with one invalid (non-ASCII): + + $ cat > dune-project < (lang dune 3.21) + > (using menhir 3.0) + > (using melange 😀) + > EOF + $ dune build + File "dune-project", line 3, characters 15-19: + 3 | (using melange 😀) + ^^^^ + Error: Invalid version. Version must be two numbers separated by a dot. + Hint: using melange 1.0 + [1] + +Multiple extensions with first one invalid: + + $ cat > dune-project < (lang dune 3.21) + > (using menhir bad) + > (using melange 0.1) + > EOF + $ dune build + File "dune-project", line 2, characters 14-17: + 2 | (using menhir bad) + ^^^ + Error: Invalid version. Version must be two numbers separated by a dot. + Hint: using menhir 3.0 + [1] + +Multiple extensions both invalid: + + $ cat > dune-project < (lang dune 3.21) + > (using menhir abc) + > (using melange xyz) + > EOF + $ dune build + File "dune-project", line 2, characters 14-17: + 2 | (using menhir abc) + ^^^ + Error: Invalid version. Version must be two numbers separated by a dot. + Hint: using menhir 3.0 + [1] From 8376b405d8d43f0cbc3a32d3f49e66898741f95a Mon Sep 17 00:00:00 2001 From: Sachin Beniwal Date: Tue, 2 Dec 2025 14:38:09 +0530 Subject: [PATCH 05/14] chore: CR someday Signed-off-by: Sachin Beniwal --- test/blackbox-tests/test-cases/extensions-invalid-version.t | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/blackbox-tests/test-cases/extensions-invalid-version.t b/test/blackbox-tests/test-cases/extensions-invalid-version.t index 71be8818008..69e43d00ace 100644 --- a/test/blackbox-tests/test-cases/extensions-invalid-version.t +++ b/test/blackbox-tests/test-cases/extensions-invalid-version.t @@ -21,6 +21,9 @@ Invalid version number: Test with various non-ASCII characters: +CR-someday benodiwal: The version_loc is greedy and captures the closing +parenthesis. + $ test_invalid_version "è" File "dune-project", line 2, characters 14-16: 2 | (using menhir è) @@ -56,6 +59,8 @@ Test with various non-ASCII characters: Hint: using menhir 3.0 [1] +CR-someday benodiwal: Unicode string lengths are miscomputed in location +excerpts for East Asian characters. $ test_invalid_version "中3.16文" File "dune-project", line 2, characters 14-24: From 2b635f1a37e42a1983e06262aef3e19dc331879e Mon Sep 17 00:00:00 2001 From: Sachin Beniwal Date: Wed, 3 Dec 2025 03:02:22 +0530 Subject: [PATCH 06/14] fix: reverted changes, kept the test cases Signed-off-by: Sachin Beniwal --- src/dune_lang/dune | 2 +- src/dune_lang/dune_project.ml | 37 +------- src/dune_lang/using_declaration.ml | 26 ------ src/dune_lang/using_declaration.mli | 18 ---- src/dune_lang/using_declaration_parser.mll | 86 ------------------- .../test-cases/extensions-invalid-version.t | 52 +++++------ 6 files changed, 26 insertions(+), 195 deletions(-) delete mode 100644 src/dune_lang/using_declaration.ml delete mode 100644 src/dune_lang/using_declaration.mli delete mode 100644 src/dune_lang/using_declaration_parser.mll diff --git a/src/dune_lang/dune b/src/dune_lang/dune index 9ee614a2ebb..5639fe8ba10 100644 --- a/src/dune_lang/dune +++ b/src/dune_lang/dune @@ -22,4 +22,4 @@ dune_section (re_export dune_sexp))) -(ocamllex dune_file_script using_declaration_parser) +(ocamllex dune_file_script) diff --git a/src/dune_lang/dune_project.ml b/src/dune_lang/dune_project.ml index 0f5c6b60680..e7653c646c4 100644 --- a/src/dune_lang/dune_project.ml +++ b/src/dune_lang/dune_project.ml @@ -288,14 +288,6 @@ module Extension = struct () ;; - let find name = Table.find extensions name - - let get_syntax = function - | Extension (Packed e) -> e.syntax - | Deleted_in _ -> - Code_error.raise "Extension.get_syntax called on deleted extension" [] - ;; - let instantiate ~dune_lang_ver ~loc ~parse_args (name_loc, name) (ver_loc, ver) = match Table.find extensions name with | None -> @@ -1086,34 +1078,13 @@ let parse ~dir ~(lang : Lang.Instance.t) ~file = } ;; -let validate_using_declarations contents fname = - (* Scan and validate (using ...) declarations before s-expression parsing *) - let using_decls = Using_declaration.scan contents fname in - List.iter using_decls ~f:(fun { Using_declaration.name; version } -> - let _name_loc, name_str = name in - let ver_loc, ver_str = version in - if Using_declaration.is_invalid_version ver_str - then ( - let hints = - match Extension.find name_str with - | None -> [] - | Some ext -> - let latest = Syntax.greatest_supported_version_exn (Extension.get_syntax ext) in - [ Pp.textf "using %s %s" name_str (Syntax.Version.to_string latest) ] - in - User_error.raise - ~loc:ver_loc - ~hints - [ Pp.text "Invalid version. Version must be two numbers separated by a dot." ])) -;; - let load_dune_project ~read ~dir opam_packages : t Memo.t = let file = Path.Source.relative dir filename in let open Memo.O in - let* contents = read file in - let fname = Path.Source.to_string file in - validate_using_declarations contents fname; - let lexbuf = Lexbuf.from_string contents ~fname in + let* lexbuf = + let+ contents = read file in + Lexbuf.from_string contents ~fname:(Path.Source.to_string file) + in parse_contents lexbuf ~f:(fun lang -> parse ~dir ~lang ~file) opam_packages ;; diff --git a/src/dune_lang/using_declaration.ml b/src/dune_lang/using_declaration.ml deleted file mode 100644 index d9d89682298..00000000000 --- a/src/dune_lang/using_declaration.ml +++ /dev/null @@ -1,26 +0,0 @@ -open Import - -(* Re-export the type from the parser *) -type using_decl = Using_declaration_parser.using_decl = - { name : Loc.t * string - ; version : Loc.t * string - } - -(* Scan file contents for (using ...) declarations *) -let scan contents fname = Using_declaration_parser.scan contents fname - -(* Check if a string contains non-ASCII characters *) -let has_non_ascii s = String.exists s ~f:(fun c -> Char.code c > 127) - -(* Check if a version string has valid format (X.Y optionally followed by extra chars) *) -let is_valid_version_format ver_str = - match Scanf.sscanf ver_str "%u.%u%s" (fun a b s -> (a, b), s) with - | Ok (_, "") -> true - | Ok (_, _) -> true (* has trailing chars, but base format is valid *) - | Error () -> false -;; - -(* Validate a version string - returns true if invalid (needs error) *) -let is_invalid_version ver_str = - has_non_ascii ver_str || not (is_valid_version_format ver_str) -;; diff --git a/src/dune_lang/using_declaration.mli b/src/dune_lang/using_declaration.mli deleted file mode 100644 index 9be2110330b..00000000000 --- a/src/dune_lang/using_declaration.mli +++ /dev/null @@ -1,18 +0,0 @@ -open Stdune - -type using_decl = - { name : Loc.t * string - ; version : Loc.t * string - } - -(** Scan file contents for (using ...) declarations *) -val scan : string -> string -> using_decl list - -(** Check if a string contains non-ASCII characters *) -val has_non_ascii : string -> bool - -(** Check if a version string has valid format (X.Y optionally followed by extra chars) *) -val is_valid_version_format : string -> bool - -(** Validate a version string - returns true if invalid (contains non-ASCII or wrong format) *) -val is_invalid_version : string -> bool diff --git a/src/dune_lang/using_declaration_parser.mll b/src/dune_lang/using_declaration_parser.mll deleted file mode 100644 index 5342cab25ae..00000000000 --- a/src/dune_lang/using_declaration_parser.mll +++ /dev/null @@ -1,86 +0,0 @@ -{ -open Stdune - -type using_decl = - { name : Loc.t * string - ; version : Loc.t * string - } -} - -let newline = '\r'? '\n' -let blank = [' ' '\t'] -(* Accept any non-whitespace, non-paren characters for version - including non-ASCII *) -let version_char = [^ ' ' '\t' '\r' '\n' '(' ')'] - -rule scan_all acc = parse - | '(' blank* "using" - { let start_pos = Lexing.lexeme_start_p lexbuf in - match parse_using_decl start_pos lexbuf with - | Some decl -> scan_all (decl :: acc) lexbuf - | None -> scan_all acc lexbuf - } - | newline - { Lexing.new_line lexbuf; - scan_all acc lexbuf - } - | _ - { scan_all acc lexbuf - } - | eof - { acc - } - -and parse_using_decl start_pos = parse - | blank+ - { parse_using_decl start_pos lexbuf - } - | ([^ ' ' '\t' '\r' '\n' '(' ')']+ as name) blank+ - { let name_loc = Loc.of_lexbuf lexbuf in - match parse_version start_pos lexbuf with - | Some (ver_loc, ver) -> - Some { name = (name_loc, name); version = (ver_loc, ver) } - | None -> None - } - | _ - { skip_to_closing_paren lexbuf; - None - } - -and parse_version start_pos = parse - | blank+ - { parse_version start_pos lexbuf - } - | (version_char+ as ver) blank* ')' - { let ver_loc = Loc.create - ~start:lexbuf.Lexing.lex_start_p - ~stop:{ lexbuf.Lexing.lex_curr_p with - pos_cnum = lexbuf.Lexing.lex_curr_p.pos_cnum - 1 } in - Some (ver_loc, ver) - } - | _ - { skip_to_closing_paren lexbuf; - None - } - -and skip_to_closing_paren = parse - | ')' - { () - } - | newline - { Lexing.new_line lexbuf; - skip_to_closing_paren lexbuf - } - | _ - { skip_to_closing_paren lexbuf - } - | eof - { () - } - -{ - (* Entry point: scan file and return all (using ...) declarations *) - let scan contents fname = - let lexbuf = Lexing.from_string contents in - lexbuf.Lexing.lex_curr_p <- { lexbuf.Lexing.lex_curr_p with pos_fname = fname }; - scan_all [] lexbuf |> List.rev -} diff --git a/test/blackbox-tests/test-cases/extensions-invalid-version.t b/test/blackbox-tests/test-cases/extensions-invalid-version.t index 69e43d00ace..56f6abaa257 100644 --- a/test/blackbox-tests/test-cases/extensions-invalid-version.t +++ b/test/blackbox-tests/test-cases/extensions-invalid-version.t @@ -16,7 +16,6 @@ Invalid version number: 2 | (using menhir Ali) ^^^ Error: Invalid version. Version must be two numbers separated by a dot. - Hint: using menhir 3.0 [1] Test with various non-ASCII characters: @@ -25,49 +24,44 @@ CR-someday benodiwal: The version_loc is greedy and captures the closing parenthesis. $ test_invalid_version "è" - File "dune-project", line 2, characters 14-16: + File "dune-project", line 2, characters 14-14: 2 | (using menhir è) - ^^ - Error: Invalid version. Version must be two numbers separated by a dot. - Hint: using menhir 3.0 + + Error: Invalid dune-project file [1] $ test_invalid_version "π3.14" - File "dune-project", line 2, characters 14-20: + File "dune-project", line 2, characters 14-14: 2 | (using menhir π3.14) - ^^^^^^ - Error: Invalid version. Version must be two numbers separated by a dot. - Hint: using menhir 3.0 + + Error: Invalid dune-project file [1] $ test_invalid_version "α" - File "dune-project", line 2, characters 14-16: + File "dune-project", line 2, characters 14-14: 2 | (using menhir α) - ^^ - Error: Invalid version. Version must be two numbers separated by a dot. - Hint: using menhir 3.0 + + Error: Invalid dune-project file [1] $ test_invalid_version "😀" - File "dune-project", line 2, characters 14-18: + File "dune-project", line 2, characters 14-14: 2 | (using menhir 😀) - ^^^^ - Error: Invalid version. Version must be two numbers separated by a dot. - Hint: using menhir 3.0 + + Error: Invalid dune-project file [1] CR-someday benodiwal: Unicode string lengths are miscomputed in location excerpts for East Asian characters. $ test_invalid_version "中3.16文" - File "dune-project", line 2, characters 14-24: + File "dune-project", line 2, characters 14-14: 2 | (using menhir 中3.16文) - ^^^^^^^^^^ - Error: Invalid version. Version must be two numbers separated by a dot. - Hint: using menhir 3.0 + + Error: Invalid dune-project file [1] Test with multiple extensions: @@ -91,7 +85,6 @@ Multiple extensions with one invalid (ASCII): 3 | (using melange invalid) ^^^^^^^ Error: Invalid version. Version must be two numbers separated by a dot. - Hint: using melange 1.0 [1] Multiple extensions with one invalid (non-ASCII): @@ -102,11 +95,10 @@ Multiple extensions with one invalid (non-ASCII): > (using melange 😀) > EOF $ dune build - File "dune-project", line 3, characters 15-19: + File "dune-project", line 3, characters 15-15: 3 | (using melange 😀) - ^^^^ - Error: Invalid version. Version must be two numbers separated by a dot. - Hint: using melange 1.0 + + Error: Invalid dune-project file [1] Multiple extensions with first one invalid: @@ -121,7 +113,6 @@ Multiple extensions with first one invalid: 2 | (using menhir bad) ^^^ Error: Invalid version. Version must be two numbers separated by a dot. - Hint: using menhir 3.0 [1] Multiple extensions both invalid: @@ -132,9 +123,8 @@ Multiple extensions both invalid: > (using melange xyz) > EOF $ dune build - File "dune-project", line 2, characters 14-17: - 2 | (using menhir abc) - ^^^ + File "dune-project", line 3, characters 15-18: + 3 | (using melange xyz) + ^^^ Error: Invalid version. Version must be two numbers separated by a dot. - Hint: using menhir 3.0 [1] From 718357c7b0b5e1b827982165b90876a087c42fbf Mon Sep 17 00:00:00 2001 From: Sachin Beniwal Date: Wed, 3 Dec 2025 15:00:11 +0530 Subject: [PATCH 07/14] fix: improve error messages for invalid version formats with non-ASCII characters Signed-off-by: Sachin Beniwal --- src/dune_lang/dune_project.ml | 42 +++++++++++++- src/dune_sexp/atom.ml | 4 +- src/dune_sexp/lexer.mll | 2 +- src/dune_sexp/versioned_file.ml | 23 ++++++-- .../test-cases/extensions-invalid-version.t | 58 ++++++++++++------- .../test-cases/lang-invalid-version.t | 1 + 6 files changed, 98 insertions(+), 32 deletions(-) diff --git a/src/dune_lang/dune_project.ml b/src/dune_lang/dune_project.ml index e7653c646c4..3d6c04d7622 100644 --- a/src/dune_lang/dune_project.ml +++ b/src/dune_lang/dune_project.ml @@ -288,6 +288,42 @@ module Extension = struct () ;; + (* Decode an extension version with hints on error *) + let decode_version_with_hints extension_name : Syntax.Version.t Decoder.t = + let open Decoder in + raw + >>= fun sexp -> + match sexp with + | Atom (loc, A s) -> + (* Check if version has invalid format (non-ASCII or not X.Y pattern) *) + let has_invalid_format = + String.exists s ~f:(fun c -> Char.code c > 127) + || + match Scanf.sscanf s "%u.%u%s" (fun a b s -> (a, b), s) with + | Ok (_, "") -> false + | Ok (_, _) -> false (* has trailing chars, but base format is valid *) + | Error () -> true + in + if has_invalid_format + then ( + let hints = + match Table.find extensions extension_name with + | None -> [] + | Some (Extension (Packed e)) -> + (match Syntax.greatest_supported_version e.syntax with + | Some latest -> + [ Pp.textf "using %s %s" extension_name (Syntax.Version.to_string latest) ] + | None -> []) + | Some (Deleted_in _) -> [] + in + User_error.raise + ~loc + ~hints + [ Pp.text "Invalid version. Version must be two numbers separated by a dot." ]) + else Syntax.Version.decode + | _ -> Syntax.Version.decode + ;; + let instantiate ~dune_lang_ver ~loc ~parse_args (name_loc, name) (ver_loc, ver) = match Table.find extensions name with | None -> @@ -894,9 +930,9 @@ let parse ~dir ~(lang : Lang.Instance.t) ~file = and+ explicit_extensions = multi_field "using" - (let+ loc = loc - and+ name = located string - and+ ver = located Syntax.Version.decode + (let* name = located string in + let+ loc = loc + and+ ver = located (Extension.decode_version_with_hints (snd name)) and+ parse_args = capture in (* We don't parse the arguments quite yet as we want to set the version of extensions before parsing them. *) diff --git a/src/dune_sexp/atom.ml b/src/dune_sexp/atom.ml index aae42b69b7a..b6c6470076b 100644 --- a/src/dune_sexp/atom.ml +++ b/src/dune_sexp/atom.ml @@ -15,14 +15,14 @@ let is_valid = || match String.unsafe_get s i with | '%' -> after_percent s (i + 1) len - | '"' | '(' | ')' | ';' | '\000' .. '\032' | '\127' .. '\255' -> false + | '"' | '(' | ')' | ';' | '\000' .. '\032' | '\127' -> false | _ -> loop s (i + 1) len and after_percent s i len = i = len || match String.unsafe_get s i with | '%' -> after_percent s (i + 1) len - | '"' | '(' | ')' | ';' | '\000' .. '\032' | '\127' .. '\255' | '{' -> false + | '"' | '(' | ')' | ';' | '\000' .. '\032' | '\127' | '{' -> false | _ -> loop s (i + 1) len in fun s -> diff --git a/src/dune_sexp/lexer.mll b/src/dune_sexp/lexer.mll index 0bc4db1909e..32fd7a5716a 100644 --- a/src/dune_sexp/lexer.mll +++ b/src/dune_sexp/lexer.mll @@ -139,7 +139,7 @@ let blank = [' ' '\t' '\012'] let digit = ['0'-'9'] let hexdigit = ['0'-'9' 'a'-'f' 'A'-'F'] -let atom_char = [^ ';' '(' ')' '"' '\000'-'\032' '\127'-'\255'] +let atom_char = [^ ';' '(' ')' '"' '\000'-'\032' '\127'] let varname_char = atom_char # [ ':' '%' '{' '}' ] rule token with_comments = parse diff --git a/src/dune_sexp/versioned_file.ml b/src/dune_sexp/versioned_file.ml index 29a1da42fd1..84f08f8a562 100644 --- a/src/dune_sexp/versioned_file.ml +++ b/src/dune_sexp/versioned_file.ml @@ -57,10 +57,18 @@ struct let parse first_line : Instance.t = let { First_line.lang = name_loc, name; version = ver_loc, ver } = first_line in + (* Check if version has invalid format (non-ASCII or not X.Y pattern) *) + let has_invalid_format = + String.exists ver ~f:(fun c -> Char.code c > 127) + || + match Scanf.sscanf ver "%u.%u%s" (fun a b s -> (a, b), s) with + | Ok (_, "") -> false + | Ok (_, _) -> false (* has trailing chars, but base format is valid *) + | Error () -> true + in let ver_atom = - match Atom.parse ver with - | Some atom -> atom - | None -> + if has_invalid_format + then ( let hints = match Table.find langs name with | None -> [] @@ -71,7 +79,14 @@ struct User_error.raise ~loc:ver_loc ~hints - [ Pp.text "Invalid version. Version must be two numbers separated by a dot." ] + [ Pp.text "Invalid version. Version must be two numbers separated by a dot." ]) + else ( + match Atom.parse ver with + | Some atom -> atom + | None -> + Code_error.raise + "Atom.parse failed for version that passed validation" + [ "version", String ver ]) in let dune_lang_ver = Decoder.parse Syntax.Version.decode Univ_map.empty (Atom (ver_loc, ver_atom)) diff --git a/test/blackbox-tests/test-cases/extensions-invalid-version.t b/test/blackbox-tests/test-cases/extensions-invalid-version.t index 56f6abaa257..c6c39ba75b3 100644 --- a/test/blackbox-tests/test-cases/extensions-invalid-version.t +++ b/test/blackbox-tests/test-cases/extensions-invalid-version.t @@ -16,6 +16,7 @@ Invalid version number: 2 | (using menhir Ali) ^^^ Error: Invalid version. Version must be two numbers separated by a dot. + Hint: using menhir 3.0 [1] Test with various non-ASCII characters: @@ -24,44 +25,49 @@ CR-someday benodiwal: The version_loc is greedy and captures the closing parenthesis. $ test_invalid_version "è" - File "dune-project", line 2, characters 14-14: + File "dune-project", line 2, characters 14-16: 2 | (using menhir è) - - Error: Invalid dune-project file + ^^ + Error: Invalid version. Version must be two numbers separated by a dot. + Hint: using menhir 3.0 [1] $ test_invalid_version "π3.14" - File "dune-project", line 2, characters 14-14: + File "dune-project", line 2, characters 14-20: 2 | (using menhir π3.14) - - Error: Invalid dune-project file + ^^^^^^ + Error: Invalid version. Version must be two numbers separated by a dot. + Hint: using menhir 3.0 [1] $ test_invalid_version "α" - File "dune-project", line 2, characters 14-14: + File "dune-project", line 2, characters 14-16: 2 | (using menhir α) - - Error: Invalid dune-project file + ^^ + Error: Invalid version. Version must be two numbers separated by a dot. + Hint: using menhir 3.0 [1] $ test_invalid_version "😀" - File "dune-project", line 2, characters 14-14: + File "dune-project", line 2, characters 14-18: 2 | (using menhir 😀) - - Error: Invalid dune-project file + ^^^^ + Error: Invalid version. Version must be two numbers separated by a dot. + Hint: using menhir 3.0 [1] CR-someday benodiwal: Unicode string lengths are miscomputed in location excerpts for East Asian characters. $ test_invalid_version "中3.16文" - File "dune-project", line 2, characters 14-14: + File "dune-project", line 2, characters 14-24: 2 | (using menhir 中3.16文) - - Error: Invalid dune-project file + ^^^^^^^^^^ + Error: Invalid version. Version must be two numbers separated by a dot. + Hint: using menhir 3.0 [1] Test with multiple extensions: @@ -72,6 +78,11 @@ Test with multiple extensions: > (using melange 0.1) > EOF $ dune build + File "dune-project", line 3, characters 0-19: + 3 | (using melange 0.1) + ^^^^^^^^^^^^^^^^^^^ + Error: Not enough arguments for "using" + [1] Multiple extensions with one invalid (ASCII): @@ -85,6 +96,7 @@ Multiple extensions with one invalid (ASCII): 3 | (using melange invalid) ^^^^^^^ Error: Invalid version. Version must be two numbers separated by a dot. + Hint: using melange 1.0 [1] Multiple extensions with one invalid (non-ASCII): @@ -95,10 +107,11 @@ Multiple extensions with one invalid (non-ASCII): > (using melange 😀) > EOF $ dune build - File "dune-project", line 3, characters 15-15: + File "dune-project", line 3, characters 15-19: 3 | (using melange 😀) - - Error: Invalid dune-project file + ^^^^ + Error: Invalid version. Version must be two numbers separated by a dot. + Hint: using melange 1.0 [1] Multiple extensions with first one invalid: @@ -109,10 +122,10 @@ Multiple extensions with first one invalid: > (using melange 0.1) > EOF $ dune build - File "dune-project", line 2, characters 14-17: - 2 | (using menhir bad) - ^^^ - Error: Invalid version. Version must be two numbers separated by a dot. + File "dune-project", line 3, characters 0-19: + 3 | (using melange 0.1) + ^^^^^^^^^^^^^^^^^^^ + Error: Not enough arguments for "using" [1] Multiple extensions both invalid: @@ -127,4 +140,5 @@ Multiple extensions both invalid: 3 | (using melange xyz) ^^^ Error: Invalid version. Version must be two numbers separated by a dot. + Hint: using melange 1.0 [1] diff --git a/test/blackbox-tests/test-cases/lang-invalid-version.t b/test/blackbox-tests/test-cases/lang-invalid-version.t index bc282f752b9..f5b7e9d4f76 100644 --- a/test/blackbox-tests/test-cases/lang-invalid-version.t +++ b/test/blackbox-tests/test-cases/lang-invalid-version.t @@ -15,6 +15,7 @@ Invalid version number: 1 | (lang dune Ali) ^^^ Error: Invalid version. Version must be two numbers separated by a dot. + Hint: lang dune 3.21 [1] Test with various non-ASCII characters: From 7e06d556fcb766809cb88b86fca6165ad018e17a Mon Sep 17 00:00:00 2001 From: Sachin Beniwal Date: Thu, 4 Dec 2025 16:38:04 +0530 Subject: [PATCH 08/14] feat: implemented lexer level approach for non ascii characters Signed-off-by: Sachin Beniwal --- src/dune_lang/dune_project.ml | 42 +----------- src/dune_sexp/atom.ml | 4 +- src/dune_sexp/lexer.mll | 4 +- src/dune_sexp/versioned_file.ml | 44 +++++-------- .../test-cases/extensions-invalid-version.t | 64 ++++++++----------- .../test-cases/lang-invalid-version.t | 21 +++--- 6 files changed, 62 insertions(+), 117 deletions(-) diff --git a/src/dune_lang/dune_project.ml b/src/dune_lang/dune_project.ml index 3d6c04d7622..e7653c646c4 100644 --- a/src/dune_lang/dune_project.ml +++ b/src/dune_lang/dune_project.ml @@ -288,42 +288,6 @@ module Extension = struct () ;; - (* Decode an extension version with hints on error *) - let decode_version_with_hints extension_name : Syntax.Version.t Decoder.t = - let open Decoder in - raw - >>= fun sexp -> - match sexp with - | Atom (loc, A s) -> - (* Check if version has invalid format (non-ASCII or not X.Y pattern) *) - let has_invalid_format = - String.exists s ~f:(fun c -> Char.code c > 127) - || - match Scanf.sscanf s "%u.%u%s" (fun a b s -> (a, b), s) with - | Ok (_, "") -> false - | Ok (_, _) -> false (* has trailing chars, but base format is valid *) - | Error () -> true - in - if has_invalid_format - then ( - let hints = - match Table.find extensions extension_name with - | None -> [] - | Some (Extension (Packed e)) -> - (match Syntax.greatest_supported_version e.syntax with - | Some latest -> - [ Pp.textf "using %s %s" extension_name (Syntax.Version.to_string latest) ] - | None -> []) - | Some (Deleted_in _) -> [] - in - User_error.raise - ~loc - ~hints - [ Pp.text "Invalid version. Version must be two numbers separated by a dot." ]) - else Syntax.Version.decode - | _ -> Syntax.Version.decode - ;; - let instantiate ~dune_lang_ver ~loc ~parse_args (name_loc, name) (ver_loc, ver) = match Table.find extensions name with | None -> @@ -930,9 +894,9 @@ let parse ~dir ~(lang : Lang.Instance.t) ~file = and+ explicit_extensions = multi_field "using" - (let* name = located string in - let+ loc = loc - and+ ver = located (Extension.decode_version_with_hints (snd name)) + (let+ loc = loc + and+ name = located string + and+ ver = located Syntax.Version.decode and+ parse_args = capture in (* We don't parse the arguments quite yet as we want to set the version of extensions before parsing them. *) diff --git a/src/dune_sexp/atom.ml b/src/dune_sexp/atom.ml index b6c6470076b..aae42b69b7a 100644 --- a/src/dune_sexp/atom.ml +++ b/src/dune_sexp/atom.ml @@ -15,14 +15,14 @@ let is_valid = || match String.unsafe_get s i with | '%' -> after_percent s (i + 1) len - | '"' | '(' | ')' | ';' | '\000' .. '\032' | '\127' -> false + | '"' | '(' | ')' | ';' | '\000' .. '\032' | '\127' .. '\255' -> false | _ -> loop s (i + 1) len and after_percent s i len = i = len || match String.unsafe_get s i with | '%' -> after_percent s (i + 1) len - | '"' | '(' | ')' | ';' | '\000' .. '\032' | '\127' | '{' -> false + | '"' | '(' | ')' | ';' | '\000' .. '\032' | '\127' .. '\255' | '{' -> false | _ -> loop s (i + 1) len in fun s -> diff --git a/src/dune_sexp/lexer.mll b/src/dune_sexp/lexer.mll index 32fd7a5716a..38912f02262 100644 --- a/src/dune_sexp/lexer.mll +++ b/src/dune_sexp/lexer.mll @@ -139,7 +139,7 @@ let blank = [' ' '\t' '\012'] let digit = ['0'-'9'] let hexdigit = ['0'-'9' 'a'-'f' 'A'-'F'] -let atom_char = [^ ';' '(' ')' '"' '\000'-'\032' '\127'] +let atom_char = [^ ';' '(' ')' '"' '\000'-'\032' '\127'-'\255'] let varname_char = atom_char # [ ':' '%' '{' '}' ] rule token with_comments = parse @@ -182,6 +182,8 @@ and atom acc start = parse { atom ((template_variable lexbuf) :: acc) start lexbuf } | "%" { atom (Template.add_text acc "%") start lexbuf } + | ['\127'-'\255'] + { error lexbuf "Invalid atom: contains non-ASCII character(s). Atoms must only contain ASCII characters." } | "" { Template.token acc ~quoted:false ~start lexbuf } diff --git a/src/dune_sexp/versioned_file.ml b/src/dune_sexp/versioned_file.ml index 84f08f8a562..8d2ee837963 100644 --- a/src/dune_sexp/versioned_file.ml +++ b/src/dune_sexp/versioned_file.ml @@ -57,36 +57,24 @@ struct let parse first_line : Instance.t = let { First_line.lang = name_loc, name; version = ver_loc, ver } = first_line in - (* Check if version has invalid format (non-ASCII or not X.Y pattern) *) - let has_invalid_format = - String.exists ver ~f:(fun c -> Char.code c > 127) - || - match Scanf.sscanf ver "%u.%u%s" (fun a b s -> (a, b), s) with - | Ok (_, "") -> false - | Ok (_, _) -> false (* has trailing chars, but base format is valid *) - | Error () -> true - in let ver_atom = - if has_invalid_format - then ( - let hints = - match Table.find langs name with - | None -> [] - | Some lang -> - let latest = Syntax.greatest_supported_version_exn lang.syntax in - [ Pp.textf "lang dune %s" (Syntax.Version.to_string latest) ] - in - User_error.raise - ~loc:ver_loc - ~hints - [ Pp.text "Invalid version. Version must be two numbers separated by a dot." ]) - else ( - match Atom.parse ver with - | Some atom -> atom - | None -> + match Atom.parse ver with + | Some atom -> atom + | None -> + (* Atom.parse failed - likely due to non-ASCII characters *) + let has_non_ascii = String.exists ver ~f:(fun c -> Char.code c >= 128) in + if has_non_ascii + then + User_error.raise + ~loc:ver_loc + [ Pp.text + "Invalid atom: contains non-ASCII character(s). Atoms must only \ + contain ASCII characters." + ] + else Code_error.raise - "Atom.parse failed for version that passed validation" - [ "version", String ver ]) + "Atom.parse failed for unexpected reason" + [ "version", String ver ] in let dune_lang_ver = Decoder.parse Syntax.Version.decode Univ_map.empty (Atom (ver_loc, ver_atom)) diff --git a/test/blackbox-tests/test-cases/extensions-invalid-version.t b/test/blackbox-tests/test-cases/extensions-invalid-version.t index c6c39ba75b3..3f3d8257224 100644 --- a/test/blackbox-tests/test-cases/extensions-invalid-version.t +++ b/test/blackbox-tests/test-cases/extensions-invalid-version.t @@ -16,7 +16,6 @@ Invalid version number: 2 | (using menhir Ali) ^^^ Error: Invalid version. Version must be two numbers separated by a dot. - Hint: using menhir 3.0 [1] Test with various non-ASCII characters: @@ -25,49 +24,49 @@ CR-someday benodiwal: The version_loc is greedy and captures the closing parenthesis. $ test_invalid_version "è" - File "dune-project", line 2, characters 14-16: + File "dune-project", line 2, characters 14-15: 2 | (using menhir è) - ^^ - Error: Invalid version. Version must be two numbers separated by a dot. - Hint: using menhir 3.0 + ^ + Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain + ASCII characters. [1] $ test_invalid_version "π3.14" - File "dune-project", line 2, characters 14-20: + File "dune-project", line 2, characters 14-15: 2 | (using menhir π3.14) - ^^^^^^ - Error: Invalid version. Version must be two numbers separated by a dot. - Hint: using menhir 3.0 + ^ + Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain + ASCII characters. [1] $ test_invalid_version "α" - File "dune-project", line 2, characters 14-16: + File "dune-project", line 2, characters 14-15: 2 | (using menhir α) - ^^ - Error: Invalid version. Version must be two numbers separated by a dot. - Hint: using menhir 3.0 + ^ + Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain + ASCII characters. [1] $ test_invalid_version "😀" - File "dune-project", line 2, characters 14-18: + File "dune-project", line 2, characters 14-15: 2 | (using menhir 😀) - ^^^^ - Error: Invalid version. Version must be two numbers separated by a dot. - Hint: using menhir 3.0 + ^ + Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain + ASCII characters. [1] CR-someday benodiwal: Unicode string lengths are miscomputed in location excerpts for East Asian characters. $ test_invalid_version "中3.16文" - File "dune-project", line 2, characters 14-24: + File "dune-project", line 2, characters 14-15: 2 | (using menhir 中3.16文) - ^^^^^^^^^^ - Error: Invalid version. Version must be two numbers separated by a dot. - Hint: using menhir 3.0 + ^ + Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain + ASCII characters. [1] Test with multiple extensions: @@ -78,11 +77,6 @@ Test with multiple extensions: > (using melange 0.1) > EOF $ dune build - File "dune-project", line 3, characters 0-19: - 3 | (using melange 0.1) - ^^^^^^^^^^^^^^^^^^^ - Error: Not enough arguments for "using" - [1] Multiple extensions with one invalid (ASCII): @@ -96,7 +90,6 @@ Multiple extensions with one invalid (ASCII): 3 | (using melange invalid) ^^^^^^^ Error: Invalid version. Version must be two numbers separated by a dot. - Hint: using melange 1.0 [1] Multiple extensions with one invalid (non-ASCII): @@ -107,11 +100,11 @@ Multiple extensions with one invalid (non-ASCII): > (using melange 😀) > EOF $ dune build - File "dune-project", line 3, characters 15-19: + File "dune-project", line 3, characters 15-16: 3 | (using melange 😀) - ^^^^ - Error: Invalid version. Version must be two numbers separated by a dot. - Hint: using melange 1.0 + ^ + Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain + ASCII characters. [1] Multiple extensions with first one invalid: @@ -122,10 +115,10 @@ Multiple extensions with first one invalid: > (using melange 0.1) > EOF $ dune build - File "dune-project", line 3, characters 0-19: - 3 | (using melange 0.1) - ^^^^^^^^^^^^^^^^^^^ - Error: Not enough arguments for "using" + File "dune-project", line 2, characters 14-17: + 2 | (using menhir bad) + ^^^ + Error: Invalid version. Version must be two numbers separated by a dot. [1] Multiple extensions both invalid: @@ -140,5 +133,4 @@ Multiple extensions both invalid: 3 | (using melange xyz) ^^^ Error: Invalid version. Version must be two numbers separated by a dot. - Hint: using melange 1.0 [1] diff --git a/test/blackbox-tests/test-cases/lang-invalid-version.t b/test/blackbox-tests/test-cases/lang-invalid-version.t index f5b7e9d4f76..02062f8a4fd 100644 --- a/test/blackbox-tests/test-cases/lang-invalid-version.t +++ b/test/blackbox-tests/test-cases/lang-invalid-version.t @@ -15,7 +15,6 @@ Invalid version number: 1 | (lang dune Ali) ^^^ Error: Invalid version. Version must be two numbers separated by a dot. - Hint: lang dune 3.21 [1] Test with various non-ASCII characters: @@ -27,32 +26,32 @@ parenthesis. File "dune-project", line 1, characters 11-13: 1 | (lang dune è) ^^ - Error: Invalid version. Version must be two numbers separated by a dot. - Hint: lang dune 3.21 + Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain + ASCII characters. [1] $ test_invalid_version "π3.14" File "dune-project", line 1, characters 11-17: 1 | (lang dune π3.14) ^^^^^^ - Error: Invalid version. Version must be two numbers separated by a dot. - Hint: lang dune 3.21 + Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain + ASCII characters. [1] $ test_invalid_version "α" File "dune-project", line 1, characters 11-13: 1 | (lang dune α) ^^ - Error: Invalid version. Version must be two numbers separated by a dot. - Hint: lang dune 3.21 + Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain + ASCII characters. [1] $ test_invalid_version "😀" File "dune-project", line 1, characters 11-15: 1 | (lang dune 😀) ^^^^ - Error: Invalid version. Version must be two numbers separated by a dot. - Hint: lang dune 3.21 + Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain + ASCII characters. [1] CR-someday benodiwal: Unicode string lengths are miscomputed in location @@ -62,6 +61,6 @@ excerpts for East Asian characters. File "dune-project", line 1, characters 11-21: 1 | (lang dune 中3.16文) ^^^^^^^^^^ - Error: Invalid version. Version must be two numbers separated by a dot. - Hint: lang dune 3.21 + Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain + ASCII characters. [1] From b6487cea0ca6eac88e77a65e0d757c1998331e46 Mon Sep 17 00:00:00 2001 From: Sachin Beniwal Date: Thu, 4 Dec 2025 17:11:16 +0530 Subject: [PATCH 09/14] fix: refactor Signed-off-by: Sachin Beniwal --- src/dune_sexp/atom.ml | 5 +++++ src/dune_sexp/atom.mli | 4 ++++ src/dune_sexp/lexer.mll | 2 +- src/dune_sexp/versioned_file.ml | 14 ++------------ 4 files changed, 12 insertions(+), 13 deletions(-) diff --git a/src/dune_sexp/atom.ml b/src/dune_sexp/atom.ml index aae42b69b7a..8b2c45a9e59 100644 --- a/src/dune_sexp/atom.ml +++ b/src/dune_sexp/atom.ml @@ -9,6 +9,11 @@ let to_dyn (A s) = let equal (A a) (A b) = String.equal a b +let non_ascii_error_message = + "Invalid atom: contains non-ASCII character(s). Atoms must only contain ASCII \ + characters." +;; + let is_valid = let rec loop s i len = i = len diff --git a/src/dune_sexp/atom.mli b/src/dune_sexp/atom.mli index 982a4f224a6..19547746f0e 100644 --- a/src/dune_sexp/atom.mli +++ b/src/dune_sexp/atom.mli @@ -1,6 +1,10 @@ type t = private A of string [@@unboxed] val equal : t -> t -> bool + +(** Error message for atoms containing non-ASCII characters *) +val non_ascii_error_message : string + val is_valid : string -> bool val of_string : string -> t val to_string : t -> string diff --git a/src/dune_sexp/lexer.mll b/src/dune_sexp/lexer.mll index 38912f02262..55f18077520 100644 --- a/src/dune_sexp/lexer.mll +++ b/src/dune_sexp/lexer.mll @@ -183,7 +183,7 @@ and atom acc start = parse | "%" { atom (Template.add_text acc "%") start lexbuf } | ['\127'-'\255'] - { error lexbuf "Invalid atom: contains non-ASCII character(s). Atoms must only contain ASCII characters." } + { error lexbuf Atom.non_ascii_error_message } | "" { Template.token acc ~quoted:false ~start lexbuf } diff --git a/src/dune_sexp/versioned_file.ml b/src/dune_sexp/versioned_file.ml index 8d2ee837963..9d8af7349e5 100644 --- a/src/dune_sexp/versioned_file.ml +++ b/src/dune_sexp/versioned_file.ml @@ -61,20 +61,10 @@ struct match Atom.parse ver with | Some atom -> atom | None -> - (* Atom.parse failed - likely due to non-ASCII characters *) let has_non_ascii = String.exists ver ~f:(fun c -> Char.code c >= 128) in if has_non_ascii - then - User_error.raise - ~loc:ver_loc - [ Pp.text - "Invalid atom: contains non-ASCII character(s). Atoms must only \ - contain ASCII characters." - ] - else - Code_error.raise - "Atom.parse failed for unexpected reason" - [ "version", String ver ] + then User_error.raise ~loc:ver_loc [ Pp.text Atom.non_ascii_error_message ] + else User_error.raise ~loc:ver_loc [ Pp.textf "Invalid atom: %S" ver ] in let dune_lang_ver = Decoder.parse Syntax.Version.decode Univ_map.empty (Atom (ver_loc, ver_atom)) From 9a0b127c8aa524f2dfa0abf501ff5f9714d5cbaa Mon Sep 17 00:00:00 2001 From: Sachin Beniwal Date: Thu, 4 Dec 2025 17:14:31 +0530 Subject: [PATCH 10/14] tests: promoted non-ascii-characters.t to new logic Signed-off-by: Sachin Beniwal --- .../test-cases/formatting/non-ascii-characters.t | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/blackbox-tests/test-cases/formatting/non-ascii-characters.t b/test/blackbox-tests/test-cases/formatting/non-ascii-characters.t index 7b444ba7d75..b1dcaab44e2 100644 --- a/test/blackbox-tests/test-cases/formatting/non-ascii-characters.t +++ b/test/blackbox-tests/test-cases/formatting/non-ascii-characters.t @@ -25,8 +25,9 @@ Utf8 characters are handled for now, this is also related to the issue #9728 $ dune format-dune-file < (Écho "hello") > EOF - File "", line 1, characters 1-1: - Error: Invalid . file + File "", line 1, characters 1-2: + Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain + ASCII characters. [1] $ bash -c "printf '(echo \"%b\")' '\xc0'"| dune format-dune-file From 0864ca5d1496dd2a3227cb2b938cc1fa02f77e29 Mon Sep 17 00:00:00 2001 From: Sachin Beniwal Date: Fri, 5 Dec 2025 13:32:20 +0530 Subject: [PATCH 11/14] fix: else claude message Signed-off-by: Sachin Beniwal --- src/dune_sexp/versioned_file.ml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/dune_sexp/versioned_file.ml b/src/dune_sexp/versioned_file.ml index 9d8af7349e5..8505da59c27 100644 --- a/src/dune_sexp/versioned_file.ml +++ b/src/dune_sexp/versioned_file.ml @@ -64,7 +64,11 @@ struct let has_non_ascii = String.exists ver ~f:(fun c -> Char.code c >= 128) in if has_non_ascii then User_error.raise ~loc:ver_loc [ Pp.text Atom.non_ascii_error_message ] - else User_error.raise ~loc:ver_loc [ Pp.textf "Invalid atom: %S" ver ] + else + User_error.raise + ~loc:ver_loc + [ Pp.text "Invalid version. Version must be two numbers separated by a dot." + ] in let dune_lang_ver = Decoder.parse Syntax.Version.decode Univ_map.empty (Atom (ver_loc, ver_atom)) From cfae018f9dc1fa4dd0d4a8cc8c699f8600f4960d Mon Sep 17 00:00:00 2001 From: Sachin Beniwal Date: Fri, 5 Dec 2025 13:35:40 +0530 Subject: [PATCH 12/14] chore: added entry for CHANGES.md Signed-off-by: Sachin Beniwal --- doc/changes/fixed/12844.md | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 doc/changes/fixed/12844.md diff --git a/doc/changes/fixed/12844.md b/doc/changes/fixed/12844.md new file mode 100644 index 00000000000..056d09f16a7 --- /dev/null +++ b/doc/changes/fixed/12844.md @@ -0,0 +1,7 @@ +- Improve error messages for invalid version formats containing non-ASCII + characters. Previously, non-ASCII characters in version strings (e.g., `(lang + dune è)` or `(using menhir π3.14)`) would fail with a generic "Invalid file" + error. Now they display a clear message: "Invalid atom: contains non-ASCII + character(s). Atoms must only contain ASCII characters." The fix is + implemented at the lexer level, providing consistent error handling across all + s-expression parsing. (#12844, fixes #12836, @benodiwal) From 1a7bdbaae4a58a2bcb853c7a2e0690273eadafa9 Mon Sep 17 00:00:00 2001 From: Sachin Beniwal Date: Fri, 5 Dec 2025 16:05:46 +0530 Subject: [PATCH 13/14] chore: updated CR someday Signed-off-by: Sachin Beniwal --- .../test-cases/extensions-invalid-version.t | 11 +++++------ test/blackbox-tests/test-cases/lang-invalid-version.t | 5 +++++ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/test/blackbox-tests/test-cases/extensions-invalid-version.t b/test/blackbox-tests/test-cases/extensions-invalid-version.t index 3f3d8257224..f7fff45a314 100644 --- a/test/blackbox-tests/test-cases/extensions-invalid-version.t +++ b/test/blackbox-tests/test-cases/extensions-invalid-version.t @@ -11,6 +11,11 @@ such situations provide a clear error. Invalid version number: +CR-someday benodiwal: Consider adding context-specific hints for ASCII invalid +versions (e.g., "Hint: using menhir 3.0"). The current lexer-level approach +trades hints for simplicity and robustness - it works uniformly across all +atoms without needing validation in every decoder. + $ test_invalid_version "Ali" File "dune-project", line 2, characters 14-17: 2 | (using menhir Ali) @@ -20,9 +25,6 @@ Invalid version number: Test with various non-ASCII characters: -CR-someday benodiwal: The version_loc is greedy and captures the closing -parenthesis. - $ test_invalid_version "è" File "dune-project", line 2, characters 14-15: 2 | (using menhir è) @@ -58,9 +60,6 @@ parenthesis. ASCII characters. [1] -CR-someday benodiwal: Unicode string lengths are miscomputed in location -excerpts for East Asian characters. - $ test_invalid_version "中3.16文" File "dune-project", line 2, characters 14-15: 2 | (using menhir 中3.16文) diff --git a/test/blackbox-tests/test-cases/lang-invalid-version.t b/test/blackbox-tests/test-cases/lang-invalid-version.t index 02062f8a4fd..d53a20c8e7c 100644 --- a/test/blackbox-tests/test-cases/lang-invalid-version.t +++ b/test/blackbox-tests/test-cases/lang-invalid-version.t @@ -10,6 +10,11 @@ such situations provide a clear error. Invalid version number: +CR-someday benodiwal: Consider adding context-specific hints for ASCII invalid +versions (e.g., "Hint: lang dune 3.21"). The current lexer-level approach +trades hints for simplicity and robustness - it works uniformly across all +atoms without needing validation in every decoder. + $ test_invalid_version "Ali" File "dune-project", line 1, characters 11-14: 1 | (lang dune Ali) From 344ccb45306c846b44245585c47d8f3a094e2f30 Mon Sep 17 00:00:00 2001 From: Sachin Beniwal Date: Fri, 5 Dec 2025 18:20:42 +0530 Subject: [PATCH 14/14] fix: ascii start Signed-off-by: Sachin Beniwal --- src/dune_sexp/lexer.mll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dune_sexp/lexer.mll b/src/dune_sexp/lexer.mll index 55f18077520..d2c91ea6088 100644 --- a/src/dune_sexp/lexer.mll +++ b/src/dune_sexp/lexer.mll @@ -182,7 +182,7 @@ and atom acc start = parse { atom ((template_variable lexbuf) :: acc) start lexbuf } | "%" { atom (Template.add_text acc "%") start lexbuf } - | ['\127'-'\255'] + | ['\128'-'\255'] { error lexbuf Atom.non_ascii_error_message } | "" { Template.token acc ~quoted:false ~start lexbuf }