From 7241d8fdc3bd08f93144ce415704eed4456875db Mon Sep 17 00:00:00 2001 From: iOptimizeThings Date: Wed, 20 May 2026 15:38:23 -0700 Subject: [PATCH 1/3] json-schema-to-grammar: expand PCRE shorthands (\d \w \s \b) to GBNF equivalents --- common/json-schema-to-grammar.cpp | 32 ++++- tests/test-json-schema-to-grammar.cpp | 182 ++++++++++++++++++++++++++ 2 files changed, 212 insertions(+), 2 deletions(-) diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index e2c4d6ce22e..9e534f85967 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -448,8 +448,22 @@ class common_schema_converter { i++; while (i < length && sub_pattern[i] != ']') { if (sub_pattern[i] == '\\') { - square_brackets += sub_pattern.substr(i, 2); - i += 2; + char next = sub_pattern[i + 1]; + if (next == 'd' || next == 'D') { + square_brackets += next == 'd' ? "0-9" : "^0-9"; + i += 2; + } else if (next == 'w' || next == 'W') { + square_brackets += next == 'w' ? "a-zA-Z0-9_" : "^a-zA-Z0-9_"; + i += 2; + } else if (next == 's' || next == 'S') { + square_brackets += next == 's' ? " \\t\\n\\r" : "^ \\t\\n\\r"; + i += 2; + } else if (next == 'b' || next == 'B') { + i += 2; // word boundary - no GBNF equivalent, skip + } else { + square_brackets += sub_pattern.substr(i, 2); + i += 2; + } } else { square_brackets += sub_pattern[i]; i++; @@ -529,6 +543,20 @@ class common_schema_converter { i++; literal += sub_pattern[i]; i++; + } else if (next == 'd' || next == 'D') { + if (!literal.empty()) { seq.emplace_back(literal, true); literal = ""; } + seq.emplace_back(next == 'd' ? "[0-9]" : "[^0-9]", false); + i += 2; + } else if (next == 'w' || next == 'W') { + if (!literal.empty()) { seq.emplace_back(literal, true); literal = ""; } + seq.emplace_back(next == 'w' ? 
"[a-zA-Z0-9_]" : "[^a-zA-Z0-9_]", false); + i += 2; + } else if (next == 's' || next == 'S') { + if (!literal.empty()) { seq.emplace_back(literal, true); literal = ""; } + seq.emplace_back(next == 's' ? "[ \\t\\n\\r]" : "[^ \\t\\n\\r]", false); + i += 2; + } else if (next == 'b' || next == 'B') { + i += 2; // word boundary - no GBNF equivalent, skip } else { literal += sub_pattern.substr(i, 2); i += 2; diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index b4362852c39..9d49e8d8030 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -1564,6 +1564,188 @@ int main() { space ::= | " " | "\n"{1,2} [ \t]{0,20} )""", }); + + run({ + SUCCESS, + R"(regexp \d shorthand)", + R"""({ + "type": "string", + "pattern": "^\\d+$" + })""", + R"""( + root ::= "\"" ([0-9]+) "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", + }); + + run({ + SUCCESS, + R"(regexp [\d] class)", + R"""({ + "type": "string", + "pattern": "^[\\d]+$" + })""", + R"""( + root ::= "\"" ([0-9]+) "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", + }); + + run({ + SUCCESS, + R"(regexp \w shorthand)", + R"""({ + "type": "string", + "pattern": "^\\w+$" + })""", + R"""( + root ::= "\"" ([a-zA-Z0-9_]+) "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", + }); + + run({ + SUCCESS, + R"(regexp [\w] class)", + R"""({ + "type": "string", + "pattern": "^[\\w]+$" + })""", + R"""( + root ::= "\"" ([a-zA-Z0-9_]+) "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", + }); + + run({ + SUCCESS, + R"(regexp \s shorthand)", + R"""({ + "type": "string", + "pattern": "^\\s+$" + })""", + R"""( + root ::= "\"" ([ \t\n\r]+) "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", + }); + + run({ + SUCCESS, + R"(regexp [\s] class)", + R"""({ + "type": "string", + "pattern": "^[\\s]+$" + })""", + R"""( + root ::= "\"" ([ \t\n\r]+) "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", + }); + + run({ + 
SUCCESS, + R"(regexp \b boundary skipped)", + R"""({ + "type": "string", + "pattern": "^\\bfoo\\b$" + })""", + R"""( + root ::= "\"" ("foo") "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", + }); + + run({ + SUCCESS, + R"(regexp \D shorthand)", + R"""({ + "type": "string", + "pattern": "^\\D+$" + })""", + R"""( + root ::= "\"" ([^0-9]+) "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", + }); + + run({ + SUCCESS, + R"(regexp [\D] class)", + R"""({ + "type": "string", + "pattern": "^[\\D]+$" + })""", + R"""( + root ::= "\"" ([^0-9]+) "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", + }); + + run({ + SUCCESS, + R"(regexp \W shorthand)", + R"""({ + "type": "string", + "pattern": "^\\W+$" + })""", + R"""( + root ::= "\"" ([^a-zA-Z0-9_]+) "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", + }); + + run({ + SUCCESS, + R"(regexp [\W] class)", + R"""({ + "type": "string", + "pattern": "^[\\W]+$" + })""", + R"""( + root ::= "\"" ([^a-zA-Z0-9_]+) "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", + }); + + run({ + SUCCESS, + R"(regexp \S shorthand)", + R"""({ + "type": "string", + "pattern": "^\\S+$" + })""", + R"""( + root ::= "\"" ([^ \t\n\r]+) "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", + }); + + run({ + SUCCESS, + R"(regexp [\S] class)", + R"""({ + "type": "string", + "pattern": "^[\\S]+$" + })""", + R"""( + root ::= "\"" ([^ \t\n\r]+) "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", + }); + + run({ + SUCCESS, + R"(regexp \B boundary skipped)", + R"""({ + "type": "string", + "pattern": "^\\Bfoo\\B$" + })""", + R"""( + root ::= "\"" ("foo") "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", + }); } if (getenv("LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR")) { From 39e4553f0a22150da35c4ba4d0bcd5b5156175f7 Mon Sep 17 00:00:00 2001 From: iOptimizeThings Date: Thu, 21 May 2026 15:59:20 -0700 Subject: [PATCH 2/3] json-schema-to-grammar: fix missing bounds check before reading next char --- 
common/json-schema-to-grammar.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index 9e534f85967..26d7c180e35 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -447,7 +447,7 @@ class common_schema_converter {
                     std::string square_brackets = std::string(1, c);
                     i++;
                     while (i < length && sub_pattern[i] != ']') {
-                        if (sub_pattern[i] == '\\') {
+                        if (sub_pattern[i] == '\\' && i + 1 < length) {
                             char next = sub_pattern[i + 1];
                             if (next == 'd' || next == 'D') {
                                 square_brackets += next == 'd' ? "0-9" : "^0-9";

From 86357c0ae036b11b494412684ba8d4fa4a2ebebf Mon Sep 17 00:00:00 2001
From: iOptimizeThings
Date: Thu, 21 May 2026 21:57:38 -0700
Subject: [PATCH 3/3] json-schema-to-grammar: fix negated shorthands (\D \W \S) in bracket classes

---
Review notes (everything between the "---" line and the diffstat is ignored
by `git am`):
  * Restored the element type of `neg_parts` and `parts`; they are vectors of
    std::string (the template argument had been lost from the patch text).
  * An outer-negated class that contains a negated shorthand is no longer
    built by concatenating the expansions under "^". That inverted the
    meaning: [^\D] (digits) was emitted as [^0-9] (non-digits). A lone
    negated shorthand now cancels against the outer "^"; any other mix is an
    intersection that a single GBNF class cannot express and is reported via
    `_errors`.
  * Added a regression test for [^\D].
  * Hunk sizes and the diffstat below reflect these amendments; the blob ids
    on the `index` lines are still those of the pre-amendment patch.
  * Leading whitespace of the diff lines was reconstructed; apply with
    --ignore-whitespace if the context does not match exactly.

common/json-schema-to-grammar.cpp     | 74 ++++++++++++++++++++-------
tests/test-json-schema-to-grammar.cpp | 65 ++++++++++++++++++++++++
2 files changed, 120 insertions(+), 19 deletions(-)

diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index 26d7c180e35..863b7fc4287 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -444,37 +444,73 @@ class common_schema_converter {
                     }
                     return join_seq();
                 } else if (c == '[') {
-                    std::string square_brackets = std::string(1, c);
                     i++;
+                    bool outer_neg = (i < length && sub_pattern[i] == '^');
+                    if (outer_neg) i++;
+
+                    // Collect positive content (literals + positive shorthands) and
+                    // negated shorthands separately. Negated shorthands can't be inlined
+                    // into a single bracket class alongside other content because the ^
+                    // only means negation at position 0 of a class.
+                    std::string literals;
+                    std::vector<std::string> neg_parts;
+
                     while (i < length && sub_pattern[i] != ']') {
                         if (sub_pattern[i] == '\\' && i + 1 < length) {
                             char next = sub_pattern[i + 1];
-                            if (next == 'd' || next == 'D') {
-                                square_brackets += next == 'd' ? "0-9" : "^0-9";
-                                i += 2;
-                            } else if (next == 'w' || next == 'W') {
-                                square_brackets += next == 'w' ? "a-zA-Z0-9_" : "^a-zA-Z0-9_";
-                                i += 2;
-                            } else if (next == 's' || next == 'S') {
-                                square_brackets += next == 's' ? " \\t\\n\\r" : "^ \\t\\n\\r";
-                                i += 2;
-                            } else if (next == 'b' || next == 'B') {
-                                i += 2; // word boundary - no GBNF equivalent, skip
-                            } else {
-                                square_brackets += sub_pattern.substr(i, 2);
-                                i += 2;
-                            }
+                            if (next == 'd') { literals += "0-9"; i += 2; }
+                            else if (next == 'D') { neg_parts.push_back("0-9"); i += 2; }
+                            else if (next == 'w') { literals += "a-zA-Z0-9_"; i += 2; }
+                            else if (next == 'W') { neg_parts.push_back("a-zA-Z0-9_"); i += 2; }
+                            else if (next == 's') { literals += " \\t\\n\\r"; i += 2; }
+                            else if (next == 'S') { neg_parts.push_back(" \\t\\n\\r"); i += 2; }
+                            else if (next == 'b' || next == 'B') { i += 2; } // no GBNF equivalent, skip
+                            else { literals += sub_pattern.substr(i, 2); i += 2; }
                         } else {
-                            square_brackets += sub_pattern[i];
+                            literals += sub_pattern[i];
                             i++;
                         }
                     }
                     if (i >= length) {
                         _errors.push_back("Unbalanced square brackets");
                     }
-                    square_brackets += ']';
                     i++;
-                    seq.emplace_back(square_brackets, false);
+
+                    if (neg_parts.empty()) {
+                        // No negated shorthands: emit a single bracket class.
+                        seq.emplace_back((outer_neg ? "[^" : "[") + literals + "]", false);
+                    } else if (!outer_neg) {
+                        // Mix of positive and negated content: emit alternation.
+                        // e.g. [\d\W] -> ([0-9] | [^a-zA-Z0-9_])
+                        // e.g. [\s\S] -> ([ \t\n\r] | [^ \t\n\r])
+                        std::vector<std::string> parts;
+                        if (!literals.empty()) {
+                            parts.push_back("[" + literals + "]");
+                        }
+                        for (const std::string & neg : neg_parts) {
+                            parts.push_back("[^" + neg + "]");
+                        }
+                        if (parts.size() == 1) {
+                            seq.emplace_back(parts[0], false);
+                        } else {
+                            std::string alt = "(" + parts[0];
+                            for (size_t pi = 1; pi < parts.size(); pi++) {
+                                alt += " | " + parts[pi];
+                            }
+                            seq.emplace_back(alt + ")", false);
+                        }
+                    } else if (literals.empty() && neg_parts.size() == 1) {
+                        // Outer-negated class holding only one negated shorthand,
+                        // e.g. [^\D]: the two negations cancel out -> [0-9].
+                        seq.emplace_back("[" + neg_parts[0] + "]", false);
+                    } else {
+                        // e.g. [^a\D] or [^\D\W]: the result is an intersection of
+                        // sets, which one GBNF character class cannot express.
+                        // Report it instead of emitting an inverted class; the
+                        // item pushed below only keeps `seq` non-empty.
+                        _errors.push_back("Negated shorthand (\\D, \\W, \\S) inside a negated character class is not supported");
+                        seq.emplace_back("[" + neg_parts[0] + "]", false);
+                    }
                 } else if (c == '|') {
                     seq.emplace_back("|", false);
                     i++;
diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp
index 9d49e8d8030..5a61dbf4918 100755
--- a/tests/test-json-schema-to-grammar.cpp
+++ b/tests/test-json-schema-to-grammar.cpp
@@ -1734,6 +1734,71 @@ int main() {
         )""",
     });
 
+    run({
+        SUCCESS,
+        R"(regexp [\d\W] mixed pos-and-neg class)",
+        R"""({
+            "type": "string",
+            "pattern": "^[\\d\\W]+$"
+        })""",
+        R"""(
+            root ::= "\"" (([0-9] | [^a-zA-Z0-9_])+) "\"" space
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""",
+    });
+
+    run({
+        SUCCESS,
+        R"(regexp [\s\S] any-char class)",
+        R"""({
+            "type": "string",
+            "pattern": "^[\\s\\S]+$"
+        })""",
+        R"""(
+            root ::= "\"" (([ \t\n\r] | [^ \t\n\r])+) "\"" space
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""",
+    });
+
+    run({
+        SUCCESS,
+        R"(regexp [a-z\D] literal-and-neg class)",
+        R"""({
+            "type": "string",
+            "pattern": "^[a-z\\D]+$"
+        })""",
+        R"""(
+            root ::= "\"" (([a-z] | [^0-9])+) "\"" space
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""",
+    });
+
+    run({
+        SUCCESS,
+        R"(regexp [\d\w\D] multi-shorthand mixed class)",
+        R"""({
+            "type": "string",
+            "pattern": "^[\\d\\w\\D]+$"
+        })""",
+        R"""(
+            root ::= "\"" (([0-9a-zA-Z0-9_] | [^0-9])+) "\"" space
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""",
+    });
+
+    run({
+        SUCCESS,
+        R"(regexp [^\D] double-negated class)",
+        R"""({
+            "type": "string",
+            "pattern": "^[^\\D]+$"
+        })""",
+        R"""(
+            root ::= "\"" ([0-9]+) "\"" space
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""",
+    });
+
     run({
         SUCCESS,
         R"(regexp \B boundary skipped)",