diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index e2c4d6ce22e..863b7fc4287 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -444,23 +444,70 @@ class common_schema_converter { } return join_seq(); } else if (c == '[') { - std::string square_brackets = std::string(1, c); i++; + bool outer_neg = (i < length && sub_pattern[i] == '^'); + if (outer_neg) i++; + + // Collect positive content (literals + positive shorthands) and + // negated shorthands separately. Negated shorthands can't be inlined + // into a single bracket class alongside other content because the ^ + // only means negation at position 0 of a class. + std::string literals; + std::vector neg_parts; + while (i < length && sub_pattern[i] != ']') { - if (sub_pattern[i] == '\\') { - square_brackets += sub_pattern.substr(i, 2); - i += 2; + if (sub_pattern[i] == '\\' && i + 1 < length) { + char next = sub_pattern[i + 1]; + if (next == 'd') { literals += "0-9"; i += 2; } + else if (next == 'D') { neg_parts.push_back("0-9"); i += 2; } + else if (next == 'w') { literals += "a-zA-Z0-9_"; i += 2; } + else if (next == 'W') { neg_parts.push_back("a-zA-Z0-9_"); i += 2; } + else if (next == 's') { literals += " \\t\\n\\r"; i += 2; } + else if (next == 'S') { neg_parts.push_back(" \\t\\n\\r"); i += 2; } + else if (next == 'b' || next == 'B') { i += 2; } // no GBNF equivalent, skip + else { literals += sub_pattern.substr(i, 2); i += 2; } } else { - square_brackets += sub_pattern[i]; + literals += sub_pattern[i]; i++; } } if (i >= length) { _errors.push_back("Unbalanced square brackets"); } - square_brackets += ']'; i++; - seq.emplace_back(square_brackets, false); + + if (neg_parts.empty()) { + // No negated shorthands: emit a single bracket class. + seq.emplace_back((outer_neg ? "[^" : "[") + literals + "]", false); + } else if (!outer_neg) { + // Mix of positive and negated content: emit alternation. + // e.g. [\d\W] -> ([0-9] | [^a-zA-Z0-9_]) + // e.g. [\s\S] -> ([ \t\n\r] | [^ \t\n\r]) + std::vector parts; + if (!literals.empty()) { + parts.push_back("[" + literals + "]"); + } + for (const std::string & neg : neg_parts) { + parts.push_back("[^" + neg + "]"); + } + if (parts.size() == 1) { + seq.emplace_back(parts[0], false); + } else { + std::string alt = "(" + parts[0]; + for (size_t pi = 1; pi < parts.size(); pi++) { + alt += " | " + parts[pi]; + } + seq.emplace_back(alt + ")", false); + } + } else { + // Outer-negated class with negated shorthands, e.g. [^\D\w]. + // Concatenate all expansions under the outer ^ (best effort). + std::string combined = literals; + for (const std::string & neg : neg_parts) { + combined += neg; + } + seq.emplace_back("[^" + combined + "]", false); + } } else if (c == '|') { seq.emplace_back("|", false); i++; @@ -529,6 +576,20 @@ class common_schema_converter { i++; literal += sub_pattern[i]; i++; + } else if (next == 'd' || next == 'D') { + if (!literal.empty()) { seq.emplace_back(literal, true); literal = ""; } + seq.emplace_back(next == 'd' ? "[0-9]" : "[^0-9]", false); + i += 2; + } else if (next == 'w' || next == 'W') { + if (!literal.empty()) { seq.emplace_back(literal, true); literal = ""; } + seq.emplace_back(next == 'w' ? "[a-zA-Z0-9_]" : "[^a-zA-Z0-9_]", false); + i += 2; + } else if (next == 's' || next == 'S') { + if (!literal.empty()) { seq.emplace_back(literal, true); literal = ""; } + seq.emplace_back(next == 's' ? "[ \\t\\n\\r]" : "[^ \\t\\n\\r]", false); + i += 2; + } else if (next == 'b' || next == 'B') { + i += 2; // word boundary - no GBNF equivalent, skip } else { literal += sub_pattern.substr(i, 2); i += 2; diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index b4362852c39..5a61dbf4918 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -1564,6 +1564,240 @@ int main() { space ::= | " " | "\n"{1,2} [ \t]{0,20} )""", }); + + run({ + SUCCESS, + R"(regexp \d shorthand)", + R"""({ + "type": "string", + "pattern": "^\\d+$" + })""", + R"""( + root ::= "\"" ([0-9]+) "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", + }); + + run({ + SUCCESS, + R"(regexp [\d] class)", + R"""({ + "type": "string", + "pattern": "^[\\d]+$" + })""", + R"""( + root ::= "\"" ([0-9]+) "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", + }); + + run({ + SUCCESS, + R"(regexp \w shorthand)", + R"""({ + "type": "string", + "pattern": "^\\w+$" + })""", + R"""( + root ::= "\"" ([a-zA-Z0-9_]+) "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", + }); + + run({ + SUCCESS, + R"(regexp [\w] class)", + R"""({ + "type": "string", + "pattern": "^[\\w]+$" + })""", + R"""( + root ::= "\"" ([a-zA-Z0-9_]+) "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", + }); + + run({ + SUCCESS, + R"(regexp \s shorthand)", + R"""({ + "type": "string", + "pattern": "^\\s+$" + })""", + R"""( + root ::= "\"" ([ \t\n\r]+) "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", + }); + + run({ + SUCCESS, + R"(regexp [\s] class)", + R"""({ + "type": "string", + "pattern": "^[\\s]+$" + })""", + R"""( + root ::= "\"" ([ \t\n\r]+) "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", + }); + + run({ + SUCCESS, + R"(regexp \b boundary skipped)", + R"""({ + "type": "string", + "pattern": "^\\bfoo\\b$" + })""", + R"""( + root ::= "\"" ("foo") "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", + }); + + run({ + SUCCESS, + R"(regexp \D shorthand)", + R"""({ + "type": "string", + "pattern": "^\\D+$" + })""", + R"""( + root ::= "\"" ([^0-9]+) "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", + }); + + run({ + SUCCESS, + R"(regexp [\D] class)", + R"""({ + "type": "string", + "pattern": "^[\\D]+$" + })""", + R"""( + root ::= "\"" ([^0-9]+) "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", + }); + + run({ + SUCCESS, + R"(regexp \W shorthand)", + R"""({ + "type": "string", + "pattern": "^\\W+$" + })""", + R"""( + root ::= "\"" ([^a-zA-Z0-9_]+) "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", + }); + + run({ + SUCCESS, + R"(regexp [\W] class)", + R"""({ + "type": "string", + "pattern": "^[\\W]+$" + })""", + R"""( + root ::= "\"" ([^a-zA-Z0-9_]+) "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", + }); + + run({ + SUCCESS, + R"(regexp \S shorthand)", + R"""({ + "type": "string", + "pattern": "^\\S+$" + })""", + R"""( + root ::= "\"" ([^ \t\n\r]+) "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", + }); + + run({ + SUCCESS, + R"(regexp [\S] class)", + R"""({ + "type": "string", + "pattern": "^[\\S]+$" + })""", + R"""( + root ::= "\"" ([^ \t\n\r]+) "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", + }); + + run({ + SUCCESS, + R"(regexp [\d\W] mixed pos-and-neg class)", + R"""({ + "type": "string", + "pattern": "^[\\d\\W]+$" + })""", + R"""( + root ::= "\"" (([0-9] | [^a-zA-Z0-9_])+) "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", + }); + + run({ + SUCCESS, + R"(regexp [\s\S] any-char class)", + R"""({ + "type": "string", + "pattern": "^[\\s\\S]+$" + })""", + R"""( + root ::= "\"" (([ \t\n\r] | [^ \t\n\r])+) "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", + }); + + run({ + SUCCESS, + R"(regexp [a-z\D] literal-and-neg class)", + R"""({ + "type": "string", + "pattern": "^[a-z\\D]+$" + })""", + R"""( + root ::= "\"" (([a-z] | [^0-9])+) "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", + }); + + run({ + SUCCESS, + R"(regexp [\d\w\D] multi-shorthand mixed class)", + R"""({ + "type": "string", + "pattern": "^[\\d\\w\\D]+$" + })""", + R"""( + root ::= "\"" (([0-9a-zA-Z0-9_] | [^0-9])+) "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", + }); + + run({ + SUCCESS, + R"(regexp \B boundary skipped)", + R"""({ + "type": "string", + "pattern": "^\\Bfoo\\B$" + })""", + R"""( + root ::= "\"" ("foo") "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", + }); } if (getenv("LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR")) {