From 7241d8fdc3bd08f93144ce415704eed4456875db Mon Sep 17 00:00:00 2001
From: iOptimizeThings <aoun.maz@gmail.com>
Date: Wed, 20 May 2026 15:38:23 -0700
Subject: [PATCH 1/3] json-schema-to-grammar: expand PCRE shorthands (\d \w \s
 and negations) to GBNF equivalents; drop \b/\B

---
 common/json-schema-to-grammar.cpp     |  32 ++++-
 tests/test-json-schema-to-grammar.cpp | 182 ++++++++++++++++++++++++++
 2 files changed, 212 insertions(+), 2 deletions(-)

diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index e2c4d6ce22e..9e534f85967 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -448,8 +448,22 @@ class common_schema_converter {
                     i++;
                     while (i < length && sub_pattern[i] != ']') {
                         if (sub_pattern[i] == '\\') {
-                            square_brackets += sub_pattern.substr(i, 2);
-                            i += 2;
+                            char next = sub_pattern[i + 1];
+                            if (next == 'd' || next == 'D') {
+                                square_brackets += next == 'd' ? "0-9" : "^0-9";
+                                i += 2;
+                            } else if (next == 'w' || next == 'W') {
+                                square_brackets += next == 'w' ? "a-zA-Z0-9_" : "^a-zA-Z0-9_";
+                                i += 2;
+                            } else if (next == 's' || next == 'S') {
+                                square_brackets += next == 's' ? " \\t\\n\\r" : "^ \\t\\n\\r";
+                                i += 2;
+                            } else if (next == 'b' || next == 'B') {
+                                i += 2;  // word boundary - no GBNF equivalent, skip
+                            } else {
+                                square_brackets += sub_pattern.substr(i, 2);
+                                i += 2;
+                            }
                         } else {
                             square_brackets += sub_pattern[i];
                             i++;
@@ -529,6 +543,20 @@ class common_schema_converter {
                                 i++;
                                 literal += sub_pattern[i];
                                 i++;
+                            } else if (next == 'd' || next == 'D') {
+                                if (!literal.empty()) { seq.emplace_back(literal, true); literal = ""; }
+                                seq.emplace_back(next == 'd' ? "[0-9]" : "[^0-9]", false);
+                                i += 2;
+                            } else if (next == 'w' || next == 'W') {
+                                if (!literal.empty()) { seq.emplace_back(literal, true); literal = ""; }
+                                seq.emplace_back(next == 'w' ? "[a-zA-Z0-9_]" : "[^a-zA-Z0-9_]", false);
+                                i += 2;
+                            } else if (next == 's' || next == 'S') {
+                                if (!literal.empty()) { seq.emplace_back(literal, true); literal = ""; }
+                                seq.emplace_back(next == 's' ? "[ \\t\\n\\r]" : "[^ \\t\\n\\r]", false);
+                                i += 2;
+                            } else if (next == 'b' || next == 'B') {
+                                i += 2;  // word boundary - no GBNF equivalent, skip
                             } else {
                                 literal += sub_pattern.substr(i, 2);
                                 i += 2;
diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp
index b4362852c39..9d49e8d8030 100755
--- a/tests/test-json-schema-to-grammar.cpp
+++ b/tests/test-json-schema-to-grammar.cpp
@@ -1564,6 +1564,188 @@ int main() {
                 space ::= | " " | "\n"{1,2} [ \t]{0,20}
             )""",
         });
+
+        run({
+            SUCCESS,
+            R"(regexp \d shorthand)",
+            R"""({
+                "type": "string",
+                "pattern": "^\\d+$"
+            })""",
+            R"""(
+                root ::= "\"" ([0-9]+) "\"" space
+                space ::= | " " | "\n"{1,2} [ \t]{0,20}
+            )""",
+        });
+
+        run({
+            SUCCESS,
+            R"(regexp [\d] class)",
+            R"""({
+                "type": "string",
+                "pattern": "^[\\d]+$"
+            })""",
+            R"""(
+                root ::= "\"" ([0-9]+) "\"" space
+                space ::= | " " | "\n"{1,2} [ \t]{0,20}
+            )""",
+        });
+
+        run({
+            SUCCESS,
+            R"(regexp \w shorthand)",
+            R"""({
+                "type": "string",
+                "pattern": "^\\w+$"
+            })""",
+            R"""(
+                root ::= "\"" ([a-zA-Z0-9_]+) "\"" space
+                space ::= | " " | "\n"{1,2} [ \t]{0,20}
+            )""",
+        });
+
+        run({
+            SUCCESS,
+            R"(regexp [\w] class)",
+            R"""({
+                "type": "string",
+                "pattern": "^[\\w]+$"
+            })""",
+            R"""(
+                root ::= "\"" ([a-zA-Z0-9_]+) "\"" space
+                space ::= | " " | "\n"{1,2} [ \t]{0,20}
+            )""",
+        });
+
+        run({
+            SUCCESS,
+            R"(regexp \s shorthand)",
+            R"""({
+                "type": "string",
+                "pattern": "^\\s+$"
+            })""",
+            R"""(
+                root ::= "\"" ([ \t\n\r]+) "\"" space
+                space ::= | " " | "\n"{1,2} [ \t]{0,20}
+            )""",
+        });
+
+        run({
+            SUCCESS,
+            R"(regexp [\s] class)",
+            R"""({
+                "type": "string",
+                "pattern": "^[\\s]+$"
+            })""",
+            R"""(
+                root ::= "\"" ([ \t\n\r]+) "\"" space
+                space ::= | " " | "\n"{1,2} [ \t]{0,20}
+            )""",
+        });
+
+        run({
+            SUCCESS,
+            R"(regexp \b boundary skipped)",
+            R"""({
+                "type": "string",
+                "pattern": "^\\bfoo\\b$"
+            })""",
+            R"""(
+                root ::= "\"" ("foo") "\"" space
+                space ::= | " " | "\n"{1,2} [ \t]{0,20}
+            )""",
+        });
+
+        run({
+            SUCCESS,
+            R"(regexp \D shorthand)",
+            R"""({
+                "type": "string",
+                "pattern": "^\\D+$"
+            })""",
+            R"""(
+                root ::= "\"" ([^0-9]+) "\"" space
+                space ::= | " " | "\n"{1,2} [ \t]{0,20}
+            )""",
+        });
+
+        run({
+            SUCCESS,
+            R"(regexp [\D] class)",
+            R"""({
+                "type": "string",
+                "pattern": "^[\\D]+$"
+            })""",
+            R"""(
+                root ::= "\"" ([^0-9]+) "\"" space
+                space ::= | " " | "\n"{1,2} [ \t]{0,20}
+            )""",
+        });
+
+        run({
+            SUCCESS,
+            R"(regexp \W shorthand)",
+            R"""({
+                "type": "string",
+                "pattern": "^\\W+$"
+            })""",
+            R"""(
+                root ::= "\"" ([^a-zA-Z0-9_]+) "\"" space
+                space ::= | " " | "\n"{1,2} [ \t]{0,20}
+            )""",
+        });
+
+        run({
+            SUCCESS,
+            R"(regexp [\W] class)",
+            R"""({
+                "type": "string",
+                "pattern": "^[\\W]+$"
+            })""",
+            R"""(
+                root ::= "\"" ([^a-zA-Z0-9_]+) "\"" space
+                space ::= | " " | "\n"{1,2} [ \t]{0,20}
+            )""",
+        });
+
+        run({
+            SUCCESS,
+            R"(regexp \S shorthand)",
+            R"""({
+                "type": "string",
+                "pattern": "^\\S+$"
+            })""",
+            R"""(
+                root ::= "\"" ([^ \t\n\r]+) "\"" space
+                space ::= | " " | "\n"{1,2} [ \t]{0,20}
+            )""",
+        });
+
+        run({
+            SUCCESS,
+            R"(regexp [\S] class)",
+            R"""({
+                "type": "string",
+                "pattern": "^[\\S]+$"
+            })""",
+            R"""(
+                root ::= "\"" ([^ \t\n\r]+) "\"" space
+                space ::= | " " | "\n"{1,2} [ \t]{0,20}
+            )""",
+        });
+
+        run({
+            SUCCESS,
+            R"(regexp \B boundary skipped)",
+            R"""({
+                "type": "string",
+                "pattern": "^\\Bfoo\\B$"
+            })""",
+            R"""(
+                root ::= "\"" ("foo") "\"" space
+                space ::= | " " | "\n"{1,2} [ \t]{0,20}
+            )""",
+        });
     }
 
     if (getenv("LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR")) {

From 39e4553f0a22150da35c4ba4d0bcd5b5156175f7 Mon Sep 17 00:00:00 2001
From: iOptimizeThings <aoun.maz@gmail.com>
Date: Thu, 21 May 2026 15:59:20 -0700
Subject: [PATCH 2/3] json-schema-to-grammar: fix missing bounds check before
 reading next char

---
 common/json-schema-to-grammar.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index 9e534f85967..26d7c180e35 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -447,7 +447,7 @@ class common_schema_converter {
                     std::string square_brackets = std::string(1, c);
                     i++;
                     while (i < length && sub_pattern[i] != ']') {
-                        if (sub_pattern[i] == '\\') {
+                        if (sub_pattern[i] == '\\' && i + 1 < length) {
                             char next = sub_pattern[i + 1];
                             if (next == 'd' || next == 'D') {
                                 square_brackets += next == 'd' ? "0-9" : "^0-9";

From 86357c0ae036b11b494412684ba8d4fa4a2ebebf Mon Sep 17 00:00:00 2001
From: iOptimizeThings <aoun.maz@gmail.com>
Date: Thu, 21 May 2026 21:57:38 -0700
Subject: [PATCH 3/3] json-schema-to-grammar: fix negated shorthands (\D \W \S)
 in bracket classes

---
 common/json-schema-to-grammar.cpp     | 71 ++++++++++++++++++++-------
 tests/test-json-schema-to-grammar.cpp | 52 ++++++++++++++++++++
 2 files changed, 104 insertions(+), 19 deletions(-)

diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp
index 26d7c180e35..863b7fc4287 100644
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -444,37 +444,70 @@ class common_schema_converter {
                     }
                     return join_seq();
                 } else if (c == '[') {
-                    std::string square_brackets = std::string(1, c);
                     i++;
+                    bool outer_neg = (i < length && sub_pattern[i] == '^');
+                    if (outer_neg) i++;
+
+                    // Collect positive content (literals + positive shorthands) and
+                    // negated shorthands separately. Negated shorthands can't be inlined
+                    // into a single bracket class alongside other content because the ^
+                    // only means negation at position 0 of a class.
+                    std::string literals;
+                    std::vector<std::string> neg_parts;
+
                     while (i < length && sub_pattern[i] != ']') {
                         if (sub_pattern[i] == '\\' && i + 1 < length) {
                             char next = sub_pattern[i + 1];
-                            if (next == 'd' || next == 'D') {
-                                square_brackets += next == 'd' ? "0-9" : "^0-9";
-                                i += 2;
-                            } else if (next == 'w' || next == 'W') {
-                                square_brackets += next == 'w' ? "a-zA-Z0-9_" : "^a-zA-Z0-9_";
-                                i += 2;
-                            } else if (next == 's' || next == 'S') {
-                                square_brackets += next == 's' ? " \\t\\n\\r" : "^ \\t\\n\\r";
-                                i += 2;
-                            } else if (next == 'b' || next == 'B') {
-                                i += 2;  // word boundary - no GBNF equivalent, skip
-                            } else {
-                                square_brackets += sub_pattern.substr(i, 2);
-                                i += 2;
-                            }
+                            if (next == 'd') { literals += "0-9"; i += 2; }
+                            else if (next == 'D') { neg_parts.push_back("0-9"); i += 2; }
+                            else if (next == 'w') { literals += "a-zA-Z0-9_"; i += 2; }
+                            else if (next == 'W') { neg_parts.push_back("a-zA-Z0-9_"); i += 2; }
+                            else if (next == 's') { literals += " \\t\\n\\r"; i += 2; }
+                            else if (next == 'S') { neg_parts.push_back(" \\t\\n\\r"); i += 2; }
+                            else if (next == 'b' || next == 'B') { i += 2; }  // no GBNF equivalent, skip
+                            else { literals += sub_pattern.substr(i, 2); i += 2; }
                         } else {
-                            square_brackets += sub_pattern[i];
+                            literals += sub_pattern[i];
                             i++;
                         }
                     }
                     if (i >= length) {
                         _errors.push_back("Unbalanced square brackets");
                     }
-                    square_brackets += ']';
                     i++;
-                    seq.emplace_back(square_brackets, false);
+
+                    if (neg_parts.empty()) {
+                        // No negated shorthands: emit a single bracket class.
+                        seq.emplace_back((outer_neg ? "[^" : "[") + literals + "]", false);
+                    } else if (!outer_neg) {
+                        // Mix of positive and negated content: emit alternation.
+                        // e.g. [\d\W] -> ([0-9] | [^a-zA-Z0-9_])
+                        // e.g. [\s\S] -> ([ \t\n\r] | [^ \t\n\r])
+                        std::vector<std::string> parts;
+                        if (!literals.empty()) {
+                            parts.push_back("[" + literals + "]");
+                        }
+                        for (const std::string & neg : neg_parts) {
+                            parts.push_back("[^" + neg + "]");
+                        }
+                        if (parts.size() == 1) {
+                            seq.emplace_back(parts[0], false);
+                        } else {
+                            std::string alt = "(" + parts[0];
+                            for (size_t pi = 1; pi < parts.size(); pi++) {
+                                alt += " | " + parts[pi];
+                            }
+                            seq.emplace_back(alt + ")", false);
+                        }
+                    } else {
+                        // Outer-negated class containing negated shorthands, e.g. [^\D\w]: all expansions
+                        // are concatenated under one ^ (inexact: [^\D] yields [^0-9], yet it matches digits).
+                        std::string combined = literals;
+                        for (const std::string & neg : neg_parts) {
+                            combined += neg;
+                        }
+                        seq.emplace_back("[^" + combined + "]", false);
+                    }
                 } else if (c == '|') {
                     seq.emplace_back("|", false);
                     i++;
diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp
index 9d49e8d8030..5a61dbf4918 100755
--- a/tests/test-json-schema-to-grammar.cpp
+++ b/tests/test-json-schema-to-grammar.cpp
@@ -1734,6 +1734,58 @@ int main() {
             )""",
         });
 
+        run({
+            SUCCESS,
+            R"(regexp [\d\W] mixed pos-and-neg class)",
+            R"""({
+                "type": "string",
+                "pattern": "^[\\d\\W]+$"
+            })""",
+            R"""(
+                root ::= "\"" (([0-9] | [^a-zA-Z0-9_])+) "\"" space
+                space ::= | " " | "\n"{1,2} [ \t]{0,20}
+            )""",
+        });
+
+        run({
+            SUCCESS,
+            R"(regexp [\s\S] any-char class)",
+            R"""({
+                "type": "string",
+                "pattern": "^[\\s\\S]+$"
+            })""",
+            R"""(
+                root ::= "\"" (([ \t\n\r] | [^ \t\n\r])+) "\"" space
+                space ::= | " " | "\n"{1,2} [ \t]{0,20}
+            )""",
+        });
+
+        run({
+            SUCCESS,
+            R"(regexp [a-z\D] literal-and-neg class)",
+            R"""({
+                "type": "string",
+                "pattern": "^[a-z\\D]+$"
+            })""",
+            R"""(
+                root ::= "\"" (([a-z] | [^0-9])+) "\"" space
+                space ::= | " " | "\n"{1,2} [ \t]{0,20}
+            )""",
+        });
+
+        run({
+            SUCCESS,
+            R"(regexp [\d\w\D] multi-shorthand mixed class)",
+            R"""({
+                "type": "string",
+                "pattern": "^[\\d\\w\\D]+$"
+            })""",
+            R"""(
+                root ::= "\"" (([0-9a-zA-Z0-9_] | [^0-9])+) "\"" space
+                space ::= | " " | "\n"{1,2} [ \t]{0,20}
+            )""",
+        });
+
         run({
             SUCCESS,
             R"(regexp \B boundary skipped)",