Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 68 additions & 7 deletions common/json-schema-to-grammar.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -444,23 +444,70 @@ class common_schema_converter {
}
return join_seq();
} else if (c == '[') {
std::string square_brackets = std::string(1, c);
i++;
bool outer_neg = (i < length && sub_pattern[i] == '^');
if (outer_neg) i++;

// Collect positive content (literals + positive shorthands) and
// negated shorthands separately. Negated shorthands can't be inlined
// into a single bracket class alongside other content because the ^
// only means negation at position 0 of a class.
std::string literals;
std::vector<std::string> neg_parts;

while (i < length && sub_pattern[i] != ']') {
if (sub_pattern[i] == '\\') {
square_brackets += sub_pattern.substr(i, 2);
i += 2;
if (sub_pattern[i] == '\\' && i + 1 < length) {
char next = sub_pattern[i + 1];
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Needs a check for `i + 1 < length` before reading `sub_pattern[i + 1]`.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed, added i + 1 < length guard.

if (next == 'd') { literals += "0-9"; i += 2; }
else if (next == 'D') { neg_parts.push_back("0-9"); i += 2; }
else if (next == 'w') { literals += "a-zA-Z0-9_"; i += 2; }
else if (next == 'W') { neg_parts.push_back("a-zA-Z0-9_"); i += 2; }
else if (next == 's') { literals += " \\t\\n\\r"; i += 2; }
else if (next == 'S') { neg_parts.push_back(" \\t\\n\\r"); i += 2; }
else if (next == 'b' || next == 'B') { i += 2; } // no GBNF equivalent, skip
else { literals += sub_pattern.substr(i, 2); i += 2; }
} else {
square_brackets += sub_pattern[i];
literals += sub_pattern[i];
i++;
}
}
if (i >= length) {
_errors.push_back("Unbalanced square brackets");
}
square_brackets += ']';
i++;
seq.emplace_back(square_brackets, false);

if (neg_parts.empty()) {
// No negated shorthands: emit a single bracket class.
seq.emplace_back((outer_neg ? "[^" : "[") + literals + "]", false);
} else if (!outer_neg) {
// Mix of positive and negated content: emit alternation.
// e.g. [\d\W] -> ([0-9] | [^a-zA-Z0-9_])
// e.g. [\s\S] -> ([ \t\n\r] | [^ \t\n\r])
std::vector<std::string> parts;
if (!literals.empty()) {
parts.push_back("[" + literals + "]");
}
for (const std::string & neg : neg_parts) {
parts.push_back("[^" + neg + "]");
}
if (parts.size() == 1) {
seq.emplace_back(parts[0], false);
} else {
std::string alt = "(" + parts[0];
for (size_t pi = 1; pi < parts.size(); pi++) {
alt += " | " + parts[pi];
}
seq.emplace_back(alt + ")", false);
}
} else {
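// Outer-negated class with negated shorthands, e.g. [^\D\w].
// Best effort, not an exact translation: each negated shorthand's range is
// appended as-is after the positive content under the outer ^, so [^\D\w]
// is emitted as [^a-zA-Z0-9_0-9].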
std::string combined = literals;
for (const std::string & neg : neg_parts) {
combined += neg;
}
seq.emplace_back("[^" + combined + "]", false);
}
} else if (c == '|') {
seq.emplace_back("|", false);
i++;
Expand Down Expand Up @@ -529,6 +576,20 @@ class common_schema_converter {
i++;
literal += sub_pattern[i];
i++;
} else if (next == 'd' || next == 'D') {
if (!literal.empty()) { seq.emplace_back(literal, true); literal = ""; }
seq.emplace_back(next == 'd' ? "[0-9]" : "[^0-9]", false);
i += 2;
} else if (next == 'w' || next == 'W') {
if (!literal.empty()) { seq.emplace_back(literal, true); literal = ""; }
seq.emplace_back(next == 'w' ? "[a-zA-Z0-9_]" : "[^a-zA-Z0-9_]", false);
i += 2;
} else if (next == 's' || next == 'S') {
if (!literal.empty()) { seq.emplace_back(literal, true); literal = ""; }
seq.emplace_back(next == 's' ? "[ \\t\\n\\r]" : "[^ \\t\\n\\r]", false);
i += 2;
} else if (next == 'b' || next == 'B') {
i += 2; // word boundary - no GBNF equivalent, skip
} else {
literal += sub_pattern.substr(i, 2);
i += 2;
Expand Down
234 changes: 234 additions & 0 deletions tests/test-json-schema-to-grammar.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1564,6 +1564,240 @@ int main() {
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""",
});

run({
SUCCESS,
R"(regexp \d shorthand)",
R"""({
"type": "string",
"pattern": "^\\d+$"
})""",
R"""(
root ::= "\"" ([0-9]+) "\"" space
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""",
});

run({
SUCCESS,
R"(regexp [\d] class)",
R"""({
"type": "string",
"pattern": "^[\\d]+$"
})""",
R"""(
root ::= "\"" ([0-9]+) "\"" space
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""",
});

run({
SUCCESS,
R"(regexp \w shorthand)",
R"""({
"type": "string",
"pattern": "^\\w+$"
})""",
R"""(
root ::= "\"" ([a-zA-Z0-9_]+) "\"" space
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""",
});

run({
SUCCESS,
R"(regexp [\w] class)",
R"""({
"type": "string",
"pattern": "^[\\w]+$"
})""",
R"""(
root ::= "\"" ([a-zA-Z0-9_]+) "\"" space
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""",
});

run({
SUCCESS,
R"(regexp \s shorthand)",
R"""({
"type": "string",
"pattern": "^\\s+$"
})""",
R"""(
root ::= "\"" ([ \t\n\r]+) "\"" space
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""",
});

run({
SUCCESS,
R"(regexp [\s] class)",
R"""({
"type": "string",
"pattern": "^[\\s]+$"
})""",
R"""(
root ::= "\"" ([ \t\n\r]+) "\"" space
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""",
});

run({
SUCCESS,
R"(regexp \b boundary skipped)",
R"""({
"type": "string",
"pattern": "^\\bfoo\\b$"
})""",
R"""(
root ::= "\"" ("foo") "\"" space
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""",
});

run({
SUCCESS,
R"(regexp \D shorthand)",
R"""({
"type": "string",
"pattern": "^\\D+$"
})""",
R"""(
root ::= "\"" ([^0-9]+) "\"" space
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""",
});

run({
SUCCESS,
R"(regexp [\D] class)",
R"""({
"type": "string",
"pattern": "^[\\D]+$"
})""",
R"""(
root ::= "\"" ([^0-9]+) "\"" space
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""",
});

run({
SUCCESS,
R"(regexp \W shorthand)",
R"""({
"type": "string",
"pattern": "^\\W+$"
})""",
R"""(
root ::= "\"" ([^a-zA-Z0-9_]+) "\"" space
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""",
});

run({
SUCCESS,
R"(regexp [\W] class)",
R"""({
"type": "string",
"pattern": "^[\\W]+$"
})""",
R"""(
root ::= "\"" ([^a-zA-Z0-9_]+) "\"" space
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""",
});

run({
SUCCESS,
R"(regexp \S shorthand)",
R"""({
"type": "string",
"pattern": "^\\S+$"
})""",
R"""(
root ::= "\"" ([^ \t\n\r]+) "\"" space
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""",
});

run({
SUCCESS,
R"(regexp [\S] class)",
R"""({
"type": "string",
"pattern": "^[\\S]+$"
})""",
R"""(
root ::= "\"" ([^ \t\n\r]+) "\"" space
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""",
});

run({
SUCCESS,
R"(regexp [\d\W] mixed pos-and-neg class)",
R"""({
"type": "string",
"pattern": "^[\\d\\W]+$"
})""",
R"""(
root ::= "\"" (([0-9] | [^a-zA-Z0-9_])+) "\"" space
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""",
});

run({
SUCCESS,
R"(regexp [\s\S] any-char class)",
R"""({
"type": "string",
"pattern": "^[\\s\\S]+$"
})""",
R"""(
root ::= "\"" (([ \t\n\r] | [^ \t\n\r])+) "\"" space
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""",
});

run({
SUCCESS,
R"(regexp [a-z\D] literal-and-neg class)",
R"""({
"type": "string",
"pattern": "^[a-z\\D]+$"
})""",
R"""(
root ::= "\"" (([a-z] | [^0-9])+) "\"" space
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""",
});

run({
SUCCESS,
R"(regexp [\d\w\D] multi-shorthand mixed class)",
R"""({
"type": "string",
"pattern": "^[\\d\\w\\D]+$"
})""",
R"""(
root ::= "\"" (([0-9a-zA-Z0-9_] | [^0-9])+) "\"" space
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""",
});

run({
SUCCESS,
R"(regexp \B boundary skipped)",
R"""({
"type": "string",
"pattern": "^\\Bfoo\\B$"
})""",
R"""(
root ::= "\"" ("foo") "\"" space
space ::= | " " | "\n"{1,2} [ \t]{0,20}
)""",
});
}

if (getenv("LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR")) {
Expand Down