Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ICU-22898 MF2: fix various parser bugs and add more tests #3092

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
620 changes: 321 additions & 299 deletions icu4c/source/i18n/messageformat2_parser.cpp

Large diffs are not rendered by default.

13 changes: 12 additions & 1 deletion icu4c/source/i18n/messageformat2_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -140,10 +140,21 @@ namespace message2 {
SelectorKeys parseNonEmptyKeys(UErrorCode&);
void errorPattern(UErrorCode& status);
Pattern parseQuotedPattern(UErrorCode&);
bool isDeclarationStart();

UChar32 peek() const { return source.char32At(index) ; }
UChar32 peek(uint32_t i) const {
return source.char32At(source.moveIndex32(index, i));
}
void next() { index = source.moveIndex32(index, 1); }

bool inBounds() const { return (int32_t) index < source.length(); }
bool inBounds(uint32_t i) const { return source.moveIndex32(index, i) < source.length(); }
bool allConsumed() const { return (int32_t) index == source.length(); }

// The input string
const UnicodeString &source;
// The current position within the input string
// The current position within the input string -- counting in UChar32
uint32_t index;
// Represents the current line (and when an error is indicated),
// character offset within the line of the parse error
Expand Down
29 changes: 16 additions & 13 deletions icu4c/source/i18n/messageformat2_serializer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,24 +42,26 @@ void Serializer::emit(const std::u16string_view& token) {
void Serializer::emit(const Literal& l) {
if (l.isQuoted()) {
emit(PIPE);
const UnicodeString& contents = l.unquoted();
for (int32_t i = 0; i < contents.length(); i++) {
// Re-escape any PIPE or BACKSLASH characters
}
const UnicodeString& contents = l.unquoted();
for (int32_t i = 0; ((int32_t) i) < contents.length(); i++) {
// Re-escape any escaped-char characters
switch(contents[i]) {
case BACKSLASH:
case PIPE: {
emit(BACKSLASH);
break;
case PIPE:
case LEFT_CURLY_BRACE:
case RIGHT_CURLY_BRACE: {
emit(BACKSLASH);
break;
}
default: {
break;
break;
}
}
emit(contents[i]);
}
emit(PIPE);
} else {
emit(l.unquoted());
}
if (l.isQuoted()) {
emit(PIPE);
}
}

Expand Down Expand Up @@ -194,9 +196,10 @@ void Serializer::emit(const PatternPart& part) {
if (part.isText()) {
// Raw text
const UnicodeString& text = part.asText();
// Re-escape '{'/'}'/'\'
for (int32_t i = 0; i < text.length(); i++) {
// Re-escape '{'/'}'/'\''|'
for (int32_t i = 0; ((int32_t) i) < text.length(); i++) {
switch(text[i]) {
case PIPE:
case BACKSLASH:
case LEFT_CURLY_BRACE:
case RIGHT_CURLY_BRACE: {
Expand Down
2 changes: 2 additions & 0 deletions icu4c/source/test/intltest/messageformat2test_read_json.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,8 @@ void TestMessageFormat2::jsonTestsFromFiles(IcuTestErrorCode& errorCode) {
// Do tests for reserved statements/expressions
runTestsFromJsonFile(*this, "spec/unsupported-expressions.json", errorCode);
runTestsFromJsonFile(*this, "spec/unsupported-statements.json", errorCode);
// Uncomment when reserved syntax is removed
// runTestsFromJsonFile(*this, "syntax-errors-reserved.json", errorCode);

// Do valid spec tests
runTestsFromJsonFile(*this, "spec/syntax.json", errorCode);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -613,10 +613,8 @@ private MFDataModel.Variant getVariant() throws MFParseException {
}
keys.add(key);
}
// Only want to skip whitespace if we parsed at least one key
if (!keys.isEmpty()) {
skipOptionalWhitespaces();
}
// Trailing whitespace is allowed after the message
skipOptionalWhitespaces();
if (input.atEnd()) {
checkCondition(
keys.isEmpty(), "After selector keys it is mandatory to have a pattern.");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ public class CoreTest extends CoreTestFmwk {
"syntax-errors-diagnostics.json",
"syntax-errors-diagnostics-multiline.json",
"syntax-errors-end-of-input.json",
"syntax-errors-reserved.json",
"tricky-declarations.json",
"valid-tests.json"};

Expand Down
9 changes: 4 additions & 5 deletions testdata/message2/syntax-errors-diagnostics.json
Original file line number Diff line number Diff line change
Expand Up @@ -136,11 +136,6 @@
"char": 28,
"comment": "Trailing characters that are not whitespace"
},
{
"src": ".match {$foo :string} {$bar :string} one * {{one}} * * {{other}} ",
"char": 66,
"comment": "Trailing whitespace at end of message should not be accepted either"
},
{
"src": "empty { }",
"char": 8,
Expand Down Expand Up @@ -343,6 +338,10 @@
"src": ".match {1} {{_}}",
"char": 12,
"comment": "Disambiguating a wrong .match from an unsupported statement"
},
{
"src": "{{{/p o4.􍅲 = 1}}}",
"char": 9
}
]
}
22 changes: 22 additions & 0 deletions testdata/message2/syntax-errors-reserved.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"scenario": "Reserved and private annotations",
"description": "Tests for old unsupported expression (reserved/private) syntax -- these are now syntax errors",
"defaultTestProperties": {
"locale": "en-US",
"expErrors": [
{
"type": "syntax-error"
}
]
},
"tests": [
{ "src" : "\\\\{ ? 󗟋 𫓜|@} |}" },
{ "src" : "\\\\{ ? 󗟋 𫓜|@} | \\} @𠟅򑚎𥪙𽧫=|| @򒘒򳷦㥞򉊷=$򸚶񽱆񅗽񤕞 @𰺱:񎫛񢶛򶈎񄮒}" },
{ "src" : "{ $iFN ^ @USh =$u @l}" },
{ "src" : ".local $dS ={ $p4 ^ |.| \\\\ @g:FV = $kd}{{@}}" },
{ "src" : ".D. \\\\ ||{1}{{}} " },
{ "src" : ".cIT ||{|@| % \\} } { *󔜫񥘃󸇀 }{{}}" },
{ "src" : ".򱋿󠆿 . |@| {$󛒁񓝖 & \\{ . @𯼼򋼡= $򵀀񓞈}{{\\\\}}" }
]
}

28 changes: 28 additions & 0 deletions testdata/message2/valid-tests.json
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,34 @@
"comment": "message -> complex-message -> *(declaration [s]) complex-body -> declaration complex-body -> input-declaration complex-body -> input variable-expression complex-body",
"src": ".input{$x}{{}}",
"exp": ""
},
{
"comment": "Test that escaped characters are re-escaped properly when serializing literals",
"src": "{|\\}|}",
"exp": "}"
},
{
"comment": "Test that escaped characters are re-escaped properly when serializing patterns",
"src": "\\|",
"exp": "|"
},
{
"comment": "Test that parser and serializer treat optionally-escaped characters consistently",
"src" : "{{{1}|}}",
"exp": "1|"
},
{
"comment": "Trailing whitespace after match is valid",
"src": ".match {1 :string} * {{}} ",
"exp": ""
},
{
"src" : "𴢸",
"exp" : "𴢸"
},
{
"src" : "{{􍅲}}",
"exp" : "􍅲"
}
]
}