From 4b376f13642529939d9c59175fcc8e1527de7d55 Mon Sep 17 00:00:00 2001 From: Tim Chevalier Date: Tue, 30 Jul 2024 16:16:05 -0700 Subject: [PATCH] ICU-22898 MF2 parser bug fixes ICU4C: Escape curly braces when serializing and normalizing ICU4C: Escape '|' in patterns ICU4C: When normalizing input, escape optionally-escaped characters in patterns ICU4C/ICU4J: Allow trailing whitespace after a match ICU4C: Fix parser to iterate over code points, not code units Add tests with old reserved syntax as syntax-error tests --- icu4c/source/i18n/messageformat2_parser.cpp | 620 +++++++++--------- icu4c/source/i18n/messageformat2_parser.h | 13 +- .../source/i18n/messageformat2_serializer.cpp | 29 +- .../intltest/messageformat2test_read_json.cpp | 2 + .../java/com/ibm/icu/message2/MFParser.java | 6 +- .../ibm/icu/dev/test/message2/CoreTest.java | 1 + .../message2/syntax-errors-diagnostics.json | 9 +- testdata/message2/syntax-errors-reserved.json | 22 + testdata/message2/valid-tests.json | 28 + 9 files changed, 408 insertions(+), 322 deletions(-) create mode 100644 testdata/message2/syntax-errors-reserved.json diff --git a/icu4c/source/i18n/messageformat2_parser.cpp b/icu4c/source/i18n/messageformat2_parser.cpp index 462ee6cc22eb..ad11324a964d 100644 --- a/icu4c/source/i18n/messageformat2_parser.cpp +++ b/icu4c/source/i18n/messageformat2_parser.cpp @@ -24,21 +24,22 @@ using namespace data_model; The `ERROR()` macro sets a syntax error in the context and sets the offset in `parseError` to `index`. It does not alter control flow. */ -#define ERROR(parseError, errorCode, index) \ +#define ERROR(errorCode) \ if (!errors.hasSyntaxError()) { \ setParseError(parseError, index); \ errors.addSyntaxError(errorCode); \ } -// Returns true iff `index` is a valid index for the string `source` -static bool inBounds(const UnicodeString &source, uint32_t index) { - return static_cast(index) < source.length(); -} +#define ERROR_AT(errorCode, i) \ + if (!errors.hasSyntaxError()) { \ + setParseError(parseError, i); \ + errors.addSyntaxError(errorCode); \ + } // Increments the line number and updates the "characters seen before -// current line" count in `parseError`, iff `source[index]` is a newline +// current line" count in `parseError`, iff `peek()` is a newline void Parser::maybeAdvanceLine() { - if (source[index] == LF) { + if (peek() == LF) { parseError.line++; // add 1 to index to get the number of characters seen so far // (including the newline) @@ -50,10 +51,15 @@ void Parser::maybeAdvanceLine() { Signals an error and returns either if `parseError` already denotes an error, or `index` is out of bounds for the string `source` */ -#define CHECK_BOUNDS(source, index, parseError, errorCode) \ - if (!inBounds(source, index)) { \ - ERROR(parseError, errorCode, index); \ - return; \ +#define CHECK_BOUNDS(errorCode) \ + if (!inBounds()) { \ + ERROR(errorCode); \ + return; \ + } +#define CHECK_BOUNDS_1(errorCode) \ + if (!inBounds(1)) { \ + ERROR_AT(errorCode, index + 1); \ + return; \ } // ------------------------------------- @@ -203,6 +209,13 @@ static bool isQuotedChar(UChar32 c) { || c == RIGHT_CURLY_BRACE; } +static bool isEscapableChar(UChar32 c) { + return c == PIPE + || c == BACKSLASH + || c == LEFT_CURLY_BRACE + || c == RIGHT_CURLY_BRACE; +} + // Returns true iff `c` can begin a `function` nonterminal static bool isFunctionStart(UChar32 c) { switch (c) { @@ -247,15 +260,13 @@ static bool isKeyStart(UChar32 c) { return (c == ASTERISK || isLiteralStart(c)); } -inline bool isDeclarationStart(const UnicodeString& source, int32_t index) { - int32_t len = source.length(); - int32_t next = index + 1; - return (source[index] == ID_LOCAL[0] - && next < len - && source[next] == ID_LOCAL[1]) - || (source[index] == ID_INPUT[0] - && next < len - && source[next] == ID_INPUT[1]); +bool Parser::isDeclarationStart() { + return (peek() == ID_LOCAL[0] + && inBounds(1) + && peek(1) == ID_LOCAL[1]) + || (peek() == ID_INPUT[0] + && inBounds(1) + && peek(1) == ID_INPUT[1]); } // ------------------------------------- @@ -375,14 +386,14 @@ option, or the optional space before an attribute. Unless otherwise noted in a comment, all helper functions that take a `source` string, an `index` unsigned int, and an `errorCode` `UErrorCode` have the precondition: - `index` < `source.length()` + `index` < `len()` and the postcondition: - `U_FAILURE(errorCode)` || `index < `source.length()` + `U_FAILURE(errorCode)` || `index < `len()` */ /* No pre, no post. - A message may end with whitespace, so `index` may equal `source.length()` on exit. + A message may end with whitespace, so `index` may equal `len()` on exit. */ void Parser::parseWhitespaceMaybeRequired(bool required, UErrorCode& errorCode) { bool sawWhitespace = false; @@ -391,7 +402,7 @@ void Parser::parseWhitespaceMaybeRequired(bool required, UErrorCode& errorCode) // or when we see a non-whitespace character. while (true) { // Check if all input has been consumed - if (!inBounds(source, index)) { + if (!inBounds()) { // If whitespace isn't required -- or if we saw it already -- // then the caller is responsible for checking this case and // setting an error if necessary. @@ -401,24 +412,24 @@ void Parser::parseWhitespaceMaybeRequired(bool required, UErrorCode& errorCode) } // Otherwise, whitespace is required; the end of the input has // been reached without whitespace. This is an error. - ERROR(parseError, errorCode, index); + ERROR(errorCode); return; } // Input remains; process the next character if it's whitespace, // exit the loop otherwise - if (isWhitespace(source[index])) { + if (isWhitespace(peek())) { sawWhitespace = true; // Increment line number in parse error if we consume a newline maybeAdvanceLine(); - index++; + next(); } else { break; } } if (!sawWhitespace && required) { - ERROR(parseError, errorCode, index); + ERROR(errorCode); } } @@ -437,36 +448,36 @@ void Parser::parseOptionalWhitespace(UErrorCode& errorCode) { parseWhitespaceMaybeRequired(false, errorCode); } -// Consumes a single character, signaling an error if `source[index]` != `c` +// Consumes a single character, signaling an error if `peek()` != `c` // No postcondition -- a message can end with a '}' token void Parser::parseToken(UChar32 c, UErrorCode& errorCode) { - CHECK_BOUNDS(source, index, parseError, errorCode); + CHECK_BOUNDS(errorCode); - if (source[index] == c) { - index++; + if (peek() == c) { + next(); normalizedInput += c; return; } // Next character didn't match -- error out - ERROR(parseError, errorCode, index); + ERROR(errorCode); } /* Consumes a fixed-length token, signaling an error if the token isn't a prefix of - the string beginning at `source[index]` + the string beginning at `peek()` No postcondition -- a message can end with a '}' token */ void Parser::parseToken(const std::u16string_view& token, UErrorCode& errorCode) { - U_ASSERT(inBounds(source, index)); + U_ASSERT(inBounds()); int32_t tokenPos = 0; while (tokenPos < static_cast(token.length())) { - if (source[index] != token[tokenPos]) { - ERROR(parseError, errorCode, index); + if (peek() != token[tokenPos]) { + ERROR(errorCode); return; } normalizedInput += token[tokenPos]; - index++; + next(); tokenPos++; } } @@ -481,11 +492,11 @@ void Parser::parseTokenWithWhitespace(const std::u16string_view& token, UErrorCo // No need for error check or bounds check before parseOptionalWhitespace parseOptionalWhitespace(errorCode); // Establish precondition - CHECK_BOUNDS(source, index, parseError, errorCode); + CHECK_BOUNDS(errorCode); parseToken(token, errorCode); parseOptionalWhitespace(errorCode); // Guarantee postcondition - CHECK_BOUNDS(source, index, parseError, errorCode); + CHECK_BOUNDS(errorCode); } /* @@ -498,11 +509,11 @@ void Parser::parseTokenWithWhitespace(UChar32 c, UErrorCode& errorCode) { // No need for error check or bounds check before parseOptionalWhitespace(errorCode) parseOptionalWhitespace(errorCode); // Establish precondition - CHECK_BOUNDS(source, index, parseError, errorCode); + CHECK_BOUNDS(errorCode); parseToken(c, errorCode); parseOptionalWhitespace(errorCode); // Guarantee postcondition - CHECK_BOUNDS(source, index, parseError, errorCode); + CHECK_BOUNDS(errorCode); } /* @@ -517,19 +528,20 @@ void Parser::parseTokenWithWhitespace(UChar32 c, UErrorCode& errorCode) { UnicodeString Parser::parseName(UErrorCode& errorCode) { UnicodeString name; - U_ASSERT(inBounds(source, index)); + U_ASSERT(inBounds()); - if (!isNameStart(source[index])) { - ERROR(parseError, errorCode, index); + if (!isNameStart(peek())) { + ERROR(errorCode); return name; } - while (isNameChar(source[index])) { - name += source[index]; - normalizedInput += source[index]; - index++; - if (!inBounds(source, index)) { - ERROR(parseError, errorCode, index); + while (isNameChar(peek())) { + UChar32 c = peek(); + name += c; + normalizedInput += c; + next(); + if (!inBounds()) { + ERROR(errorCode); break; } } @@ -545,13 +557,13 @@ UnicodeString Parser::parseName(UErrorCode& errorCode) { VariableName Parser::parseVariableName(UErrorCode& errorCode) { VariableName result; - U_ASSERT(inBounds(source, index)); + U_ASSERT(inBounds()); // If the '$' is missing, we don't want a binding // for this variable to be created. - bool valid = source[index] == DOLLAR; + bool valid = peek() == DOLLAR; parseToken(DOLLAR, errorCode); - if (!inBounds(source, index)) { - ERROR(parseError, errorCode, index); + if (!inBounds()) { + ERROR(errorCode); return result; } UnicodeString varName = parseName(errorCode); @@ -567,7 +579,7 @@ VariableName Parser::parseVariableName(UErrorCode& errorCode) { Corresponds to the `identifier` nonterminal in the grammar */ UnicodeString Parser::parseIdentifier(UErrorCode& errorCode) { - U_ASSERT(inBounds(source, index)); + U_ASSERT(inBounds()); UnicodeString result; // The following is a hack to get around ambiguity in the grammar: @@ -582,7 +594,7 @@ UnicodeString Parser::parseIdentifier(UErrorCode& errorCode) { // Parse namespace result += parseName(errorCode); int32_t firstColon = -1; - while (inBounds(source, index) && source[index] == COLON) { + while (inBounds() && peek() == COLON) { // Parse ':' separator if (firstColon == -1) { firstColon = index; @@ -590,8 +602,8 @@ UnicodeString Parser::parseIdentifier(UErrorCode& errorCode) { parseToken(COLON, errorCode); result += COLON; // Check for message ending with something like "foo:" - if (!inBounds(source, index)) { - ERROR(parseError, errorCode, index); + if (!inBounds()) { + ERROR(errorCode); } else { // Parse name part result += parseName(errorCode); @@ -603,7 +615,7 @@ UnicodeString Parser::parseIdentifier(UErrorCode& errorCode) { if (firstColon != -1) { for (int32_t i = firstColon + 1; i < result.length(); i++) { if (result[i] == COLON) { - ERROR(parseError, errorCode, i); + ERROR_AT(errorCode, i); return {}; } } @@ -619,16 +631,16 @@ UnicodeString Parser::parseIdentifier(UErrorCode& errorCode) { Returns the function name. */ FunctionName Parser::parseFunction(UErrorCode& errorCode) { - U_ASSERT(inBounds(source, index)); - if (!isFunctionStart(source[index])) { - ERROR(parseError, errorCode, index); + U_ASSERT(inBounds()); + if (!isFunctionStart(peek())) { + ERROR(errorCode); return FunctionName(); } - normalizedInput += source[index]; - index++; // Consume the function start character - if (!inBounds(source, index)) { - ERROR(parseError, errorCode, index); + normalizedInput += peek(); + next(); // Consume the function start character + if (!inBounds()) { + ERROR(errorCode); return FunctionName(); } return parseIdentifier(errorCode); @@ -636,7 +648,7 @@ FunctionName Parser::parseFunction(UErrorCode& errorCode) { /* - Precondition: source[index] == BACKSLASH + Precondition: peek() == BACKSLASH Consume an escaped character. Corresponds to `escaped-char` in the grammar. @@ -644,24 +656,24 @@ FunctionName Parser::parseFunction(UErrorCode& errorCode) { No postcondition (a message can end with an escaped char) */ UnicodeString Parser::parseEscapeSequence(UErrorCode& errorCode) { - U_ASSERT(inBounds(source, index)); - U_ASSERT(source[index] == BACKSLASH); + U_ASSERT(inBounds()); + U_ASSERT(peek() == BACKSLASH); normalizedInput += BACKSLASH; - index++; // Skip the initial backslash + next(); // Skip the initial backslash UnicodeString str; - if (inBounds(source, index)) { + if (inBounds()) { // Expect a '{', '|' or '}' - switch (source[index]) { + switch (peek()) { case LEFT_CURLY_BRACE: case RIGHT_CURLY_BRACE: case PIPE: case BACKSLASH: { /* Append to the output string */ - str += source[index]; + str += peek(); /* Update normalizedInput */ - normalizedInput += source[index]; + normalizedInput += peek(); /* Consume the character */ - index++; + next(); return str; } default: { @@ -671,7 +683,7 @@ UnicodeString Parser::parseEscapeSequence(UErrorCode& errorCode) { } } // If control reaches here, there was an error - ERROR(parseError, errorCode, index); + ERROR(errorCode); return str; } @@ -686,26 +698,34 @@ Literal Parser::parseQuotedLiteral(UErrorCode& errorCode) { if (U_SUCCESS(errorCode)) { // Parse the opening '|' parseToken(PIPE, errorCode); - if (!inBounds(source, index)) { - ERROR(parseError, errorCode, index); + if (!inBounds()) { + ERROR(errorCode); error = true; } else { // Parse the contents bool done = false; while (!done) { - if (source[index] == BACKSLASH) { + if (peek() == BACKSLASH) { contents += parseEscapeSequence(errorCode); - } else if (isQuotedChar(source[index])) { - contents += source[index]; - normalizedInput += source[index]; - index++; // Consume this character + } else if (isQuotedChar(peek())) { + contents += peek(); + // Handle cases like: + // |}{| -- we want to escape everywhere that + // can be escaped, to make round-trip checking + // easier -- so this case normalizes to + // |\}\{| + if (isEscapableChar(peek())) { + normalizedInput += BACKSLASH; + } + normalizedInput += peek(); + next(); // Consume this character maybeAdvanceLine(); } else { // Assume the sequence of literal characters ends here done = true; } - if (!inBounds(source, index)) { - ERROR(parseError, errorCode, index); + if (!inBounds()) { + ERROR(errorCode); error = true; break; } @@ -729,18 +749,18 @@ UnicodeString Parser::parseDigits(UErrorCode& errorCode) { return {}; } - U_ASSERT(isDigit(source[index])); + U_ASSERT(isDigit(peek())); UnicodeString contents; do { - contents += source[index]; - normalizedInput += source[index]; - index++; - if (!inBounds(source, index)) { - ERROR(parseError, errorCode, index); + contents += peek(); + normalizedInput += peek(); + next(); + if (!inBounds()) { + ERROR(errorCode); return {}; } - } while (isDigit(source[index])); + } while (isDigit(peek())); return contents; } @@ -753,7 +773,7 @@ Literal Parser::parseUnquotedLiteral(UErrorCode& errorCode) { } // unquoted -> name - if (isNameStart(source[index])) { + if (isNameStart(peek())) { return Literal(false, parseName(errorCode)); } @@ -762,75 +782,75 @@ Literal Parser::parseUnquotedLiteral(UErrorCode& errorCode) { UnicodeString contents; // Parse the sign - if (source[index] == HYPHEN) { - contents += source[index]; - normalizedInput += source[index]; - index++; + if (peek() == HYPHEN) { + contents += peek(); + normalizedInput += peek(); + next(); } - if (!inBounds(source, index)) { - ERROR(parseError, errorCode, index); + if (!inBounds()) { + ERROR(errorCode); return {}; } // Parse the integer part - if (source[index] == static_cast(0x0030) /* 0 */) { - contents += source[index]; - normalizedInput += source[index]; - index++; - } else if (isDigit(source[index])) { + if (peek() == ((UChar32)0x0030) /* 0 */) { + contents += peek(); + normalizedInput += peek(); + next(); + } else if (isDigit(peek())) { contents += parseDigits(errorCode); } else { // Error -- nothing else can start a number literal - ERROR(parseError, errorCode, index); + ERROR(errorCode); return {}; } // Parse the decimal point if present - if (source[index] == PERIOD) { - contents += source[index]; - normalizedInput += source[index]; - index++; - if (!inBounds(source, index)) { - ERROR(parseError, errorCode, index); + if (peek() == PERIOD) { + contents += peek(); + normalizedInput += peek(); + next(); + if (!inBounds()) { + ERROR(errorCode); return {}; } // Parse the fraction part - if (isDigit(source[index])) { + if (isDigit(peek())) { contents += parseDigits(errorCode); } else { // '.' not followed by digit is a parse error - ERROR(parseError, errorCode, index); + ERROR(errorCode); return {}; } } - if (!inBounds(source, index)) { - ERROR(parseError, errorCode, index); + if (!inBounds()) { + ERROR(errorCode); return {}; } // Parse the exponent part if present - if (source[index] == UPPERCASE_E || source[index] == LOWERCASE_E) { - contents += source[index]; - normalizedInput += source[index]; - index++; - if (!inBounds(source, index)) { - ERROR(parseError, errorCode, index); + if (peek() == UPPERCASE_E || peek() == LOWERCASE_E) { + contents += peek(); + normalizedInput += peek(); + next(); + if (!inBounds()) { + ERROR(errorCode); return {}; } // Parse sign if present - if (source[index] == PLUS || source[index] == HYPHEN) { - contents += source[index]; - normalizedInput += source[index]; - index++; - if (!inBounds(source, index)) { - ERROR(parseError, errorCode, index); + if (peek() == PLUS || peek() == HYPHEN) { + contents += peek(); + normalizedInput += peek(); + next(); + if (!inBounds()) { + ERROR(errorCode); return {}; } } // Parse exponent digits - if (!isDigit(source[index])) { - ERROR(parseError, errorCode, index); + if (!isDigit(peek())) { + ERROR(errorCode); return {}; } contents += parseDigits(errorCode); @@ -844,17 +864,17 @@ Literal Parser::parseUnquotedLiteral(UErrorCode& errorCode) { */ Literal Parser::parseLiteral(UErrorCode& errorCode) { Literal result; - if (!inBounds(source, index)) { - ERROR(parseError, errorCode, index); + if (!inBounds()) { + ERROR(errorCode); } else { - if (source[index] == PIPE) { + if (peek() == PIPE) { result = parseQuotedLiteral(errorCode); } else { result = parseUnquotedLiteral(errorCode); } // Guarantee postcondition - if (!inBounds(source, index)) { - ERROR(parseError, errorCode, index); + if (!inBounds()) { + ERROR(errorCode); } } @@ -868,9 +888,9 @@ Literal Parser::parseLiteral(UErrorCode& errorCode) { */ template void Parser::parseAttribute(AttributeAdder& attrAdder, UErrorCode& errorCode) { - U_ASSERT(inBounds(source, index)); + U_ASSERT(inBounds()); - U_ASSERT(source[index] == AT); + U_ASSERT(peek() == AT); // Consume the '@' parseToken(AT, errorCode); @@ -884,13 +904,13 @@ void Parser::parseAttribute(AttributeAdder& attrAdder, UErrorCode& errorCode) parseOptionalWhitespace(errorCode); Operand rand; - if (source[index] == EQUALS) { + if (peek() == EQUALS) { // Parse '=' parseTokenWithWhitespace(EQUALS, errorCode); UnicodeString rhsStr; // Parse RHS, which is either a literal or variable - switch (source[index]) { + switch (peek()) { case DOLLAR: { rand = Operand(parseVariableName(errorCode)); break; @@ -919,7 +939,7 @@ void Parser::parseAttribute(AttributeAdder& attrAdder, UErrorCode& errorCode) */ template void Parser::parseOption(OptionAdder& addOption, UErrorCode& errorCode) { - U_ASSERT(inBounds(source, index)); + U_ASSERT(inBounds()); // Parse LHS UnicodeString lhs = parseIdentifier(errorCode); @@ -930,7 +950,7 @@ void Parser::parseOption(OptionAdder& addOption, UErrorCode& errorCode) { UnicodeString rhsStr; Operand rand; // Parse RHS, which is either a literal or variable - switch (source[index]) { + switch (peek()) { case DOLLAR: { rand = Operand(parseVariableName(errorCode)); break; @@ -967,7 +987,7 @@ void Parser::parseOption(OptionAdder& addOption, UErrorCode& errorCode) { template void Parser::parseOptions(OptionAdder& addOption, UErrorCode& errorCode) { // Early exit if out of bounds -- no more work is possible - CHECK_BOUNDS(source, index, parseError, errorCode); + CHECK_BOUNDS(errorCode); /* Arbitrary lookahead is required to parse option lists. To see why, consider @@ -1015,7 +1035,7 @@ an option or an attribute. // If the next character is not whitespace, that means we've already // parsed the entire options list (which may have been empty) and there's // no trailing whitespace. In that case, exit. - if (!isWhitespace(source[index])) { + if (!isWhitespace(peek())) { break; } int32_t firstWhitespace = index; @@ -1024,7 +1044,7 @@ an option or an attribute. // one whitespace character. parseRequiredWhitespace(errorCode); // Restore precondition - CHECK_BOUNDS(source, index, parseError, errorCode); + CHECK_BOUNDS(errorCode); // If a name character follows, then at least one more option remains // in the list. @@ -1032,7 +1052,7 @@ an option or an attribute. // and can exit. // Note that exiting is sort of like backtracking: "(s option)" doesn't apply, // so we back out to [s]. - if (!isNameStart(source[index])) { + if (!isNameStart(peek())) { // We've consumed all the options (meaning that either we consumed non-empty // whitespace, or consumed at least one option.) // Done. @@ -1055,8 +1075,8 @@ template void Parser::parseAttributes(AttributeAdder& attrAdder, UErrorCode& errorCode) { // Early exit if out of bounds -- no more work is possible - if (!inBounds(source, index)) { - ERROR(parseError, errorCode, index); + if (!inBounds()) { + ERROR(errorCode); return; } @@ -1069,7 +1089,7 @@ Arbitrary lookahead is required to parse attribute lists, similarly to option li // If the next character is not whitespace, that means we've already // parsed the entire attributes list (which may have been empty) and there's // no trailing whitespace. In that case, exit. - if (!isWhitespace(source[index])) { + if (!isWhitespace(peek())) { break; } @@ -1077,8 +1097,8 @@ Arbitrary lookahead is required to parse attribute lists, similarly to option li // one whitespace character. parseRequiredWhitespace(errorCode); // Restore precondition - if (!inBounds(source, index)) { - ERROR(parseError, errorCode, index); + if (!inBounds()) { + ERROR(errorCode); break; } @@ -1088,7 +1108,7 @@ Arbitrary lookahead is required to parse attribute lists, similarly to option li // and can exit. // Note that exiting is sort of like backtracking: "(s attributes)" doesn't apply, // so we back out to [s]. - if (source[index] != AT) { + if (peek() != AT) { // We've consumed all the attributes (meaning that either we consumed non-empty // whitespace, or consumed at least one attribute.) // Done. @@ -1111,16 +1131,16 @@ void Parser::parseReservedChunk(Reserved::Builder& result, UErrorCode& status) { bool empty = true; UnicodeString chunk; - while(reservedChunkFollows(source[index])) { + while(reservedChunkFollows(peek())) { empty = false; // reserved-char - if (isReservedChar(source[index])) { - chunk += source[index]; - normalizedInput += source[index]; + if (isReservedChar(peek())) { + chunk += peek(); + normalizedInput += peek(); // consume the char - index++; + next(); // Restore precondition - CHECK_BOUNDS(source, index, parseError, status); + CHECK_BOUNDS(status); continue; } @@ -1129,11 +1149,11 @@ void Parser::parseReservedChunk(Reserved::Builder& result, UErrorCode& status) { chunk.setTo(u"", 0); } - if (source[index] == BACKSLASH) { + if (peek() == BACKSLASH) { // reserved-escape result.add(Literal(false, parseEscapeSequence(status)), status); chunk.setTo(u"", 0); - } else if (source[index] == PIPE || isUnquotedStart(source[index])) { + } else if (peek() == PIPE || isUnquotedStart(peek())) { result.add(parseLiteral(status), status); } else { // The reserved chunk ends here @@ -1149,7 +1169,7 @@ void Parser::parseReservedChunk(Reserved::Builder& result, UErrorCode& status) { } if (empty) { - ERROR(parseError, status, index); + ERROR(status); } } @@ -1166,23 +1186,23 @@ Reserved Parser::parseReserved(UErrorCode& status) { return {}; } - U_ASSERT(inBounds(source, index)); + U_ASSERT(inBounds()); // Require a `reservedStart` character - if (!isReservedStart(source[index])) { - ERROR(parseError, status, index); + if (!isReservedStart(peek())) { + ERROR(status); return Reserved(); } // Add the start char as a separate text chunk - UnicodeString firstCharString(source[index]); + UnicodeString firstCharString(peek()); builder.add(Literal(false, firstCharString), status); if (U_FAILURE(status)) { return {}; } // Consume reservedStart - normalizedInput += source[index]; - index++; + normalizedInput += peek(); + next(); return parseReservedBody(builder, status); } @@ -1226,42 +1246,42 @@ Reserved Parser::parseReservedBody(Reserved::Builder& builder, UErrorCode& statu (`parseExpression()` is responsible for checking that the next character is in fact a '}'.) */ - if (!inBounds(source, index)) { + if (!inBounds()) { break; } int32_t numWhitespaceChars = 0; int32_t savedIndex = index; - if (isWhitespace(source[index])) { + if (isWhitespace(peek())) { parseOptionalWhitespace(status); numWhitespaceChars = index - savedIndex; // Restore precondition - if (!inBounds(source, index)) { + if (!inBounds()) { break; } } - if (reservedChunkFollows(source[index])) { + if (reservedChunkFollows(peek())) { parseReservedChunk(builder, status); // Avoid looping infinitely - if (U_FAILURE(status) || !inBounds(source, index)) { + if (U_FAILURE(status) || !inBounds()) { break; } } else { if (numWhitespaceChars > 0) { - if (source[index] == LEFT_CURLY_BRACE) { + if (peek() == LEFT_CURLY_BRACE) { // Resolve even more ambiguity (space preceding another piece of // a `reserved-body`, vs. space preceding an expression in `reserved-statement` // "Backtrack" index -= numWhitespaceChars; break; } - if (source[index] == RIGHT_CURLY_BRACE) { + if (peek() == RIGHT_CURLY_BRACE) { // Not an error: just means there's no trailing whitespace // after this `reserved` break; } - if (source[index] == AT) { + if (peek() == AT) { // Not an error, but we have to "backtrack" due to the ambiguity // between an `s` preceding another reserved chunk // and an `s` preceding an attribute list @@ -1270,7 +1290,7 @@ Reserved Parser::parseReservedBody(Reserved::Builder& builder, UErrorCode& statu } // Error: if there's whitespace, it must either be followed // by a non-empty sequence or by '}' - ERROR(parseError, status, index); + ERROR(status); break; } // If there was no whitespace, it's not an error, @@ -1289,12 +1309,12 @@ Reserved Parser::parseReservedBody(Reserved::Builder& builder, UErrorCode& statu Returns an `Operator` representing this (a reserved is a parse error) */ Operator Parser::parseAnnotation(UErrorCode& status) { - U_ASSERT(inBounds(source, index)); + U_ASSERT(inBounds()); Operator::Builder ratorBuilder(status); if (U_FAILURE(status)) { return {}; } - if (isFunctionStart(source[index])) { + if (isFunctionStart(peek())) { // Consume the function name FunctionName func = parseFunction(status); ratorBuilder.setFunctionName(std::move(func)); @@ -1326,7 +1346,7 @@ void Parser::parseLiteralOrVariableWithAnnotation(bool isVariable, UErrorCode& status) { CHECK_ERROR(status); - U_ASSERT(inBounds(source, index)); + U_ASSERT(inBounds()); Operand rand; if (isVariable) { @@ -1360,7 +1380,7 @@ refactoring for the `expression` nonterminal that `parseOptions()` relies on. Se the comment in `parseOptions()` for details. */ - if (isWhitespace(source[index])) { + if (isWhitespace(peek())) { int32_t firstWhitespace = index; // If the next character is whitespace, either [s annotation] or [s] applies @@ -1369,10 +1389,10 @@ the comment in `parseOptions()` for details. // one does apply. parseOptionalWhitespace(status); // Restore precondition - CHECK_BOUNDS(source, index, parseError, status); + CHECK_BOUNDS(status); // This next check resolves the ambiguity between [s annotation] and [s] - bool isSAnnotation = isAnnotationStart(source[index]); + bool isSAnnotation = isAnnotationStart(peek()); if (isSAnnotation) { normalizedInput += SPACE; @@ -1433,7 +1453,7 @@ Expression Parser::parseExpression(UErrorCode& status) { } // Early return if out of input -- no more work is possible - U_ASSERT(inBounds(source, index)); + U_ASSERT(inBounds()); // Parse opening brace parseToken(LEFT_CURLY_BRACE, status); @@ -1442,11 +1462,11 @@ Expression Parser::parseExpression(UErrorCode& status) { Expression::Builder exprBuilder(status); // Restore precondition - if (!inBounds(source, index)) { + if (!inBounds()) { exprFallback(exprBuilder); } else { // literal '|', variable '$' or annotation - switch (source[index]) { + switch (peek()) { case PIPE: { // Quoted literal parseLiteralOrVariableWithAnnotation(false, exprBuilder, status); @@ -1458,15 +1478,15 @@ Expression Parser::parseExpression(UErrorCode& status) { break; } default: { - if (isAnnotationStart(source[index])) { + if (isAnnotationStart(peek())) { Operator rator = parseAnnotation(status); exprBuilder.setOperator(std::move(rator)); - } else if (isUnquotedStart(source[index])) { + } else if (isUnquotedStart(peek())) { // Unquoted literal parseLiteralOrVariableWithAnnotation(false, exprBuilder, status); } else { // Not a literal, variable or annotation -- error out - ERROR(parseError, status, index); + ERROR(status); exprFallback(exprBuilder); break; } @@ -1490,8 +1510,8 @@ Expression Parser::parseExpression(UErrorCode& status) { U_ASSERT(U_SUCCESS(localStatus)); // Check for end-of-input and missing '}' - if (!inBounds(source, index)) { - ERROR(parseError, status, index); + if (!inBounds()) { + ERROR(status); } else { // Otherwise, it's safe to check for the '}' parseToken(RIGHT_CURLY_BRACE, status); @@ -1506,17 +1526,17 @@ Expression Parser::parseExpression(UErrorCode& status) { void Parser::parseLocalDeclaration(UErrorCode& status) { // End-of-input here would be an error; even empty // declarations must be followed by a body - CHECK_BOUNDS(source, index, parseError, status); + CHECK_BOUNDS(status); parseToken(ID_LOCAL, status); parseRequiredWhitespace(status); // Restore precondition - CHECK_BOUNDS(source, index, parseError, status); + CHECK_BOUNDS(status); VariableName lhs = parseVariableName(status); parseTokenWithWhitespace(EQUALS, status); // Restore precondition before calling parseExpression() - CHECK_BOUNDS(source, index, parseError, status); + CHECK_BOUNDS(status); Expression rhs = parseExpression(status); @@ -1554,13 +1574,13 @@ void Parser::parseLocalDeclaration(UErrorCode& status) { void Parser::parseInputDeclaration(UErrorCode& status) { // End-of-input here would be an error; even empty // declarations must be followed by a body - CHECK_BOUNDS(source, index, parseError, status); + CHECK_BOUNDS(status); parseToken(ID_INPUT, status); parseOptionalWhitespace(status); // Restore precondition before calling parseExpression() - CHECK_BOUNDS(source, index, parseError, status); + CHECK_BOUNDS(status); // Save the index for error diagnostics int32_t exprIndex = index; @@ -1570,7 +1590,7 @@ void Parser::parseInputDeclaration(UErrorCode& status) { if (!rhs.getOperand().isVariable()) { // This case is a syntax error; report it at the beginning // of the expression - ERROR(parseError, status, exprIndex); + ERROR_AT(status, exprIndex); return; } @@ -1597,7 +1617,7 @@ void Parser::parseInputDeclaration(UErrorCode& status) { Parses a `reserved-statement` per the grammar */ void Parser::parseUnsupportedStatement(UErrorCode& status) { - U_ASSERT(inBounds(source, index) && source[index] == PERIOD); + U_ASSERT(inBounds() && peek() == PERIOD); UnsupportedStatement::Builder builder(status); CHECK_ERROR(status); @@ -1605,7 +1625,7 @@ void Parser::parseUnsupportedStatement(UErrorCode& status) { // Parse the keyword UnicodeString keyword(PERIOD); normalizedInput += UnicodeString(PERIOD); - index++; + next(); keyword += parseName(status); builder.setKeyword(keyword); @@ -1617,18 +1637,18 @@ void Parser::parseUnsupportedStatement(UErrorCode& status) { // a '{') // * a '{' - CHECK_BOUNDS(source, index, parseError, status); + CHECK_BOUNDS(status); - if (source[index] != LEFT_CURLY_BRACE) { - if (!isWhitespace(source[index])) { - ERROR(parseError, status, index); + if (peek() != LEFT_CURLY_BRACE) { + if (!isWhitespace(peek())) { + ERROR(status); return; } // Expect a reserved-body start int32_t savedIndex = index; parseRequiredWhitespace(status); - CHECK_BOUNDS(source, index, parseError, status); - if (isReservedBodyStart(source[index])) { + CHECK_BOUNDS(status); + if (isReservedBodyStart(peek())) { // There is a reserved body Reserved::Builder r(status); builder.setBody(parseReservedBody(r, status)); @@ -1639,8 +1659,8 @@ void Parser::parseUnsupportedStatement(UErrorCode& status) { } // Otherwise, the next character must be a '{' // to open the required expression (or optional whitespace) - if (source[index] != LEFT_CURLY_BRACE && !isWhitespace(source[index])) { - ERROR(parseError, status, index); + if (peek() != LEFT_CURLY_BRACE && !isWhitespace(peek())) { + ERROR(status); return; } } @@ -1650,12 +1670,12 @@ void Parser::parseUnsupportedStatement(UErrorCode& status) { // Need to look ahead to disambiguate a '{' beginning // an expression from one beginning with a quoted pattern int32_t expressionCount = 0; - while (source[index] == LEFT_CURLY_BRACE || isWhitespace(source[index])) { + while (peek() == LEFT_CURLY_BRACE || isWhitespace(peek())) { parseOptionalWhitespace(status); - bool nextIsLbrace = source[index] == LEFT_CURLY_BRACE; - bool nextIsQuotedPattern = nextIsLbrace && inBounds(source, index + 1) - && source[index + 1] == LEFT_CURLY_BRACE; + bool nextIsLbrace = peek() == LEFT_CURLY_BRACE; + bool nextIsQuotedPattern = nextIsLbrace && inBounds(1) + && peek(1) == LEFT_CURLY_BRACE; if (nextIsQuotedPattern) { break; } @@ -1665,7 +1685,7 @@ void Parser::parseUnsupportedStatement(UErrorCode& status) { } if (expressionCount <= 0) { // At least one expression is required - ERROR(parseError, status, index); + ERROR(status); return; } dataModel.addUnsupportedStatement(builder.build(status), status); @@ -1675,7 +1695,7 @@ void Parser::parseUnsupportedStatement(UErrorCode& status) { // and supported keywords bool Parser::nextIs(const std::u16string_view &keyword) const { for (int32_t i = 0; i < static_cast(keyword.length()); i++) { - if (!inBounds(source, index + i) || source[index + i] != keyword[i]) { + if (!inBounds(i) || peek(i) != keyword[i]) { return false; } } @@ -1691,17 +1711,17 @@ bool Parser::nextIs(const std::u16string_view &keyword) const { void Parser::parseDeclarations(UErrorCode& status) { // End-of-input here would be an error; even empty // declarations must be followed by a body - CHECK_BOUNDS(source, index, parseError, status); + CHECK_BOUNDS(status); - while (source[index] == PERIOD) { - CHECK_BOUNDS(source, index + 1, parseError, status); + while (peek() == PERIOD) { + CHECK_BOUNDS_1(status); // Check for unsupported statements first // Lookahead is needed to disambiguate keyword from known keywords if (!nextIs(ID_MATCH) && !nextIs(ID_LOCAL) && !nextIs(ID_INPUT)) { parseUnsupportedStatement(status); - } else if (source[index + 1] == ID_LOCAL[1]) { + } else if (peek(1) == ID_LOCAL[1]) { parseLocalDeclaration(status); - } else if (source[index + 1] == ID_INPUT[1]) { + } else if (peek(1) == ID_INPUT[1]) { parseInputDeclaration(status); } else { // Done parsing declarations @@ -1713,7 +1733,7 @@ void Parser::parseDeclarations(UErrorCode& status) { parseOptionalWhitespace(status); // Restore precondition - CHECK_BOUNDS(source, index, parseError, status); + CHECK_BOUNDS(status); } } @@ -1725,13 +1745,17 @@ void Parser::parseDeclarations(UErrorCode& status) { */ UnicodeString Parser::parseTextChar(UErrorCode& status) { UnicodeString str; - if (!inBounds(source, index) || !(isTextChar(source[index]))) { + if (!inBounds() || !(isTextChar(peek()))) { // Error -- text-char is expected here - ERROR(parseError, status, index); + ERROR(status); } else { - normalizedInput += source[index]; - str += source[index]; - index++; + // See comment in parseQuotedLiteral() + if (isEscapableChar(peek())) { + normalizedInput += BACKSLASH; + } + normalizedInput += peek(); + str += peek(); + next(); maybeAdvanceLine(); } return str; @@ -1742,17 +1766,17 @@ UnicodeString Parser::parseTextChar(UErrorCode& status) { the `key` nonterminal in the grammar */ Key Parser::parseKey(UErrorCode& status) { - U_ASSERT(inBounds(source, index)); + U_ASSERT(inBounds()); Key k; // wildcard by default // Literal | '*' - switch (source[index]) { + switch (peek()) { case ASTERISK: { - index++; + next(); normalizedInput += ASTERISK; // Guarantee postcondition - if (!inBounds(source, index)) { - ERROR(parseError, status, index); + if (!inBounds()) { + ERROR(status); return k; } break; @@ -1778,7 +1802,7 @@ SelectorKeys Parser::parseNonEmptyKeys(UErrorCode& status) { return result; } - U_ASSERT(inBounds(source, index)); + U_ASSERT(inBounds()); /* Arbitrary lookahead is required to parse key lists. To see why, consider @@ -1810,31 +1834,31 @@ This is addressed using "backtracking" (similarly to `parseOptions()`). keysBuilder.add(parseKey(status), status); // Restore precondition - if (!inBounds(source, index)) { - ERROR(parseError, status, index); + if (!inBounds()) { + ERROR(status); return result; } // We've seen at least one whitespace-key pair, so now we can parse // *(s key) [s] - while (source[index] != LEFT_CURLY_BRACE || isWhitespace(source[index])) { // Try to recover from errors - bool wasWhitespace = isWhitespace(source[index]); + while (peek() != LEFT_CURLY_BRACE || isWhitespace(peek())) { // Try to recover from errors + bool wasWhitespace = isWhitespace(peek()); parseRequiredWhitespace(status); if (!wasWhitespace) { // Avoid infinite loop when parsing something like: // when * @{!... - index++; + next(); } // Restore precondition - if (!inBounds(source, index)) { - ERROR(parseError, status, index); + if (!inBounds()) { + ERROR(status); return result; } // At this point, it's ambiguous whether we are inside (s key) or [s]. // This check resolves that ambiguity. - if (source[index] == LEFT_CURLY_BRACE) { + if (peek() == LEFT_CURLY_BRACE) { // A pattern follows, so what we just parsed was the optional // trailing whitespace. All the keys have been parsed. @@ -1849,7 +1873,7 @@ This is addressed using "backtracking" (similarly to `parseOptions()`). } Pattern Parser::parseQuotedPattern(UErrorCode& status) { - U_ASSERT(inBounds(source, index)); + U_ASSERT(inBounds()); parseToken(LEFT_CURLY_BRACE, status); parseToken(LEFT_CURLY_BRACE, status); @@ -1864,9 +1888,9 @@ Pattern Parser::parseQuotedPattern(UErrorCode& status) { No postcondition (a markup can end a message) */ Markup Parser::parseMarkup(UErrorCode& status) { - U_ASSERT(inBounds(source, index + 1)); + U_ASSERT(inBounds(1)); - U_ASSERT(source[index] == LEFT_CURLY_BRACE); + U_ASSERT(peek() == LEFT_CURLY_BRACE); Markup::Builder builder(status); if (U_FAILURE(status)) { @@ -1874,26 +1898,26 @@ Markup Parser::parseMarkup(UErrorCode& status) { } // Consume the '{' - index++; + next(); normalizedInput += LEFT_CURLY_BRACE; parseOptionalWhitespace(status); bool closing = false; - switch (source[index]) { + switch (peek()) { case NUMBER_SIGN: { // Open or standalone; consume the '#' - normalizedInput += source[index]; - index++; + normalizedInput += peek(); + next(); break; } case SLASH: { // Closing - normalizedInput += source[index]; + normalizedInput += peek(); closing = true; - index++; + next(); break; } default: { - ERROR(parseError, status, index); + ERROR(status); return {}; } } @@ -1903,14 +1927,14 @@ Markup Parser::parseMarkup(UErrorCode& status) { // Parse the options, which must begin with a ' ' // if present - if (inBounds(source, index) && isWhitespace(source[index])) { + if (inBounds() && isWhitespace(peek())) { OptionAdder optionAdder(builder); parseOptions(optionAdder, status); } // Parse the attributes, which also must begin // with a ' ' - if (inBounds(source, index) && isWhitespace(source[index])) { + if (inBounds() && isWhitespace(peek())) { AttributeAdder attrAdder(builder); parseAttributes(attrAdder, status); } @@ -1920,10 +1944,10 @@ Markup Parser::parseMarkup(UErrorCode& status) { bool standalone = false; // Check if this is a standalone or not if (!closing) { - if (inBounds(source, index) && source[index] == SLASH) { + if (inBounds() && peek() == SLASH) { standalone = true; normalizedInput += SLASH; - index++; + next(); } } @@ -1945,23 +1969,25 @@ Markup Parser::parseMarkup(UErrorCode& status) { No postcondition (a placeholder can end a message) */ std::variant Parser::parsePlaceholder(UErrorCode& status) { - U_ASSERT(source[index] == LEFT_CURLY_BRACE); + U_ASSERT(peek() == LEFT_CURLY_BRACE); - if (!inBounds(source, index)) { - ERROR(parseError, status, index); + if (!inBounds()) { + ERROR(status); return exprFallback(status); } - // Need to look ahead arbitrarily since can appear before the '{' and '#' + // Need to look ahead arbitrarily since whitespace + // can appear before the '{' and '#' // in markup - int32_t tempIndex = index + 1; + int32_t tempIndex = 1; bool isMarkup = false; - while (inBounds(source, tempIndex)) { - if (source[tempIndex] == NUMBER_SIGN || source[tempIndex] == SLASH) { + while (inBounds(1)) { + UChar32 c = peek(tempIndex); + if (c == NUMBER_SIGN || c == SLASH) { isMarkup = true; break; } - if (!isWhitespace(source[tempIndex])){ + if (!isWhitespace(c)){ break; } tempIndex++; @@ -1975,7 +2001,7 @@ std::variant Parser::parsePlaceholder(UErrorCode& status) { /* Consume a `simple-message`, matching the nonterminal in the grammar - Postcondition: `index == source.length()` or U_FAILURE(status); + Postcondition: `index == len()` or U_FAILURE(status); for a syntactically correct message, this will consume the entire input */ Pattern Parser::parseSimpleMessage(UErrorCode& status) { @@ -1983,8 +2009,8 @@ Pattern Parser::parseSimpleMessage(UErrorCode& status) { if (U_SUCCESS(status)) { Expression expression; - while (inBounds(source, index)) { - switch (source[index]) { + while (inBounds()) { + switch (peek()) { case LEFT_CURLY_BRACE: { // Must be placeholder std::variant piece = parsePlaceholder(status); @@ -2012,7 +2038,7 @@ Pattern Parser::parseSimpleMessage(UErrorCode& status) { break; } } - if (source[index] == RIGHT_CURLY_BRACE) { + if (peek() == RIGHT_CURLY_BRACE) { // End of quoted pattern break; } @@ -2030,13 +2056,13 @@ Pattern Parser::parseSimpleMessage(UErrorCode& status) { Consume a `selectors` (matching the nonterminal in the grammar), followed by a non-empty sequence of `variant`s (matching the nonterminal in the grammar) preceded by whitespace - No postcondition (on return, `index` might equal `source.length()` with no syntax error + No postcondition (on return, `index` might equal `len()` with no syntax error because a message can end with a variant) */ void Parser::parseSelectors(UErrorCode& status) { CHECK_ERROR(status); - U_ASSERT(inBounds(source, index)); + U_ASSERT(inBounds()); parseToken(ID_MATCH, status); @@ -2044,11 +2070,11 @@ void Parser::parseSelectors(UErrorCode& status) { // Parse selectors // "Backtracking" is required here. It's not clear if whitespace is // (`[s]` selector) or (`[s]` variant) - while (isWhitespace(source[index]) || source[index] == LEFT_CURLY_BRACE) { + while (isWhitespace(peek()) || peek() == LEFT_CURLY_BRACE) { parseOptionalWhitespace(status); // Restore precondition - CHECK_BOUNDS(source, index, parseError, status); - if (source[index] != LEFT_CURLY_BRACE) { + CHECK_BOUNDS(status); + if (peek() != LEFT_CURLY_BRACE) { // This is not necessarily an error, but rather, // means the whitespace we parsed was the optional // whitespace preceding the first variant, not the @@ -2065,29 +2091,21 @@ void Parser::parseSelectors(UErrorCode& status) { // At least one selector is required if (empty) { - ERROR(parseError, status, index); + ERROR(status); return; } #define CHECK_END_OF_INPUT \ - if (((int32_t)index) >= source.length()) { \ + if (!inBounds()) { \ break; \ } \ // Parse variants - while (isWhitespace(source[index]) || isKeyStart(source[index])) { - if (isWhitespace(source[index])) { - int32_t whitespaceStart = index; - parseOptionalWhitespace(status); - // Restore the precondition. - // Error out if we reached the end of input. The message - // cannot end with trailing whitespace if there are variants. - if (!inBounds(source, index)) { - // Use index of first whitespace for error message - index = whitespaceStart; - ERROR(parseError, status, index); - return; - } + while (isWhitespace(peek()) || isKeyStart(peek())) { + // Trailing whitespace is allowed + parseOptionalWhitespace(status); + if (!inBounds()) { + return; } // At least one key is required @@ -2100,7 +2118,7 @@ void Parser::parseSelectors(UErrorCode& status) { // Restore precondition before calling parsePattern() // (which must return a non-null value) - CHECK_BOUNDS(source, index, parseError, status); + CHECK_BOUNDS(status); Pattern rhs = parseQuotedPattern(status); dataModel.addVariant(std::move(keyList), std::move(rhs), status); @@ -2117,7 +2135,7 @@ void Parser::parseSelectors(UErrorCode& status) { /* Consume a `body` (matching the nonterminal in the grammar), - No postcondition (on return, `index` might equal `source.length()` with no syntax error, + No postcondition (on return, `index` might equal `len()` with no syntax error, because a message can end with a body (trailing whitespace is optional) */ @@ -2136,8 +2154,9 @@ void Parser::errorPattern(UErrorCode& status) { whether this is the intent behind the spec */ UnicodeString partStr(LEFT_CURLY_BRACE); - while (inBounds(source, index)) { - partStr += source[index++]; + while (inBounds()) { + partStr += peek(); + next(); } // Add curly braces around the entire output (same comment as above) partStr += RIGHT_CURLY_BRACE; @@ -2149,13 +2168,13 @@ void Parser::parseBody(UErrorCode& status) { CHECK_ERROR(status); // Out-of-input is a syntax warning - if (!inBounds(source, index)) { + if (!inBounds()) { errorPattern(status); return; } // Body must be either a pattern or selectors - switch (source[index]) { + switch (peek()) { case LEFT_CURLY_BRACE: { // Pattern dataModel.setPattern(parseQuotedPattern(status)); @@ -2167,7 +2186,7 @@ void Parser::parseBody(UErrorCode& status) { return; } default: { - ERROR(parseError, status, index); + ERROR(status); errorPattern(status); return; } @@ -2183,14 +2202,17 @@ void Parser::parse(UParseError &parseErrorResult, UErrorCode& status) { bool complex = false; // First, "look ahead" to determine if this is a simple or complex // message. To do that, check the first non-whitespace character. - while (inBounds(source, index) && isWhitespace(source[index])) { - index++; - } - if (inBounds(source, index)) { - if (source[index] == PERIOD - || (index < static_cast(source.length()) + 1 - && source[index] == LEFT_CURLY_BRACE - && source[index + 1] == LEFT_CURLY_BRACE)) { + while (inBounds(index) && isWhitespace(peek())) { + next(); + } + + // Message can be empty, so we need to only look ahead + // if we know it's non-empty + if (inBounds()) { + if (peek() == PERIOD + || (inBounds(1) + && peek() == LEFT_CURLY_BRACE + && peek(1) == LEFT_CURLY_BRACE)) { complex = true; } } @@ -2217,8 +2239,8 @@ void Parser::parse(UParseError &parseErrorResult, UErrorCode& status) { CHECK_ERROR(status); // There are no errors; finally, check that the entire input was consumed - if (static_cast(index) != source.length()) { - ERROR(parseError, status, index); + if (!allConsumed()) { + ERROR(status); } // Finally, copy the relevant fields of the internal `MessageParseError` diff --git a/icu4c/source/i18n/messageformat2_parser.h b/icu4c/source/i18n/messageformat2_parser.h index 9367c0e981dd..4b35760cb118 100644 --- a/icu4c/source/i18n/messageformat2_parser.h +++ b/icu4c/source/i18n/messageformat2_parser.h @@ -140,10 +140,21 @@ namespace message2 { SelectorKeys parseNonEmptyKeys(UErrorCode&); void errorPattern(UErrorCode& status); Pattern parseQuotedPattern(UErrorCode&); + bool isDeclarationStart(); + + UChar32 peek() const { return source.char32At(index) ; } + UChar32 peek(uint32_t i) const { + return source.char32At(source.moveIndex32(index, i)); + } + void next() { index = source.moveIndex32(index, 1); } + + bool inBounds() const { return (int32_t) index < source.length(); } + bool inBounds(uint32_t i) const { return source.moveIndex32(index, i) < source.length(); } + bool allConsumed() const { return (int32_t) index == source.length(); } // The input string const UnicodeString &source; - // The current position within the input string + // The current position within the input string -- counting in UChar32 uint32_t index; // Represents the current line (and when an error is indicated), // character offset within the line of the parse error diff --git a/icu4c/source/i18n/messageformat2_serializer.cpp b/icu4c/source/i18n/messageformat2_serializer.cpp index 82fa65859419..228cf34181cd 100644 --- a/icu4c/source/i18n/messageformat2_serializer.cpp +++ b/icu4c/source/i18n/messageformat2_serializer.cpp @@ -42,24 +42,26 @@ void Serializer::emit(const std::u16string_view& token) { void Serializer::emit(const Literal& l) { if (l.isQuoted()) { emit(PIPE); - const UnicodeString& contents = l.unquoted(); - for (int32_t i = 0; i < contents.length(); i++) { - // Re-escape any PIPE or BACKSLASH characters + } + const UnicodeString& contents = l.unquoted(); + for (int32_t i = 0; ((int32_t) i) < contents.length(); i++) { + // Re-escape any escaped-char characters switch(contents[i]) { case BACKSLASH: - case PIPE: { - emit(BACKSLASH); - break; + case PIPE: + case LEFT_CURLY_BRACE: + case RIGHT_CURLY_BRACE: { + emit(BACKSLASH); + break; } default: { - break; + break; } } emit(contents[i]); - } - emit(PIPE); - } else { - emit(l.unquoted()); + } + if (l.isQuoted()) { + emit(PIPE); } } @@ -194,9 +196,10 @@ void Serializer::emit(const PatternPart& part) { if (part.isText()) { // Raw text const UnicodeString& text = part.asText(); - // Re-escape '{'/'}'/'\' - for (int32_t i = 0; i < text.length(); i++) { + // Re-escape '{'/'}'/'\''|' + for (int32_t i = 0; ((int32_t) i) < text.length(); i++) { switch(text[i]) { + case PIPE: case BACKSLASH: case LEFT_CURLY_BRACE: case RIGHT_CURLY_BRACE: { diff --git a/icu4c/source/test/intltest/messageformat2test_read_json.cpp b/icu4c/source/test/intltest/messageformat2test_read_json.cpp index 0c65764e7fb5..53d25ebe715c 100644 --- a/icu4c/source/test/intltest/messageformat2test_read_json.cpp +++ b/icu4c/source/test/intltest/messageformat2test_read_json.cpp @@ -303,6 +303,8 @@ void TestMessageFormat2::jsonTestsFromFiles(IcuTestErrorCode& errorCode) { // Do tests for reserved statements/expressions runTestsFromJsonFile(*this, "spec/unsupported-expressions.json", errorCode); runTestsFromJsonFile(*this, "spec/unsupported-statements.json", errorCode); + // Uncomment when reserved syntax is removed + // runTestsFromJsonFile(*this, "syntax-errors-reserved.json", errorCode); // Do valid spec tests runTestsFromJsonFile(*this, "spec/syntax.json", errorCode); diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/message2/MFParser.java b/icu4j/main/core/src/main/java/com/ibm/icu/message2/MFParser.java index 8ba50d055c37..61d18ffde8a4 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/message2/MFParser.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/message2/MFParser.java @@ -613,10 +613,8 @@ private MFDataModel.Variant getVariant() throws MFParseException { } keys.add(key); } - // Only want to skip whitespace if we parsed at least one key - if (!keys.isEmpty()) { - skipOptionalWhitespaces(); - } + // Trailing whitespace is allowed after the message + skipOptionalWhitespaces(); if (input.atEnd()) { checkCondition( keys.isEmpty(), "After selector keys it is mandatory to have a pattern."); diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/message2/CoreTest.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/message2/CoreTest.java index a0ec7c2fe3d1..dd2698087be8 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/message2/CoreTest.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/message2/CoreTest.java @@ -40,6 +40,7 @@ public class CoreTest extends CoreTestFmwk { "syntax-errors-diagnostics.json", "syntax-errors-diagnostics-multiline.json", "syntax-errors-end-of-input.json", + "syntax-errors-reserved.json", "tricky-declarations.json", "valid-tests.json"}; diff --git a/testdata/message2/syntax-errors-diagnostics.json b/testdata/message2/syntax-errors-diagnostics.json index 5b68ae80f31b..79a29a027f9b 100644 --- a/testdata/message2/syntax-errors-diagnostics.json +++ b/testdata/message2/syntax-errors-diagnostics.json @@ -136,11 +136,6 @@ "char": 28, "comment": "Trailing characters that are not whitespace" }, - { - "src": ".match {$foo :string} {$bar :string} one * {{one}} * * {{other}} ", - "char": 66, - "comment": "Trailing whitespace at end of message should not be accepted either" - }, { "src": "empty { }", "char": 8, @@ -343,6 +338,10 @@ "src": ".match {1} {{_}}", "char": 12, "comment": "Disambiguating a wrong .match from an unsupported statement" + }, + { + "src": "{{{/p o4.􍅲 = 1}}}", + "char": 9 } ] } diff --git a/testdata/message2/syntax-errors-reserved.json b/testdata/message2/syntax-errors-reserved.json new file mode 100644 index 000000000000..e82421485d23 --- /dev/null +++ b/testdata/message2/syntax-errors-reserved.json @@ -0,0 +1,22 @@ +{ + "scenario": "Reserved and private annotations", + "description": "Tests for old unsupported expression (reserved/private) syntax -- these are now syntax errors", + "defaultTestProperties": { + "locale": "en-US", + "expErrors": [ + { + "type": "syntax-error" + } + ] + }, + "tests": [ + { "src" : "\\\\{ ? 󗟋 𫓜|@} |}" }, + { "src" : "\\\\{ ? 󗟋 𫓜|@} | \\} @𠟅򑚎𥪙𽧫=|| @򒘒򳷦㥞򉊷=$򸚶񽱆񅗽񤕞 @𰺱:񎫛񢶛򶈎񄮒}" }, + { "src" : "{ $iFN ^ @USh =$u @l}" }, + { "src" : ".local $dS ={ $p4 ^ |.| \\\\ @g:FV = $kd}{{@}}" }, + { "src" : ".D. \\\\ ||{1}{{}} " }, + { "src" : ".cIT ||{|@| % \\} } { *󔜫񥘃󸇀 }{{}}" }, + { "src" : ".򱋿󠆿 . |@| {$󛒁񓝖 & \\{ . @𯼼򋼡= $򵀀񓞈}{{\\\\}}" } + ] +} + diff --git a/testdata/message2/valid-tests.json b/testdata/message2/valid-tests.json index 5f1292e00896..4b1ce8d67fb6 100644 --- a/testdata/message2/valid-tests.json +++ b/testdata/message2/valid-tests.json @@ -123,6 +123,34 @@ "comment": "message -> complex-message -> *(declaration [s]) complex-body -> declaration complex-body -> input-declaration complex-body -> input variable-expression complex-body", "src": ".input{$x}{{}}", "exp": "" + }, + { + "comment": "Test that escaped characters are re-escaped properly when serializing literals", + "src": "{|\\}|}", + "exp": "}" + }, + { + "comment": "Test that escaped characters are re-escaped properly when serializing patterns", + "src": "\\|", + "exp": "|" + }, + { + "comment": "Test that parser and serializer treat optionally-escaped characters consistently", + "src" : "{{{1}|}}", + "exp": "1|" + }, + { + "comment": "Trailing whitespace after match is valid", + "src": ".match {1 :string} * {{}} ", + "exp": "" + }, + { + "src" : "𴢸", + "exp" : "𴢸" + }, + { + "src" : "{{􍅲}}", + "exp" : "􍅲" } ] }