From d9742058466bce4b2261b5d550fdd2a09db79211 Mon Sep 17 00:00:00 2001 From: Sebastien Ros Date: Fri, 6 Jun 2025 11:13:09 -0700 Subject: [PATCH] Skip incomplete parts of numbers --- src/Parlot/Scanner.cs | 1492 +++++++++++++++-------------- test/Parlot.Tests/ScannerTests.cs | 51 +- 2 files changed, 779 insertions(+), 764 deletions(-) diff --git a/src/Parlot/Scanner.cs b/src/Parlot/Scanner.cs index 5432ca9..b22fa31 100644 --- a/src/Parlot/Scanner.cs +++ b/src/Parlot/Scanner.cs @@ -1,741 +1,757 @@ -using System; - -using Parlot.Fluent; - -using System.Linq; - -#if NET8_0_OR_GREATER -using System.Buffers; -#endif -using System.Runtime.CompilerServices; - -namespace Parlot; - -/// -/// This class is used to return tokens extracted from the input buffer. -/// -public class Scanner -{ - public readonly string Buffer; - public readonly Cursor Cursor; - - /// - /// Scans some text. - /// - /// The string containing the text to scan. - public Scanner(string buffer) - { - Buffer = buffer ?? throw new ArgumentNullException(nameof(buffer)); - Cursor = new Cursor(Buffer, TextPosition.Start); - } - - /// - /// Reads any whitespace without generating a token. - /// - /// Whether some white space was read. - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public bool SkipWhiteSpaceOrNewLine() - { - if (!Character.IsWhiteSpaceOrNewLine(Cursor.Current)) - { - return false; - } - - var span = Cursor.Span; - var length = span.Length; - - for (var i = 1; i < length; i++) - { - var c = span[i]; - - if (!Character.IsWhiteSpaceOrNewLine(c)) - { - Cursor.Advance(i); - return true; - } - } - - Cursor.Advance(span.Length); - return true; - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public bool SkipWhiteSpace() - { - if (!Character.IsWhiteSpace(Cursor.Current)) - { - return false; - } - - var span = Cursor.Span; - var length = span.Length; - - for (var i = 1; i < length; i++) - { - var c = span[i]; - - if (!Character.IsWhiteSpace(c)) - { - if (i > 0) - { - Cursor.AdvanceNoNewLines(i); - return true; - } - - return false; - } - } - - Cursor.AdvanceNoNewLines(span.Length); - return true; - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public bool ReadFirstThenOthers(Func first, Func other) - => ReadFirstThenOthers(first, other, out _); - - public bool ReadFirstThenOthers(Func first, Func other, out ReadOnlySpan result) - { - if (!first(Cursor.Current)) - { - result = []; - return false; - } - - var start = Cursor.Offset; - - // At this point we have an identifier, read while it's an identifier part. - - Cursor.Advance(); - - ReadWhile(other, out _); - - result = Buffer.AsSpan(start, Cursor.Offset - start); - - return true; - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public bool ReadIdentifier() => ReadIdentifier(out _); - - public bool ReadIdentifier(out ReadOnlySpan result) - { - // perf: using Character.IsIdentifierStart instead of x => Character.IsIdentifierStart(x) induces some allocations - - return ReadFirstThenOthers(static x => Character.IsIdentifierStart(x), static x => Character.IsIdentifierPart(x), out result); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public bool ReadDecimal() => ReadDecimal(out _); - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public bool ReadDecimal(out ReadOnlySpan number) => ReadDecimal(true, true, false, true, out number); - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public bool ReadDecimal(NumberOptions numberOptions, out ReadOnlySpan number, char decimalSeparator = '.', char groupSeparator = ',') - { - return ReadDecimal( - (numberOptions & NumberOptions.AllowLeadingSign) != 0, - (numberOptions & NumberOptions.AllowDecimalSeparator) != 0, - (numberOptions & NumberOptions.AllowGroupSeparators) != 0, - (numberOptions & NumberOptions.AllowExponent) != 0, - out number, - decimalSeparator, - groupSeparator); - } - - public bool ReadDecimal(bool allowLeadingSign, bool allowDecimalSeparator, bool allowGroupSeparator, bool allowExponent, out ReadOnlySpan number, char decimalSeparator = '.', char groupSeparator = ',') - { - var start = Cursor.Position; - - if (allowLeadingSign) - { - if (Cursor.Current is '-' or '+') - { - Cursor.AdvanceNoNewLines(1); - } - } - - if (!ReadInteger(out number)) - { - // If there is no number, check if the decimal separator is allowed and present, otherwise fail - - if (!allowDecimalSeparator || Cursor.Current != decimalSeparator) - { - Cursor.ResetPosition(start); - return false; - } - } - - // Number can be empty if we have a decimal separator directly, in this case don't expect group separators - if (!number.IsEmpty && allowGroupSeparator && Cursor.Current == groupSeparator) - { - var savedCursor = Cursor.Position; - // Group separators can be repeated as many times - while (true) - { - if (Cursor.Current == groupSeparator) - { - Cursor.AdvanceNoNewLines(1); - } - else if (!ReadInteger()) - { - // it was not a group separator, really, so go back where the symbol was and stop - Cursor.ResetPosition(savedCursor); - break; +using System; +using Parlot.Fluent; +using System.Linq; + +#if NET8_0_OR_GREATER +using System.Buffers; +#endif +using System.Runtime.CompilerServices; + +namespace Parlot; + +/// +/// This class is used to return tokens extracted from the input buffer. +/// +public class Scanner +{ + public readonly string Buffer; + public readonly Cursor Cursor; + + /// + /// Scans some text. + /// + /// The string containing the text to scan. + public Scanner(string buffer) + { + Buffer = buffer ?? throw new ArgumentNullException(nameof(buffer)); + Cursor = new Cursor(Buffer, TextPosition.Start); + } + + /// + /// Reads any whitespace without generating a token. + /// + /// Whether some white space was read. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool SkipWhiteSpaceOrNewLine() + { + if (!Character.IsWhiteSpaceOrNewLine(Cursor.Current)) + { + return false; + } + + var span = Cursor.Span; + var length = span.Length; + + for (var i = 1; i < length; i++) + { + var c = span[i]; + + if (!Character.IsWhiteSpaceOrNewLine(c)) + { + Cursor.Advance(i); + return true; + } + } + + Cursor.Advance(span.Length); + return true; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool SkipWhiteSpace() + { + if (!Character.IsWhiteSpace(Cursor.Current)) + { + return false; + } + + var span = Cursor.Span; + var length = span.Length; + + for (var i = 1; i < length; i++) + { + var c = span[i]; + + if (!Character.IsWhiteSpace(c)) + { + if (i > 0) + { + Cursor.AdvanceNoNewLines(i); + return true; + } + + return false; + } + } + + Cursor.AdvanceNoNewLines(span.Length); + return true; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool ReadFirstThenOthers(Func first, Func other) + => ReadFirstThenOthers(first, other, out _); + + public bool ReadFirstThenOthers(Func first, Func other, out ReadOnlySpan result) + { + if (!first(Cursor.Current)) + { + result = []; + return false; + } + + var start = Cursor.Offset; + + // At this point we have an identifier, read while it's an identifier part. + + Cursor.Advance(); + + ReadWhile(other, out _); + + result = Buffer.AsSpan(start, Cursor.Offset - start); + + return true; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool ReadIdentifier() => ReadIdentifier(out _); + + public bool ReadIdentifier(out ReadOnlySpan result) + { + // perf: using Character.IsIdentifierStart instead of x => Character.IsIdentifierStart(x) induces some allocations + + return ReadFirstThenOthers(static x => Character.IsIdentifierStart(x), static x => Character.IsIdentifierPart(x), out result); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool ReadDecimal() => ReadDecimal(out _); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool ReadDecimal(out ReadOnlySpan number) => ReadDecimal(true, true, false, true, out number); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool ReadDecimal(NumberOptions numberOptions, out ReadOnlySpan number, char decimalSeparator = '.', char groupSeparator = ',') + { + return ReadDecimal( + (numberOptions & NumberOptions.AllowLeadingSign) != 0, + (numberOptions & NumberOptions.AllowDecimalSeparator) != 0, + (numberOptions & NumberOptions.AllowGroupSeparators) != 0, + (numberOptions & NumberOptions.AllowExponent) != 0, + out number, + decimalSeparator, + groupSeparator); + } + + public bool ReadDecimal(bool allowLeadingSign, bool allowDecimalSeparator, bool allowGroupSeparator, bool allowExponent, out ReadOnlySpan number, char decimalSeparator = '.', char groupSeparator = ',') + { + // The buffer is read while the value is a valid decimal number. For instance `123,a` will return `123`. + + var start = Cursor.Position; + + if (allowLeadingSign) + { + if (Cursor.Current is '-' or '+') + { + Cursor.AdvanceNoNewLines(1); + } + } + + if (!ReadInteger(out number)) + { + // If there is no number, check if the decimal separator is allowed and present, otherwise fail + if (!allowDecimalSeparator || Cursor.Current != decimalSeparator) + { + Cursor.ResetPosition(start); + return false; + } + } + + // Number can be empty if we have a decimal separator directly, in this case don't expect group separators + if (!number.IsEmpty && allowGroupSeparator && Cursor.Current == groupSeparator) + { + var beforeGroupPosition = Cursor.Position; + + // Group separators can be repeated as many times + while (true) + { + if (Cursor.Current == groupSeparator) + { + Cursor.AdvanceNoNewLines(1); + } + else if (!ReadInteger()) + { + // it was not a group separator so go back where the symbol was and stop + Cursor.ResetPosition(beforeGroupPosition); + break; } else { - savedCursor = Cursor.Position; - } - } - } - - if (allowDecimalSeparator) - { - if (Cursor.Current == decimalSeparator) - { - Cursor.AdvanceNoNewLines(1); - - ReadInteger(out number); - } - } - - if (allowExponent && (Cursor.Current is 'e' or 'E')) - { - Cursor.AdvanceNoNewLines(1); - - if (Cursor.Current is '-' or '+') - { - Cursor.AdvanceNoNewLines(1); - } - - // The exponent must be followed by a number, without a group separator - if (!ReadInteger(out _)) - { - Cursor.ResetPosition(start); - return false; - } - } - - number = Cursor.Buffer.AsSpan(start.Offset, Cursor.Offset - start.Offset); - return true; - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public bool ReadInteger() => ReadInteger(out _); - -#if NET8_0_OR_GREATER - public bool ReadInteger(out ReadOnlySpan result) - { - var span = Cursor.Span; - - var noDigitIndex = span.IndexOfAnyExcept(Character._decimalDigits); - - // If first char is not a digit, fail - if (noDigitIndex == 0 || span.IsEmpty) - { - result = []; - return false; - } - - // If all chars are digits - if (noDigitIndex == -1) - { - result = span; - } - else - { - result = span[..noDigitIndex]; - } - - Cursor.AdvanceNoNewLines(result.Length); - - return true; - } -#else - public bool ReadInteger(out ReadOnlySpan result) - { - var next = 0; - while (Character.IsDecimalDigit(Cursor.PeekNext(next))) - { - next += 1; - } - - // Not digit was read - if (next == 0) - { - result = []; - return false; - } - - Cursor.AdvanceNoNewLines(next); - result = Buffer.AsSpan(Cursor.Offset - next, next); - - return true; - } -#endif - - /// - /// Reads a token while the specific predicate is valid. - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public bool ReadWhile(Func predicate) => ReadWhile(predicate, out _); - - /// - /// Reads a token while the specific predicate is valid. - /// - public bool ReadWhile(Func predicate, out ReadOnlySpan result) - { - if (Cursor.Eof || !predicate(Cursor.Current)) - { - result = []; - return false; - } - - var start = Cursor.Offset; - - Cursor.Advance(); - - while (!Cursor.Eof && predicate(Cursor.Current)) - { - Cursor.Advance(); - } - - result = Buffer.AsSpan(start, Cursor.Offset - start); - - return true; - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public bool ReadNonWhiteSpace() => ReadNonWhiteSpace(out _); - - public bool ReadNonWhiteSpace(out ReadOnlySpan result) - { - return ReadWhile(static x => !Character.IsWhiteSpace(x), out result); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public bool ReadNonWhiteSpaceOrNewLine() => ReadNonWhiteSpaceOrNewLine(out _); - - public bool ReadNonWhiteSpaceOrNewLine(out ReadOnlySpan result) - { - return ReadWhile(static x => !Character.IsWhiteSpaceOrNewLine(x), out result); - } - - /// - /// Reads the specified text. - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public bool ReadChar(char c) - { - if (!Cursor.Match(c)) - { - return false; - } - - Cursor.Advance(); - return true; - } - - /// - /// Reads the specified text. - /// - public bool ReadChar(char c, out ReadOnlySpan result) - { - if (!Cursor.Match(c)) - { - result = []; - return false; - } - - var start = Cursor.Offset; - Cursor.Advance(); - - result = Buffer.AsSpan(start, Cursor.Offset - start); - return true; - } - - /// - /// Reads the specific expected text. - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public bool ReadText(ReadOnlySpan text, StringComparison comparisonType) => ReadText(text, comparisonType, out _); - - /// - /// Reads the specific expected text. - /// - public bool ReadText(ReadOnlySpan text, StringComparison comparisonType, out ReadOnlySpan result) - { - if (!Cursor.Match(text, comparisonType)) - { - result = []; - return false; - } - - var start = Cursor.Offset; - Cursor.Advance(text.Length); - result = Buffer.AsSpan(start, Cursor.Offset - start); - - return true; - } - - /// - /// Reads the specific expected chars. - /// - [Obsolete("Prefer bool ReadAnyOf(ReadOnlySpan, out ReadOnlySpan)")] - public bool ReadAnyOf(ReadOnlySpan chars, StringComparison comparisonType, out ReadOnlySpan result) - { - var current = Cursor.Buffer.AsSpan(Cursor.Offset, 1); - - var index = chars.IndexOf(current, comparisonType); - - if (index == -1) - { - result = []; - return false; - } - - var start = Cursor.Offset; - Cursor.Advance(index + 1); - result = Cursor.Buffer.AsSpan(start, index + 1); - - return true; - } - - /// - /// Reads the specific expected chars. - /// - public bool ReadAnyOf(ReadOnlySpan chars, out ReadOnlySpan result) - { - var start = Cursor.Offset; - - while (true) - { - var current = Cursor.Current; - var index = chars.IndexOf(current); - - if (index == -1) - { - if (Cursor.Offset == start) - { - result = []; - return false; - } - - var length = Cursor.Offset - start; - - result = Cursor.Buffer.AsSpan(start, length); - return true; - } - - if (Cursor.Eof) - { - result = []; - return false; - } - - Cursor.Advance(1); - } - } - -#if NET8_0_OR_GREATER - /// - /// Reads the specific expected chars. - /// - /// - /// This overload uses as this shouldn't be created on every call. The actual implementation of - /// is chosen based on the constituents of the list. The caller should thus reuse the instance. - /// - public bool ReadAnyOf(SearchValues values, out ReadOnlySpan result) - { - var span = Cursor.Span; - - var notInRangeIndex = span.IndexOfAnyExcept(values); - - // If first char is not in range - if (notInRangeIndex == 0 || span.IsEmpty) - { - result = []; - return false; - } - - // All chars match - if (notInRangeIndex == -1) - { - result = span; - } - else - { - result = span[..notInRangeIndex]; - } - - Cursor.Advance(result.Length); - - return true; - } -#endif - - /// - /// Reads the specific expected text. - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public bool ReadText(ReadOnlySpan text) => ReadText(text, out _); - - /// - /// Reads the specific expected text. - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public bool ReadText(ReadOnlySpan text, out ReadOnlySpan result) => ReadText(text, comparisonType: StringComparison.Ordinal, out result); - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public bool ReadSingleQuotedString() => ReadSingleQuotedString(out _); - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public bool ReadSingleQuotedString(out ReadOnlySpan result) - { - return ReadQuotedString('\'', out result); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public bool ReadDoubleQuotedString() => ReadDoubleQuotedString(out _); - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public bool ReadDoubleQuotedString(out ReadOnlySpan result) - { - return ReadQuotedString('\"', out result); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public bool ReadBacktickString() => ReadBacktickString(out _); - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public bool ReadBacktickString(out ReadOnlySpan result) - { - return ReadQuotedString('`', out result); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public bool ReadQuotedString() => ReadQuotedString(out _); - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public bool ReadQuotedString(char[] quoteChar) => ReadQuotedString(quoteChar, out _); - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public bool ReadQuotedString(char[] quoteChar, out ReadOnlySpan result) - { - var startChar = Cursor.Current; - - if (!quoteChar.Contains( startChar )) - { - result = []; - return false; - } - - return ReadQuotedString(startChar, out result); - } - - public bool ReadQuotedString(out ReadOnlySpan result) => ReadQuotedString(['\'', '\"'],out result); - - /// - /// Reads a string token enclosed in quotes or custom characters. - /// - /// - /// This method doesn't escape the string, but only validates its content is syntactically correct. - /// The resulting Span contains the original quotes. - /// - public bool ReadQuotedString(char quoteChar, out ReadOnlySpan result) - { - var startChar = Cursor.Current; - var start = Cursor.Position; - - if (startChar != quoteChar) - { - result = []; - return false; - } - - var nextQuote = Cursor.Span.Slice(1).IndexOf(startChar); - - if (nextQuote == -1) - { - // There is no end quote, not a string - result = []; - return false; - } - - var nextEscape = Cursor.Span.IndexOf('\\'); - - // If the next escape is not before the next quote, we can return the string as-is - if (nextEscape == -1 || nextEscape > nextQuote) - { - Cursor.Advance(nextQuote + 2); // include start quote - - result = Cursor.Buffer.AsSpan().Slice(start.Offset, nextQuote + 2); - return true; - } - - while (nextEscape != -1) - { - Cursor.Advance(nextEscape); - - // We can read Eof if there is an escaped quote sequence and no actual end quote, e.g. "'abc\'def" - if (Cursor.Eof) - { - Cursor.ResetPosition(start); - - result = []; - return false; - } - - if (Cursor.Match('\\')) - { - Cursor.Advance(); - - switch (Cursor.Current) - { - case '0': - case '\\': - case 'a': - case 'b': - case 'f': - case 'n': - case 'r': - case 't': - case 'v': - case '\'': - case '"': - Cursor.Advance(); - break; - - case 'u': - - // https://stackoverflow.com/a/32175520/142772 - // exactly 4 digits -#if NET8_0_OR_GREATER - var allHexDigits = Cursor.Span.Length > 4 && Cursor.Span.Slice(1, 4).IndexOfAnyExcept(Character._hexDigits) == -1; - var isValidUnicode = allHexDigits; - - if (!isValidUnicode) - { - Cursor.ResetPosition(start); - - result = []; - return false; - } - - // Advance the cursor by the 4 digits - Cursor.Advance(4); -#else - var isValidUnicode = false; - - Cursor.Advance(); - - if (!Cursor.Eof && Character.IsHexDigit(Cursor.Current)) - { - Cursor.Advance(); - if (!Cursor.Eof && Character.IsHexDigit(Cursor.Current)) - { - Cursor.Advance(); - if (!Cursor.Eof && Character.IsHexDigit(Cursor.Current)) - { - Cursor.Advance(); - if (!Cursor.Eof && Character.IsHexDigit(Cursor.Current)) - { - isValidUnicode = true; - } - } - } - } - - if (!isValidUnicode) - { - Cursor.ResetPosition(start); - - result = []; - return false; - } -#endif - break; - case 'x': - - // At least one digits -#if NET8_0_OR_GREATER - var firstNonHexDigit = Cursor.Span.Length > 1 ? Cursor.Span.Slice(1).IndexOfAnyExcept(Character._hexDigits) : -1; - var isValidHex = firstNonHexDigit > 0; - - if (!isValidHex) - { - Cursor.ResetPosition(start); - - result = []; - return false; - } - - // Advance the cursor for the read digits - Cursor.Advance(firstNonHexDigit); -#else - var isValidHex = false; - - Cursor.Advance(); - - if (!Cursor.Eof && Character.IsHexDigit(Cursor.Current)) - { - isValidHex = true; - - if (!Cursor.Eof && Character.IsHexDigit(Cursor.PeekNext())) - { - Cursor.Advance(); - - if (!Cursor.Eof && Character.IsHexDigit(Cursor.PeekNext())) - { - Cursor.Advance(); - - if (!Cursor.Eof && Character.IsHexDigit(Cursor.PeekNext())) - { - Cursor.Advance(); - } - } - } - } - - if (!isValidHex) - { - Cursor.ResetPosition(start); - - result = []; - return false; - } -#endif - - break; - default: - Cursor.ResetPosition(start); - - result = []; - return false; - } - } - - nextEscape = Cursor.Span.IndexOfAny('\\', startChar); - - if (Cursor.Match(startChar)) - { - // Read end quote - Cursor.Advance(1); - break; - } - else if (nextEscape == -1) - { - Cursor.ResetPosition(start); - - result = []; - return false; - } - } - - result = Cursor.Buffer.AsSpan()[start.Offset..Cursor.Offset]; - - return true; - } -} + beforeGroupPosition = Cursor.Position; + } + } + } + + var beforeDecimalSeparator = Cursor.Position; + + if (allowDecimalSeparator && Cursor.Current == decimalSeparator) + { + Cursor.AdvanceNoNewLines(1); + + var numberIsEmpty = number.IsEmpty; + + if (!ReadInteger(out number)) + { + Cursor.ResetPosition(beforeDecimalSeparator); + + // A decimal separator must be followed by a number if there is no integral part, e.g. `[NaN].[NaN]` + if (numberIsEmpty) + { + return false; + } + + number = Cursor.Buffer.AsSpan(start.Offset, Cursor.Offset - start.Offset); + return true; + } + } + + var beforeExponent = Cursor.Position; + + if (allowExponent && (Cursor.Current is 'e' or 'E')) + { + Cursor.AdvanceNoNewLines(1); + + if (Cursor.Current is '-' or '+') + { + Cursor.AdvanceNoNewLines(1); + } + + // The exponent must be followed by a number, without a group separator, otherwise backtrack to before the exponent + if (!ReadInteger(out _)) + { + Cursor.ResetPosition(beforeExponent); + number = Cursor.Buffer.AsSpan(start.Offset, Cursor.Offset - start.Offset); + return true; + } + } + + number = Cursor.Buffer.AsSpan(start.Offset, Cursor.Offset - start.Offset); + return true; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool ReadInteger() => ReadInteger(out _); + +#if NET8_0_OR_GREATER + public bool ReadInteger(out ReadOnlySpan result) + { + var span = Cursor.Span; + + var noDigitIndex = span.IndexOfAnyExcept(Character._decimalDigits); + + // If first char is not a digit, fail + if (noDigitIndex == 0 || span.IsEmpty) + { + result = []; + return false; + } + + // If all chars are digits + if (noDigitIndex == -1) + { + result = span; + } + else + { + result = span[..noDigitIndex]; + } + + Cursor.AdvanceNoNewLines(result.Length); + + return true; + } +#else + public bool ReadInteger(out ReadOnlySpan result) + { + var next = 0; + while (Character.IsDecimalDigit(Cursor.PeekNext(next))) + { + next += 1; + } + + // Not digit was read + if (next == 0) + { + result = []; + return false; + } + + Cursor.AdvanceNoNewLines(next); + result = Buffer.AsSpan(Cursor.Offset - next, next); + + return true; + } +#endif + + /// + /// Reads a token while the specific predicate is valid. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool ReadWhile(Func predicate) => ReadWhile(predicate, out _); + + /// + /// Reads a token while the specific predicate is valid. + /// + public bool ReadWhile(Func predicate, out ReadOnlySpan result) + { + if (Cursor.Eof || !predicate(Cursor.Current)) + { + result = []; + return false; + } + + var start = Cursor.Offset; + + Cursor.Advance(); + + while (!Cursor.Eof && predicate(Cursor.Current)) + { + Cursor.Advance(); + } + + result = Buffer.AsSpan(start, Cursor.Offset - start); + + return true; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool ReadNonWhiteSpace() => ReadNonWhiteSpace(out _); + + public bool ReadNonWhiteSpace(out ReadOnlySpan result) + { + return ReadWhile(static x => !Character.IsWhiteSpace(x), out result); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool ReadNonWhiteSpaceOrNewLine() => ReadNonWhiteSpaceOrNewLine(out _); + + public bool ReadNonWhiteSpaceOrNewLine(out ReadOnlySpan result) + { + return ReadWhile(static x => !Character.IsWhiteSpaceOrNewLine(x), out result); + } + + /// + /// Reads the specified text. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool ReadChar(char c) + { + if (!Cursor.Match(c)) + { + return false; + } + + Cursor.Advance(); + return true; + } + + /// + /// Reads the specified text. + /// + public bool ReadChar(char c, out ReadOnlySpan result) + { + if (!Cursor.Match(c)) + { + result = []; + return false; + } + + var start = Cursor.Offset; + Cursor.Advance(); + + result = Buffer.AsSpan(start, Cursor.Offset - start); + return true; + } + + /// + /// Reads the specific expected text. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool ReadText(ReadOnlySpan text, StringComparison comparisonType) => ReadText(text, comparisonType, out _); + + /// + /// Reads the specific expected text. + /// + public bool ReadText(ReadOnlySpan text, StringComparison comparisonType, out ReadOnlySpan result) + { + if (!Cursor.Match(text, comparisonType)) + { + result = []; + return false; + } + + var start = Cursor.Offset; + Cursor.Advance(text.Length); + result = Buffer.AsSpan(start, Cursor.Offset - start); + + return true; + } + + /// + /// Reads the specific expected chars. + /// + [Obsolete("Prefer bool ReadAnyOf(ReadOnlySpan, out ReadOnlySpan)")] + public bool ReadAnyOf(ReadOnlySpan chars, StringComparison comparisonType, out ReadOnlySpan result) + { + var current = Cursor.Buffer.AsSpan(Cursor.Offset, 1); + + var index = chars.IndexOf(current, comparisonType); + + if (index == -1) + { + result = []; + return false; + } + + var start = Cursor.Offset; + Cursor.Advance(index + 1); + result = Cursor.Buffer.AsSpan(start, index + 1); + + return true; + } + + /// + /// Reads the specific expected chars. + /// + public bool ReadAnyOf(ReadOnlySpan chars, out ReadOnlySpan result) + { + var start = Cursor.Offset; + + while (true) + { + var current = Cursor.Current; + var index = chars.IndexOf(current); + + if (index == -1) + { + if (Cursor.Offset == start) + { + result = []; + return false; + } + + var length = Cursor.Offset - start; + + result = Cursor.Buffer.AsSpan(start, length); + return true; + } + + if (Cursor.Eof) + { + result = []; + return false; + } + + Cursor.Advance(1); + } + } + +#if NET8_0_OR_GREATER + /// + /// Reads the specific expected chars. + /// + /// + /// This overload uses as this shouldn't be created on every call. The actual implementation of + /// is chosen based on the constituents of the list. The caller should thus reuse the instance. + /// + public bool ReadAnyOf(SearchValues values, out ReadOnlySpan result) + { + var span = Cursor.Span; + + var notInRangeIndex = span.IndexOfAnyExcept(values); + + // If first char is not in range + if (notInRangeIndex == 0 || span.IsEmpty) + { + result = []; + return false; + } + + // All chars match + if (notInRangeIndex == -1) + { + result = span; + } + else + { + result = span[..notInRangeIndex]; + } + + Cursor.Advance(result.Length); + + return true; + } +#endif + + /// + /// Reads the specific expected text. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool ReadText(ReadOnlySpan text) => ReadText(text, out _); + + /// + /// Reads the specific expected text. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool ReadText(ReadOnlySpan text, out ReadOnlySpan result) => ReadText(text, comparisonType: StringComparison.Ordinal, out result); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool ReadSingleQuotedString() => ReadSingleQuotedString(out _); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool ReadSingleQuotedString(out ReadOnlySpan result) + { + return ReadQuotedString('\'', out result); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool ReadDoubleQuotedString() => ReadDoubleQuotedString(out _); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool ReadDoubleQuotedString(out ReadOnlySpan result) + { + return ReadQuotedString('\"', out result); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool ReadBacktickString() => ReadBacktickString(out _); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool ReadBacktickString(out ReadOnlySpan result) + { + return ReadQuotedString('`', out result); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool ReadQuotedString() => ReadQuotedString(out _); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool ReadQuotedString(char[] quoteChar) => ReadQuotedString(quoteChar, out _); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool ReadQuotedString(char[] quoteChar, out ReadOnlySpan result) + { + var startChar = Cursor.Current; + + if (!quoteChar.Contains( startChar )) + { + result = []; + return false; + } + + return ReadQuotedString(startChar, out result); + } + + public bool ReadQuotedString(out ReadOnlySpan result) => ReadQuotedString(['\'', '\"'],out result); + + /// + /// Reads a string token enclosed in quotes or custom characters. + /// + /// + /// This method doesn't escape the string, but only validates its content is syntactically correct. + /// The resulting Span contains the original quotes. + /// + public bool ReadQuotedString(char quoteChar, out ReadOnlySpan result) + { + var startChar = Cursor.Current; + var start = Cursor.Position; + + if (startChar != quoteChar) + { + result = []; + return false; + } + + var nextQuote = Cursor.Span.Slice(1).IndexOf(startChar); + + if (nextQuote == -1) + { + // There is no end quote, not a string + result = []; + return false; + } + + var nextEscape = Cursor.Span.IndexOf('\\'); + + // If the next escape is not before the next quote, we can return the string as-is + if (nextEscape == -1 || nextEscape > nextQuote) + { + Cursor.Advance(nextQuote + 2); // include start quote + + result = Cursor.Buffer.AsSpan().Slice(start.Offset, nextQuote + 2); + return true; + } + + while (nextEscape != -1) + { + Cursor.Advance(nextEscape); + + // We can read Eof if there is an escaped quote sequence and no actual end quote, e.g. "'abc\'def" + if (Cursor.Eof) + { + Cursor.ResetPosition(start); + + result = []; + return false; + } + + if (Cursor.Match('\\')) + { + Cursor.Advance(); + + switch (Cursor.Current) + { + case '0': + case '\\': + case 'a': + case 'b': + case 'f': + case 'n': + case 'r': + case 't': + case 'v': + case '\'': + case '"': + Cursor.Advance(); + break; + + case 'u': + + // https://stackoverflow.com/a/32175520/142772 + // exactly 4 digits +#if NET8_0_OR_GREATER + var allHexDigits = Cursor.Span.Length > 4 && Cursor.Span.Slice(1, 4).IndexOfAnyExcept(Character._hexDigits) == -1; + var isValidUnicode = allHexDigits; + + if (!isValidUnicode) + { + Cursor.ResetPosition(start); + + result = []; + return false; + } + + // Advance the cursor by the 4 digits + Cursor.Advance(4); +#else + var isValidUnicode = false; + + Cursor.Advance(); + + if (!Cursor.Eof && Character.IsHexDigit(Cursor.Current)) + { + Cursor.Advance(); + if (!Cursor.Eof && Character.IsHexDigit(Cursor.Current)) + { + Cursor.Advance(); + if (!Cursor.Eof && Character.IsHexDigit(Cursor.Current)) + { + Cursor.Advance(); + if (!Cursor.Eof && Character.IsHexDigit(Cursor.Current)) + { + isValidUnicode = true; + } + } + } + } + + if (!isValidUnicode) + { + Cursor.ResetPosition(start); + + result = []; + return false; + } +#endif + break; + case 'x': + + // At least one digits +#if NET8_0_OR_GREATER + var firstNonHexDigit = Cursor.Span.Length > 1 ? Cursor.Span.Slice(1).IndexOfAnyExcept(Character._hexDigits) : -1; + var isValidHex = firstNonHexDigit > 0; + + if (!isValidHex) + { + Cursor.ResetPosition(start); + + result = []; + return false; + } + + // Advance the cursor for the read digits + Cursor.Advance(firstNonHexDigit); +#else + var isValidHex = false; + + Cursor.Advance(); + + if (!Cursor.Eof && Character.IsHexDigit(Cursor.Current)) + { + isValidHex = true; + + if (!Cursor.Eof && Character.IsHexDigit(Cursor.PeekNext())) + { + Cursor.Advance(); + + if (!Cursor.Eof && Character.IsHexDigit(Cursor.PeekNext())) + { + Cursor.Advance(); + + if (!Cursor.Eof && Character.IsHexDigit(Cursor.PeekNext())) + { + Cursor.Advance(); + } + } + } + } + + if (!isValidHex) + { + Cursor.ResetPosition(start); + + result = []; + return false; + } +#endif + + break; + default: + Cursor.ResetPosition(start); + + result = []; + return false; + } + } + + nextEscape = Cursor.Span.IndexOfAny('\\', startChar); + + if (Cursor.Match(startChar)) + { + // Read end quote + Cursor.Advance(1); + break; + } + else if (nextEscape == -1) + { + Cursor.ResetPosition(start); + + result = []; + return false; + } + } + + result = Cursor.Buffer.AsSpan()[start.Offset..Cursor.Offset]; + + return true; + } +} diff --git a/test/Parlot.Tests/ScannerTests.cs b/test/Parlot.Tests/ScannerTests.cs index 30feb30..b69e096 100644 --- a/test/Parlot.Tests/ScannerTests.cs +++ b/test/Parlot.Tests/ScannerTests.cs @@ -1,3 +1,4 @@ +using Parlot.Tests.Calc; using System; using System.Buffers; @@ -294,6 +295,7 @@ public void ShouldNotReadInvalidInteger(string text) [InlineData("123a", "123")] [InlineData("123.0", "123")] [InlineData("123.0a", "123")] + [InlineData("123.", "123")] [InlineData("123 ", "123")] public void ShouldReadValidInteger(string text, string expected) { @@ -303,10 +305,12 @@ public void ShouldReadValidInteger(string text, string expected) [Theory] [InlineData(" 1")] - [InlineData("123.e")] + [InlineData("abc")] + [InlineData(".")] + [InlineData(",")] public void ShouldNotReadInvalidDecimal(string text) { - Assert.False(new Scanner(text).ReadDecimal()); + Assert.False(new Scanner(text).ReadDecimal(Fluent.NumberOptions.Any, groupSeparator: ',', decimalSeparator: '.', number: out _)); } [Theory] @@ -377,39 +381,34 @@ public void ShouldReadNumberWithMultipleGroupSeparators(string input, string exp } [Theory] - [InlineData("123,", "123", ",")] - [InlineData("123,a", "123", ",a")] - public void ShouldReadNumberWithTrailingDecimalSeparators(string input, string expected, string expected2) + [InlineData("123", "123")] + [InlineData("123,123", "123,123")] + [InlineData("123,a", "123")] + [InlineData("123,123,a", "123,123")] + [InlineData("123,123,123", "123,123,123")] + [InlineData("123,.1", "123")] + [InlineData("123,.e", "123")] + [InlineData("123,e", "123")] + [InlineData("123,", "123")] + public void ShouldReadDecimalWithGroupSeparator(string input, string expected) { Scanner s = new(input); - Assert.True(s.ReadDecimal(Fluent.NumberOptions.AllowDecimalSeparator, out var result)); + Assert.True(s.ReadDecimal(Fluent.NumberOptions.AllowGroupSeparators | Fluent.NumberOptions.AllowDecimalSeparator, out var result, groupSeparator: ',', decimalSeparator: '.')); Assert.Equal(expected, result); - Assert.True(s.ReadNonWhiteSpace(out var result2)); - Assert.Equal(expected2, result2); } [Theory] - [InlineData("1, 2, 3", "1", "2", "3")] - public void ShouldReadNumberListWithDecimalSeparators(string input, string expected1, string expected2, string expected3) + [InlineData("123.456", "123.456")] + [InlineData("123.456a", "123.456")] + [InlineData("123.a", "123")] + [InlineData("123.456.789", "123.456")] + [InlineData("123.", "123")] + public void ShouldReadDecimalWithDecimalSeparator(string input, string expected) { Scanner s = new(input); - Assert.True(s.ReadDecimal(Fluent.NumberOptions.AllowDecimalSeparator, out var result)); - Assert.Equal(expected1, result); - Assert.True(s.ReadNonWhiteSpace(out var resultSep)); - Assert.Equal(",", resultSep); - Assert.True(s.SkipWhiteSpace()); - - Assert.True(s.ReadDecimal(Fluent.NumberOptions.AllowDecimalSeparator, out result)); - Assert.Equal(expected2, result); - Assert.True(s.ReadNonWhiteSpace(out resultSep)); - Assert.Equal(",", resultSep); - Assert.True(s.SkipWhiteSpace()); - - Assert.True(s.ReadDecimal(Fluent.NumberOptions.AllowDecimalSeparator, out result)); - Assert.Equal(expected3, result); + Assert.True(s.ReadDecimal(Fluent.NumberOptions.AllowDecimalSeparator, out var result, decimalSeparator: '.')); + Assert.Equal(expected, result); } - - }