Skip to content

Commit 9e302a5

Browse files
Share unified UTF-8/16 parsing with BigInteger (#95402)
* Move IUtfChar implementation from CoreLib
* Polyfill CoreLib internals
* Move AllowHyphenDuringParsing to method for polyfill
* Re-apply exponent limitation change
* Format

---------

Co-authored-by: Tanner Gooding <[email protected]>
1 parent 64ba05c commit 9e302a5

File tree

6 files changed

+199
-424
lines changed

6 files changed

+199
-424
lines changed

src/libraries/Common/src/System/Number.Parsing.Common.cs

Lines changed: 72 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@ namespace System
1010
{
1111
internal static partial class Number
1212
{
13-
private static unsafe bool TryParseNumber(scoped ref char* str, char* strEnd, NumberStyles styles, ref NumberBuffer number, NumberFormatInfo info)
13+
private static unsafe bool TryParseNumber<TChar>(scoped ref TChar* str, TChar* strEnd, NumberStyles styles, ref NumberBuffer number, NumberFormatInfo info)
14+
where TChar : unmanaged, IUtfChar<TChar>
1415
{
1516
Debug.Assert(str != null);
1617
Debug.Assert(strEnd != null);
@@ -31,39 +32,39 @@ private static unsafe bool TryParseNumber(scoped ref char* str, char* strEnd, Nu
3132

3233
number.CheckConsistency();
3334

34-
string decSep; // decimal separator from NumberFormatInfo.
35-
string groupSep; // group separator from NumberFormatInfo.
36-
string? currSymbol = null; // currency symbol from NumberFormatInfo.
35+
ReadOnlySpan<TChar> decSep; // decimal separator from NumberFormatInfo.
36+
ReadOnlySpan<TChar> groupSep; // group separator from NumberFormatInfo.
37+
ReadOnlySpan<TChar> currSymbol = ReadOnlySpan<TChar>.Empty; // currency symbol from NumberFormatInfo.
3738

3839
bool parsingCurrency = false;
3940
if ((styles & NumberStyles.AllowCurrencySymbol) != 0)
4041
{
41-
currSymbol = info.CurrencySymbol;
42+
currSymbol = info.CurrencySymbolTChar<TChar>();
4243

4344
// The idea here is to match the currency separators and on failure match the number separators to keep the perf of VB's IsNumeric fast.
4445
// The values of decSep are setup to use the correct relevant separator (currency in the if part and decimal in the else part).
45-
decSep = info.CurrencyDecimalSeparator;
46-
groupSep = info.CurrencyGroupSeparator;
46+
decSep = info.CurrencyDecimalSeparatorTChar<TChar>();
47+
groupSep = info.CurrencyGroupSeparatorTChar<TChar>();
4748
parsingCurrency = true;
4849
}
4950
else
5051
{
51-
decSep = info.NumberDecimalSeparator;
52-
groupSep = info.NumberGroupSeparator;
52+
decSep = info.NumberDecimalSeparatorTChar<TChar>();
53+
groupSep = info.NumberGroupSeparatorTChar<TChar>();
5354
}
5455

5556
int state = 0;
56-
char* p = str;
57-
char ch = p < strEnd ? *p : '\0';
58-
char* next;
57+
TChar* p = str;
58+
uint ch = (p < strEnd) ? TChar.CastToUInt32(*p) : '\0';
59+
TChar* next;
5960

6061
while (true)
6162
{
6263
// Eat whitespace unless we've found a sign which isn't followed by a currency symbol.
6364
// "-Kr 1231.47" is legal but "- 1231.47" is not.
64-
if (!IsWhite(ch) || (styles & NumberStyles.AllowLeadingWhite) == 0 || ((state & StateSign) != 0 && ((state & StateCurrency) == 0 && info.NumberNegativePattern != 2)))
65+
if (!IsWhite(ch) || (styles & NumberStyles.AllowLeadingWhite) == 0 || ((state & StateSign) != 0 && (state & StateCurrency) == 0 && info.NumberNegativePattern != 2))
6566
{
66-
if ((((styles & NumberStyles.AllowLeadingSign) != 0) && (state & StateSign) == 0) && ((next = MatchChars(p, strEnd, info.PositiveSign)) != null || ((next = MatchNegativeSignChars(p, strEnd, info)) != null && (number.IsNegative = true))))
67+
if (((styles & NumberStyles.AllowLeadingSign) != 0) && (state & StateSign) == 0 && ((next = MatchChars(p, strEnd, info.PositiveSignTChar<TChar>())) != null || ((next = MatchNegativeSignChars(p, strEnd, info)) != null && (number.IsNegative = true))))
6768
{
6869
state |= StateSign;
6970
p = next - 1;
@@ -73,10 +74,10 @@ private static unsafe bool TryParseNumber(scoped ref char* str, char* strEnd, Nu
7374
state |= StateSign | StateParens;
7475
number.IsNegative = true;
7576
}
76-
else if (currSymbol != null && (next = MatchChars(p, strEnd, currSymbol)) != null)
77+
else if (!currSymbol.IsEmpty && (next = MatchChars(p, strEnd, currSymbol)) != null)
7778
{
7879
state |= StateCurrency;
79-
currSymbol = null;
80+
currSymbol = ReadOnlySpan<TChar>.Empty;
8081
// We already found the currency symbol. There should not be more currency symbols. Set
8182
// currSymbol to NULL so that we won't search it again in the later code path.
8283
p = next - 1;
@@ -86,7 +87,7 @@ private static unsafe bool TryParseNumber(scoped ref char* str, char* strEnd, Nu
8687
break;
8788
}
8889
}
89-
ch = ++p < strEnd ? *p : '\0';
90+
ch = ++p < strEnd ? TChar.CastToUInt32(*p) : '\0';
9091
}
9192

9293
int digCount = 0;
@@ -104,7 +105,7 @@ private static unsafe bool TryParseNumber(scoped ref char* str, char* strEnd, Nu
104105
{
105106
if (digCount < maxDigCount)
106107
{
107-
number.Digits[digCount] = (byte)(ch);
108+
number.Digits[digCount] = (byte)ch;
108109
if ((ch != '0') || (number.Kind != NumberBufferKind.Integer))
109110
{
110111
digEnd = digCount + 1;
@@ -147,38 +148,38 @@ private static unsafe bool TryParseNumber(scoped ref char* str, char* strEnd, Nu
147148
number.Scale--;
148149
}
149150
}
150-
else if (((styles & NumberStyles.AllowDecimalPoint) != 0) && ((state & StateDecimal) == 0) && ((next = MatchChars(p, strEnd, decSep)) != null || (parsingCurrency && (state & StateCurrency) == 0) && (next = MatchChars(p, strEnd, info.NumberDecimalSeparator)) != null))
151+
else if (((styles & NumberStyles.AllowDecimalPoint) != 0) && ((state & StateDecimal) == 0) && ((next = MatchChars(p, strEnd, decSep)) != null || (parsingCurrency && (state & StateCurrency) == 0 && (next = MatchChars(p, strEnd, info.NumberDecimalSeparatorTChar<TChar>())) != null)))
151152
{
152153
state |= StateDecimal;
153154
p = next - 1;
154155
}
155-
else if (((styles & NumberStyles.AllowThousands) != 0) && ((state & StateDigits) != 0) && ((state & StateDecimal) == 0) && ((next = MatchChars(p, strEnd, groupSep)) != null || (parsingCurrency && (state & StateCurrency) == 0) && (next = MatchChars(p, strEnd, info.NumberGroupSeparator)) != null))
156+
else if (((styles & NumberStyles.AllowThousands) != 0) && ((state & StateDigits) != 0) && ((state & StateDecimal) == 0) && ((next = MatchChars(p, strEnd, groupSep)) != null || (parsingCurrency && (state & StateCurrency) == 0 && (next = MatchChars(p, strEnd, info.NumberGroupSeparatorTChar<TChar>())) != null)))
156157
{
157158
p = next - 1;
158159
}
159160
else
160161
{
161162
break;
162163
}
163-
ch = ++p < strEnd ? *p : '\0';
164+
ch = ++p < strEnd ? TChar.CastToUInt32(*p) : '\0';
164165
}
165166

166167
bool negExp = false;
167168
number.DigitsCount = digEnd;
168-
number.Digits[digEnd] = (byte)('\0');
169+
number.Digits[digEnd] = (byte)'\0';
169170
if ((state & StateDigits) != 0)
170171
{
171172
if ((ch == 'E' || ch == 'e') && ((styles & NumberStyles.AllowExponent) != 0))
172173
{
173-
char* temp = p;
174-
ch = ++p < strEnd ? *p : '\0';
175-
if ((next = MatchChars(p, strEnd, info.PositiveSign)) != null)
174+
TChar* temp = p;
175+
ch = ++p < strEnd ? TChar.CastToUInt32(*p) : '\0';
176+
if ((next = MatchChars(p, strEnd, info.PositiveSignTChar<TChar>())) != null)
176177
{
177-
ch = (p = next) < strEnd ? *p : '\0';
178+
ch = (p = next) < strEnd ? TChar.CastToUInt32(*p) : '\0';
178179
}
179180
else if ((next = MatchNegativeSignChars(p, strEnd, info)) != null)
180181
{
181-
ch = (p = next) < strEnd ? *p : '\0';
182+
ch = (p = next) < strEnd ? TChar.CastToUInt32(*p) : '\0';
182183
negExp = true;
183184
}
184185
if (IsDigit(ch))
@@ -194,15 +195,15 @@ private static unsafe bool TryParseNumber(scoped ref char* str, char* strEnd, Nu
194195
number.Scale = 0;
195196

196197
// Finish parsing the number, a FormatException could still occur later on.
197-
while (char.IsAsciiDigit(ch))
198+
while (IsDigit(ch))
198199
{
199-
ch = ++p < strEnd ? *p : '\0';
200+
ch = ++p < strEnd ? TChar.CastToUInt32(*p) : '\0';
200201
}
201202
break;
202203
}
203204

204-
exp = exp * 10 + (ch - '0');
205-
ch = ++p < strEnd ? *p : '\0';
205+
exp = (exp * 10) + (int)(ch - '0');
206+
ch = ++p < strEnd ? TChar.CastToUInt32(*p) : '\0';
206207
} while (IsDigit(ch));
207208
if (negExp)
208209
{
@@ -213,7 +214,7 @@ private static unsafe bool TryParseNumber(scoped ref char* str, char* strEnd, Nu
213214
else
214215
{
215216
p = temp;
216-
ch = p < strEnd ? *p : '\0';
217+
ch = p < strEnd ? TChar.CastToUInt32(*p) : '\0';
217218
}
218219
}
219220

@@ -226,15 +227,15 @@ private static unsafe bool TryParseNumber(scoped ref char* str, char* strEnd, Nu
226227
numberOfTrailingZeros = Math.Min(numberOfTrailingZeros, numberOfFractionalDigits);
227228
Debug.Assert(numberOfTrailingZeros >= 0);
228229
number.DigitsCount = digEnd - numberOfTrailingZeros;
229-
number.Digits[number.DigitsCount] = (byte)('\0');
230+
number.Digits[number.DigitsCount] = (byte)'\0';
230231
}
231232
}
232233

233234
while (true)
234235
{
235236
if (!IsWhite(ch) || (styles & NumberStyles.AllowTrailingWhite) == 0)
236237
{
237-
if ((styles & NumberStyles.AllowTrailingSign) != 0 && ((state & StateSign) == 0) && ((next = MatchChars(p, strEnd, info.PositiveSign)) != null || (((next = MatchNegativeSignChars(p, strEnd, info)) != null) && (number.IsNegative = true))))
238+
if ((styles & NumberStyles.AllowTrailingSign) != 0 && ((state & StateSign) == 0) && ((next = MatchChars(p, strEnd, info.PositiveSignTChar<TChar>())) != null || (((next = MatchNegativeSignChars(p, strEnd, info)) != null) && (number.IsNegative = true))))
238239
{
239240
state |= StateSign;
240241
p = next - 1;
@@ -243,17 +244,17 @@ private static unsafe bool TryParseNumber(scoped ref char* str, char* strEnd, Nu
243244
{
244245
state &= ~StateParens;
245246
}
246-
else if (currSymbol != null && (next = MatchChars(p, strEnd, currSymbol)) != null)
247+
else if (!currSymbol.IsEmpty && (next = MatchChars(p, strEnd, currSymbol)) != null)
247248
{
248-
currSymbol = null;
249+
currSymbol = ReadOnlySpan<TChar>.Empty;
249250
p = next - 1;
250251
}
251252
else
252253
{
253254
break;
254255
}
255256
}
256-
ch = ++p < strEnd ? *p : '\0';
257+
ch = ++p < strEnd ? TChar.CastToUInt32(*p) : '\0';
257258
}
258259
if ((state & StateParens) == 0)
259260
{
@@ -276,12 +277,15 @@ private static unsafe bool TryParseNumber(scoped ref char* str, char* strEnd, Nu
276277
return false;
277278
}
278279

279-
internal static unsafe bool TryStringToNumber(ReadOnlySpan<char> value, NumberStyles styles, ref NumberBuffer number, NumberFormatInfo info)
280+
internal static unsafe bool TryStringToNumber<TChar>(ReadOnlySpan<TChar> value, NumberStyles styles, ref NumberBuffer number, NumberFormatInfo info)
281+
where TChar : unmanaged, IUtfChar<TChar>
280282
{
281283
Debug.Assert(info != null);
282-
fixed (char* stringPointer = &MemoryMarshal.GetReference(value))
284+
285+
fixed (TChar* stringPointer = &MemoryMarshal.GetReference(value))
283286
{
284-
char* p = stringPointer;
287+
TChar* p = stringPointer;
288+
285289
if (!TryParseNumber(ref p, p + value.Length, styles, ref number, info)
286290
|| ((int)(p - stringPointer) < value.Length && !TrailingZeros(value, (int)(p - stringPointer))))
287291
{
@@ -295,9 +299,12 @@ internal static unsafe bool TryStringToNumber(ReadOnlySpan<char> value, NumberSt
295299
}
296300

297301
[MethodImpl(MethodImplOptions.NoInlining)] // rare slow path that shouldn't impact perf of the main use case
298-
private static bool TrailingZeros(ReadOnlySpan<char> value, int index) =>
302+
private static bool TrailingZeros<TChar>(ReadOnlySpan<TChar> value, int index)
303+
where TChar : unmanaged, IUtfChar<TChar>
304+
{
299305
// For compatibility, we need to allow trailing zeros at the end of a number string
300-
value.Slice(index).IndexOfAnyExcept('\0') < 0;
306+
return !value.Slice(index).ContainsAnyExcept(TChar.CastFrom('\0'));
307+
}
301308

302309
private static bool IsWhite(uint ch) => (ch == 0x20) || ((ch - 0x09) <= (0x0D - 0x09));
303310

@@ -310,69 +317,58 @@ internal enum ParsingStatus
310317
Overflow
311318
}
312319

313-
private static bool IsSpaceReplacingChar(char c) => c == '\u00a0' || c == '\u202f';
320+
private static bool IsSpaceReplacingChar(uint c) => (c == '\u00a0') || (c == '\u202f');
314321

315322
[MethodImpl(MethodImplOptions.AggressiveInlining)]
316-
private static unsafe char* MatchNegativeSignChars(char* p, char* pEnd, NumberFormatInfo info)
323+
private static unsafe TChar* MatchNegativeSignChars<TChar>(TChar* p, TChar* pEnd, NumberFormatInfo info)
324+
where TChar : unmanaged, IUtfChar<TChar>
317325
{
318-
char* ret = MatchChars(p, pEnd, info.NegativeSign);
319-
if (ret == null && GetAllowHyphenDuringParsing(info) && p < pEnd && *p == '-')
326+
TChar* ret = MatchChars(p, pEnd, info.NegativeSignTChar<TChar>());
327+
328+
if ((ret is null) && info.AllowHyphenDuringParsing() && (p < pEnd) && (TChar.CastToUInt32(*p) == '-'))
320329
{
321330
ret = p + 1;
322331
}
323332

324333
return ret;
325334
}
326335

327-
private static unsafe char* MatchChars(char* p, char* pEnd, string value)
336+
private static unsafe TChar* MatchChars<TChar>(TChar* p, TChar* pEnd, ReadOnlySpan<TChar> value)
337+
where TChar : unmanaged, IUtfChar<TChar>
328338
{
329-
Debug.Assert(p != null && pEnd != null && p <= pEnd && value != null);
330-
fixed (char* stringPointer = value)
339+
Debug.Assert((p != null) && (pEnd != null) && (p <= pEnd) && (value != null));
340+
341+
fixed (TChar* stringPointer = &MemoryMarshal.GetReference(value))
331342
{
332-
char* str = stringPointer;
333-
if (*str != '\0')
343+
TChar* str = stringPointer;
344+
345+
if (TChar.CastToUInt32(*str) != '\0')
334346
{
335347
// We only hurt the failure case
336348
// This fix is for French or Kazakh cultures. Since a user cannot type 0xA0 or 0x202F as a
337349
// space character we use 0x20 space character instead to mean the same.
338350
while (true)
339351
{
340-
char cp = p < pEnd ? *p : '\0';
341-
if (cp != *str && !(IsSpaceReplacingChar(*str) && cp == '\u0020'))
352+
uint cp = (p < pEnd) ? TChar.CastToUInt32(*p) : '\0';
353+
uint val = TChar.CastToUInt32(*str);
354+
355+
if ((cp != val) && !(IsSpaceReplacingChar(val) && (cp == '\u0020')))
342356
{
343357
break;
344358
}
359+
345360
p++;
346361
str++;
347-
if (*str == '\0')
362+
363+
if (TChar.CastToUInt32(*str) == '\0')
364+
{
348365
return p;
366+
}
349367
}
350368
}
351369
}
352370

353371
return null;
354372
}
355-
356-
// Helper for internal property
357-
#if SYSTEM_PRIVATE_CORELIB
358-
private static bool GetAllowHyphenDuringParsing(NumberFormatInfo info) => info.AllowHyphenDuringParsing;
359-
#else
360-
private static bool GetAllowHyphenDuringParsing(NumberFormatInfo info)
361-
{
362-
string negativeSign = info.NegativeSign;
363-
return negativeSign.Length == 1 &&
364-
negativeSign[0] switch
365-
{
366-
'\u2012' or // Figure Dash
367-
'\u207B' or // Superscript Minus
368-
'\u208B' or // Subscript Minus
369-
'\u2212' or // Minus Sign
370-
'\u2796' or // Heavy Minus Sign
371-
'\uFE63' or // Small Hyphen-Minus
372-
'\uFF0D' => true, // Fullwidth Hyphen-Minus
373-
_ => false
374-
};
375-
}
376-
#endif
377373
}
378374
}

src/libraries/System.Private.CoreLib/src/System/Globalization/NumberFormatInfo.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ private static void VerifyDigitSubstitution(DigitShapes digitSub, string propert
157157
}
158158

159159
internal bool HasInvariantNumberSigns => _hasInvariantNumberSigns;
160-
internal bool AllowHyphenDuringParsing => _allowHyphenDuringParsing;
160+
internal bool AllowHyphenDuringParsing() => _allowHyphenDuringParsing;
161161

162162
private void InitializeInvariantAndNegativeSignFlags()
163163
{

0 commit comments

Comments
 (0)