diff --git a/eng/Versions.props b/eng/Versions.props
index 6938c1b5e09bfb..919ce843b1f0ab 100644
--- a/eng/Versions.props
+++ b/eng/Versions.props
@@ -61,6 +61,7 @@
5.0.0-beta.19608.5
5.0.0-beta.19608.5
5.0.0-beta.19608.5
+ 5.0.0-beta.19610.1
5.0.0-beta.19608.5
5.0.0-beta.19608.5
diff --git a/src/coreclr/src/System.Private.CoreLib/Tools/GenUnicodeProp/CategoryCasingInfo.cs b/src/coreclr/src/System.Private.CoreLib/Tools/GenUnicodeProp/CategoryCasingInfo.cs
new file mode 100644
index 00000000000000..f889fad3d1d1ea
--- /dev/null
+++ b/src/coreclr/src/System.Private.CoreLib/Tools/GenUnicodeProp/CategoryCasingInfo.cs
@@ -0,0 +1,119 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Buffers.Binary;
+using System.Globalization;
+using System.Text;
+using System.Text.Unicode;
+
+namespace GenUnicodeProp
+{
+ /// <summary>
+ /// Contains information about a code point's Unicode category,
+ /// bidi class, and simple case mapping / folding.
+ /// </summary>
+ internal sealed class CategoryCasingInfo : IEquatable<CategoryCasingInfo>
+ {
+     // Backing store for everything captured from the code point. A value tuple
+     // is used so that Equals / GetHashCode below can delegate to the tuple's own
+     // element-wise implementations.
+     private readonly (UnicodeCategory generalCategory,
+                       StrongBidiCategory strongBidiCategory,
+                       ushort offsetToSimpleUppercase,
+                       ushort offsetToSimpleLowercase,
+                       ushort offsetToSimpleTitlecase,
+                       ushort offsetToSimpleCasefold,
+                       bool isWhitespace) _data;
+
+     /// <summary>
+     /// Captures the general category, the reduced ("strong") bidi class, the
+     /// whitespace flag and - when <c>Program.IncludeCasingData</c> is set - the
+     /// simple case mapping offsets of <paramref name="codePoint"/>.
+     /// </summary>
+     /// <param name="codePoint">The code point whose properties are recorded.</param>
+     public CategoryCasingInfo(CodePoint codePoint)
+     {
+         _data.generalCategory = codePoint.GeneralCategory;
+
+         // Collapse the full bidi class into the three buckets this tool stores.
+         switch (codePoint.BidiClass)
+         {
+             case BidiClass.Left_To_Right:
+                 _data.strongBidiCategory = StrongBidiCategory.StrongLeftToRight;
+                 break;
+
+             case BidiClass.Right_To_Left:
+             case BidiClass.Arabic_Letter:
+                 _data.strongBidiCategory = StrongBidiCategory.StrongRightToLeft;
+                 break;
+
+             default:
+                 _data.strongBidiCategory = StrongBidiCategory.Other;
+                 break;
+         }
+
+         if (Program.IncludeCasingData)
+         {
+             // Each mapping is stored as a 16-bit delta from the code point itself.
+             // NOTE(review): the narrowing cast keeps only the low 16 bits of the
+             // difference (assuming the default unchecked context) - confirm that
+             // the consumer of the generated tables expects wrap-around offsets.
+             _data.offsetToSimpleUppercase = (ushort)(codePoint.SimpleUppercaseMapping - codePoint.Value);
+             _data.offsetToSimpleLowercase = (ushort)(codePoint.SimpleLowercaseMapping - codePoint.Value);
+             _data.offsetToSimpleTitlecase = (ushort)(codePoint.SimpleTitlecaseMapping - codePoint.Value);
+             _data.offsetToSimpleCasefold = (ushort)(codePoint.SimpleCaseFoldMapping - codePoint.Value);
+         }
+         else
+         {
+             // Casing data was not requested: all offsets stay zero, so casing never
+             // makes two instances compare as different.
+             _data.offsetToSimpleUppercase = default;
+             _data.offsetToSimpleLowercase = default;
+             _data.offsetToSimpleTitlecase = default;
+             _data.offsetToSimpleCasefold = default;
+         }
+
+         _data.isWhitespace = codePoint.Flags.HasFlag(CodePointFlags.White_Space);
+     }
+
+     /// <summary>Returns true when <paramref name="obj"/> is a <see cref="CategoryCasingInfo"/> with identical data.</summary>
+     public override bool Equals(object obj) => Equals(obj as CategoryCasingInfo);
+
+     /// <summary>Returns true when <paramref name="other"/> is non-null and every captured field matches.</summary>
+     public bool Equals(CategoryCasingInfo other)
+     {
+         return !(other is null) && this._data.Equals(other._data);
+     }
+
+     /// <summary>Hash code derived from the same tuple that <see cref="Equals(CategoryCasingInfo)"/> compares.</summary>
+     public override int GetHashCode()
+     {
+         return _data.GetHashCode();
+     }
+
+     /// <summary>
+     /// Packs the whitespace flag, strong bidi category and general category of
+     /// <paramref name="input"/> into a single byte.
+     /// </summary>
+     /// <returns>A one-element array holding the packed byte.</returns>
+     /// <exception cref="OverflowException">The packed value does not fit in 8 bits.</exception>
+     public static byte[] ToCategoryBytes(CategoryCasingInfo input)
+     {
+         // We're storing 3 pieces of information in 8 bits:
+         // bit 7 (high bit) = isWhitespace?
+         // bits 6..5 = restricted bidi class
+         // bits 4..0 = Unicode category
+
+         int combinedValue = Convert.ToInt32(input._data.isWhitespace) << 7;
+         combinedValue += (int)input._data.strongBidiCategory << 5;
+         combinedValue += (int)input._data.generalCategory;
+
+         return new byte[] { checked((byte)combinedValue) };
+     }
+
+     /// <summary>Returns the simple uppercase offset of <paramref name="input"/> as 2 little-endian bytes.</summary>
+     public static byte[] ToUpperBytes(CategoryCasingInfo input)
+     {
+         return GetUInt16LittleEndianBytes(input._data.offsetToSimpleUppercase);
+     }
+
+     /// <summary>Returns the simple lowercase offset of <paramref name="input"/> as 2 little-endian bytes.</summary>
+     public static byte[] ToLowerBytes(CategoryCasingInfo input)
+     {
+         return GetUInt16LittleEndianBytes(input._data.offsetToSimpleLowercase);
+     }
+
+     /// <summary>Returns the simple titlecase offset of <paramref name="input"/> as 2 little-endian bytes.</summary>
+     public static byte[] ToTitleBytes(CategoryCasingInfo input)
+     {
+         return GetUInt16LittleEndianBytes(input._data.offsetToSimpleTitlecase);
+     }
+
+     /// <summary>Returns the simple case fold offset of <paramref name="input"/> as 2 little-endian bytes.</summary>
+     public static byte[] ToCaseFoldBytes(CategoryCasingInfo input)
+     {
+         return GetUInt16LittleEndianBytes(input._data.offsetToSimpleCasefold);
+     }
+
+     // Shared serializer for the four offset accessors above: a fresh 2-byte
+     // array holding the value in little-endian order.
+     private static byte[] GetUInt16LittleEndianBytes(ushort value)
+     {
+         byte[] bytes = new byte[sizeof(ushort)];
+         BinaryPrimitives.WriteUInt16LittleEndian(bytes, value);
+         return bytes;
+     }
+ }
+}
diff --git a/src/coreclr/src/System.Private.CoreLib/Tools/GenUnicodeProp/DataTable.cs b/src/coreclr/src/System.Private.CoreLib/Tools/GenUnicodeProp/DataTable.cs
index d2bc25364e7555..c49c3660defc66 100644
--- a/src/coreclr/src/System.Private.CoreLib/Tools/GenUnicodeProp/DataTable.cs
+++ b/src/coreclr/src/System.Private.CoreLib/Tools/GenUnicodeProp/DataTable.cs
@@ -170,30 +170,4 @@ public byte[][] GetBytes()
return new[] { Level1Index.ToArray(), level2.ToArray(), Level3Data.ToArray() };
}
}
-
- internal sealed class FlatDataTable
- {
- // If a codepoint does not have data, this specifies the default value.
- private readonly string DefaultValue;
- private readonly Func GetValueBytesCallback;
-
- // This contains the data mapping between codepoints and values.
- private readonly SortedDictionary RawData = new SortedDictionary();
-
- public FlatDataTable(string defaultValue, Func getValueBytesCallback)
- {
- DefaultValue = defaultValue;
- GetValueBytesCallback = getValueBytesCallback;
- }
-
- public void AddData(uint codepoint, string value) => RawData[codepoint] = value;
-
- public byte[] GetBytesFlat()
- {
- var str = new List();
- foreach (var v in RawData.Values)
- str.AddRange(GetValueBytesCallback(v ?? DefaultValue));
- return str.ToArray();
- }
- }
}
diff --git a/src/coreclr/src/System.Private.CoreLib/Tools/GenUnicodeProp/GenUnicodeProp.csproj b/src/coreclr/src/System.Private.CoreLib/Tools/GenUnicodeProp/GenUnicodeProp.csproj
index b4c010bfc67b6d..68c616ea14ab7b 100644
--- a/src/coreclr/src/System.Private.CoreLib/Tools/GenUnicodeProp/GenUnicodeProp.csproj
+++ b/src/coreclr/src/System.Private.CoreLib/Tools/GenUnicodeProp/GenUnicodeProp.csproj
@@ -2,7 +2,51 @@
Exe
- $(NetCoreAppCurrent)
+ netcoreapp3.0
+ 12.1
+
+
+
+
+
+
+
+ UnicodeData\CaseFolding.txt
+ CaseFolding.txt
+
+
+ UnicodeData\PropList.txt
+ PropList.txt
+
+
+ UnicodeData\UnicodeData.txt
+ UnicodeData.txt
+
+
+ UnicodeData\GraphemeBreakProperty.txt
+ GraphemeBreakProperty.txt
+
+
+ UnicodeData\DerivedBidiClass.txt
+ DerivedBidiClass.txt
+
+
+ UnicodeData\DerivedName.txt
+ DerivedName.txt
+
+
+ UnicodeData\emoji-data.txt
+ emoji-data.txt
+
+
+
+
+
+
+
+
+
+
diff --git a/src/coreclr/src/System.Private.CoreLib/Tools/GenUnicodeProp/NumericGraphemeInfo.cs b/src/coreclr/src/System.Private.CoreLib/Tools/GenUnicodeProp/NumericGraphemeInfo.cs
new file mode 100644
index 00000000000000..9f0e4673914fed
--- /dev/null
+++ b/src/coreclr/src/System.Private.CoreLib/Tools/GenUnicodeProp/NumericGraphemeInfo.cs
@@ -0,0 +1,69 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Buffers.Binary;
+using System.Runtime.CompilerServices;
+using System.Text.Unicode;
+
+namespace GenUnicodeProp
+{
+ /// <summary>
+ /// Contains information about a code point's numeric representation
+ /// and the manner in which it's treated for grapheme cluster segmentation
+ /// purposes.
+ /// </summary>
+ internal sealed class NumericGraphemeInfo : IEquatable<NumericGraphemeInfo>
+ {
+     // Backing store for everything captured from the code point. A value tuple
+     // is used so that Equals / GetHashCode below can delegate to the tuple's own
+     // element-wise implementations.
+     public readonly (int decimalDigitValue,
+                      int digitValue,
+                      double numericValue,
+                      GraphemeClusterBreakProperty graphemeClusterBreakProperty) _data;
+
+     /// <summary>
+     /// Captures the decimal digit value, digit value, numeric value and grapheme
+     /// cluster break property of <paramref name="codePoint"/>.
+     /// </summary>
+     /// <param name="codePoint">The code point whose properties are recorded.</param>
+     public NumericGraphemeInfo(CodePoint codePoint)
+     {
+         _data.decimalDigitValue = codePoint.DecimalDigitValue;
+         _data.digitValue = codePoint.DigitValue;
+         _data.numericValue = codePoint.NumericValue;
+         _data.graphemeClusterBreakProperty = codePoint.GraphemeClusterBreakProperty;
+     }
+
+     /// <summary>Returns true when <paramref name="obj"/> is a <see cref="NumericGraphemeInfo"/> with identical data.</summary>
+     public override bool Equals(object obj) => Equals(obj as NumericGraphemeInfo);
+
+     /// <summary>Returns true when <paramref name="other"/> is non-null and every captured field matches.</summary>
+     public bool Equals(NumericGraphemeInfo other)
+     {
+         return !(other is null) && this._data.Equals(other._data);
+     }
+
+     /// <summary>Hash code derived from the same tuple that <see cref="Equals(NumericGraphemeInfo)"/> compares.</summary>
+     public override int GetHashCode()
+     {
+         return _data.GetHashCode();
+     }
+
+     /// <summary>
+     /// Packs the decimal digit value and digit value of <paramref name="input"/>
+     /// into a single byte, one nibble each.
+     /// </summary>
+     /// <returns>A one-element array holding the packed byte.</returns>
+     /// <exception cref="InvalidOperationException">
+     /// Either adjusted value falls outside 0x0 .. 0xF and would corrupt the other nibble.
+     /// </exception>
+     public static byte[] ToDigitBytes(NumericGraphemeInfo input)
+     {
+         // Bits 4 .. 7 contain (decimalDigitValue + 1).
+         // Bits 0 .. 3 contain (digitValue + 1).
+         // This means that each nibble will have a value 0x0 .. 0xa, inclusive.
+
+         int adjustedDecimalDigitValue = input._data.decimalDigitValue + 1;
+         int adjustedDigitValue = input._data.digitValue + 1;
+
+         // Refuse to emit a byte if either value spills out of its nibble; without
+         // this check the cast below would silently produce a wrong table entry.
+         if (((adjustedDecimalDigitValue | adjustedDigitValue) & ~0xF) != 0)
+         {
+             throw new InvalidOperationException("Digit values do not fit in a 4-bit nibble.");
+         }
+
+         return new byte[] { (byte)((adjustedDecimalDigitValue << 4) | adjustedDigitValue) };
+     }
+
+     /// <summary>
+     /// Returns the IEEE 754 bit pattern of the numeric value of
+     /// <paramref name="input"/> as 8 little-endian bytes.
+     /// </summary>
+     public static byte[] ToNumericBytes(NumericGraphemeInfo input)
+     {
+         byte[] bytes = new byte[sizeof(double)];
+         double value = input._data.numericValue;
+
+         // Reinterpret the double as a 64-bit integer, then write it with an
+         // explicit byte order so the output is the same on every host.
+         BinaryPrimitives.WriteInt64LittleEndian(bytes, BitConverter.DoubleToInt64Bits(value));
+         return bytes;
+     }
+
+     /// <summary>Returns the grapheme cluster break property of <paramref name="input"/> as a single byte.</summary>
+     /// <exception cref="OverflowException">The enum value does not fit in 8 bits.</exception>
+     public static byte[] ToGraphemeBytes(NumericGraphemeInfo input)
+     {
+         return new byte[] { checked((byte)input._data.graphemeClusterBreakProperty) };
+     }
+ }
+}
diff --git a/src/coreclr/src/System.Private.CoreLib/Tools/GenUnicodeProp/Program.cs b/src/coreclr/src/System.Private.CoreLib/Tools/GenUnicodeProp/Program.cs
index f731190f856b31..3c1156e7e35e58 100644
--- a/src/coreclr/src/System.Private.CoreLib/Tools/GenUnicodeProp/Program.cs
+++ b/src/coreclr/src/System.Private.CoreLib/Tools/GenUnicodeProp/Program.cs
@@ -4,358 +4,165 @@
using System;
using System.Collections.Generic;
-using System.Globalization;
using System.IO;
+using System.Linq;
+using System.Text.Unicode;
namespace GenUnicodeProp
{
internal static class Program
{
+ internal static bool Verbose = false;
+ internal static bool IncludeCasingData = false;
+
+ private const string SOURCE_NAME = "CharUnicodeInfoData.cs";
+
private static void Main(string[] args)
{
- Verbose = false;
- // TODO: parse args
-
- var defaultCategoryValues = "Cn,L";
-
- // Create a 12:4:4 table for Unicode category
- // "Cn", Not assigned. The value is 1 byte to indicate Unicode category
- // Make sure to put the default value into the slot 0 in $categoriesValueTable
- var categoriesIndexTable = new DataTable();
- // Create a 12:4:4 table for decimal digit value/digit value/numeric value
- var numericIndexTable = new DataTable();
- // Create a flat table for Unicode category and BiDi category
- var categoriesValueTable = new FlatDataTable(defaultCategoryValues, GetCategoriesValueBytes);
- // Create a flat table.
- // GetNumericValueBytes() is the callback used to generate the bytes of each item.
- var numericValueTable = new FlatDataTable("-1", GetNumericValueBytes);
- // Create a flat table for digit values
- // GetDigitValueBytes() is the callback used to generate the bytes of each item.
- var digitValueTable = new FlatDataTable("255,255", GetDigitValueBytes);
-
- // Add a default item into the category value table. This will be the item 0 in the category value table.
- GetCategoryValueItem(categoriesValueTable, defaultCategoryValues);
- NumberValues.Add("-1,255,255", 0);
- numericValueTable.AddData(0, "-1");
- digitValueTable.AddData(0, "255,255");
-
- ReadSourceFile("UnicodeData.txt", categoriesIndexTable, categoriesValueTable, numericIndexTable, numericValueTable, digitValueTable);
-
- categoriesIndexTable.GenerateTable(nameof(categoriesIndexTable), 5, 4);
- //categoriesIndexTable.CalculateTableVariants();
- numericIndexTable.GenerateTable(nameof(numericIndexTable), 4, 4, cutOff: true);
- //numericIndexTable.CalculateTableVariants(cutOff: true);
-
- // generate the data C# source
+ Verbose = args.Contains("-Verbose", StringComparer.OrdinalIgnoreCase); // switch names are matched case-insensitively
+ IncludeCasingData = args.Contains("-IncludeCasingData", StringComparer.OrdinalIgnoreCase); // read by CategoryCasingInfo's constructor
+
+ // First, read the data files and build up a list of all
+ // assigned code points.
+
+ Console.WriteLine("Reading Unicode data files...");
+
+ _ = UnicodeData.GetData(0); // processes files; the returned value is deliberately discarded
+
+ Console.WriteLine("Finished.");
+ Console.WriteLine();
+
+ Console.WriteLine("Initializing maps...");
+ Dictionary categoryCasingMap = new Dictionary();
+ Dictionary numericGraphemeMap = new Dictionary();
+
+ // Next, iterate through all code points, populating
+ // the category casing & numeric grapheme maps. Also put the
+ // data into the DataTable structure, which will compute
+ // the tiered offset tables.
+
+ DataTable categoryCasingTable = new DataTable();
+ DataTable numericGraphemeTable = new DataTable();
+
+ for (int i = 0; i <= 0x10_FFFF; i++) // every code point value U+0000..U+10FFFF
+ {
+ CodePoint thisCodePoint = UnicodeData.GetData(i);
+
+ CategoryCasingInfo categoryCasingInfo = new CategoryCasingInfo(thisCodePoint);
+ if (!categoryCasingMap.TryGetValue(categoryCasingInfo, out byte cciValue))
+ {
+ cciValue = (byte)categoryCasingMap.Count; // next unused index; the byte cast can wrap, see the count check after the loop
+ categoryCasingMap[categoryCasingInfo] = cciValue;
+ }
+ categoryCasingTable.AddData((uint)i, cciValue);
+
+ NumericGraphemeInfo numericGraphemeInfo = new NumericGraphemeInfo(thisCodePoint);
+ if (!numericGraphemeMap.TryGetValue(numericGraphemeInfo, out byte ngiValue))
+ {
+ ngiValue = (byte)numericGraphemeMap.Count; // next unused index; the byte cast can wrap, see the count check after the loop
+ numericGraphemeMap[numericGraphemeInfo] = ngiValue;
+ }
+ numericGraphemeTable.AddData((uint)i, ngiValue);
+ }
+
+ // Did anything overflow? Indices are stored as bytes, so at most 256 distinct entries fit in each map.
+
+ Console.WriteLine($"CategoryCasingMap contains {categoryCasingMap.Count} entries.");
+ if (categoryCasingMap.Count > 256)
+ {
+ throw new Exception("CategoryCasingMap exceeds max count of 256 entries!");
+ }
+
+ Console.WriteLine($"NumericGraphemeMap contains {numericGraphemeMap.Count} entries.");
+ if (numericGraphemeMap.Count > 256)
+ {
+ throw new Exception("NumericGraphemeMap exceeds max count of 256 entries!");
+ }
+
+ Console.WriteLine();
+
+ // Choose default ratios for the data tables we'll be generating.
+
+ TableLevels categoryCasingTableLevelBits = new TableLevels(5, 4); // arguments are (level2Bits, level3Bits)
+ TableLevels numericGraphemeTableLevelBits = new TableLevels(5, 4); // arguments are (level2Bits, level3Bits)
+
+ // Now generate the tables.
+
+ categoryCasingTable.GenerateTable("CategoryCasingTable", categoryCasingTableLevelBits.Level2Bits, categoryCasingTableLevelBits.Level3Bits);
+ numericGraphemeTable.GenerateTable("NumericGraphemeTable", numericGraphemeTableLevelBits.Level2Bits, numericGraphemeTableLevelBits.Level3Bits);
+
+ // If you want to see if a different ratio would have better compression
+ // statistics, uncomment the lines below and re-run the application.
+ // categoryCasingTable.CalculateTableVariants();
+ // numericGraphemeTable.CalculateTableVariants();
+
+ // Now generate the C# source file.
+
using (StreamWriter file = File.CreateText(SOURCE_NAME))
{
file.Write("// Licensed to the .NET Foundation under one or more agreements.\n");
file.Write("// The .NET Foundation licenses this file to you under the MIT license.\n");
file.Write("// See the LICENSE file in the project root for more information.\n\n");
+ file.Write("using System.Diagnostics;\n\n");
+
file.Write("namespace System.Globalization\n");
file.Write("{\n");
file.Write(" public static partial class CharUnicodeInfo\n {\n");
file.Write(" // THE FOLLOWING DATA IS AUTO GENERATED BY GenUnicodeProp program UNDER THE TOOLS FOLDER\n");
- file.Write(" // PLEASE DON'T MODIFY BY HAND\n\n\n");
-
- file.Write(" // 11:5:4 index table of the Unicode category data.");
- PrintSourceIndexArray("CategoryLevel1Index", categoriesIndexTable, file);
-
- PrintValueArray("CategoriesValue", categoriesValueTable, file);
+ file.Write(" // PLEASE DON'T MODIFY BY HAND\n");
- file.Write("\n // 12:4:4 index table of the Unicode numeric data.");
- PrintSourceIndexArray("NumericLevel1Index", numericIndexTable, file);
+ PrintAssertTableLevelsBitCountRoutine("CategoryCasing", file, categoryCasingTableLevelBits);
- file.Write("\n // Every item contains the value for numeric value.");
- PrintValueArray("NumericValues", numericValueTable, file);
+ file.Write($"\n // {categoryCasingTableLevelBits} index table of the Unicode category & casing data.");
+ PrintSourceIndexArray("CategoryCasingLevel1Index", categoryCasingTable, file);
- PrintValueArray("DigitValues", digitValueTable, file);
+ file.Write("\n // Contains Unicode category & bidi class information");
+ PrintValueArray("CategoriesValues", categoryCasingMap, CategoryCasingInfo.ToCategoryBytes, file);
- file.Write("\n }\n}");
- }
- }
+ if (IncludeCasingData)
+ {
+ // Only write out the casing data if we have been asked to do so.
- private static bool Verbose;
+ file.Write("\n // Contains simple culture-invariant uppercase mappings");
+ PrintValueArray("UppercaseValues", categoryCasingMap, CategoryCasingInfo.ToUpperBytes, file);
- private const string SOURCE_NAME = "CharUnicodeInfoData.cs";
+ file.Write("\n // Contains simple culture-invariant lowercase mappings");
+ PrintValueArray("LowercaseValues", categoryCasingMap, CategoryCasingInfo.ToLowerBytes, file);
- private static readonly Dictionary UnicodeCategoryMap = new Dictionary
- {
- ["Lu"] = (byte)UnicodeCategory.UppercaseLetter,
- ["Ll"] = (byte)UnicodeCategory.LowercaseLetter,
- ["Lt"] = (byte)UnicodeCategory.TitlecaseLetter,
- ["Lm"] = (byte)UnicodeCategory.ModifierLetter,
- ["Lo"] = (byte)UnicodeCategory.OtherLetter,
- ["Mn"] = (byte)UnicodeCategory.NonSpacingMark,
- ["Mc"] = (byte)UnicodeCategory.SpacingCombiningMark,
- ["Me"] = (byte)UnicodeCategory.EnclosingMark,
- ["Nd"] = (byte)UnicodeCategory.DecimalDigitNumber,
- ["Nl"] = (byte)UnicodeCategory.LetterNumber,
- ["No"] = (byte)UnicodeCategory.OtherNumber,
- ["Zs"] = (byte)UnicodeCategory.SpaceSeparator,
- ["Zl"] = (byte)UnicodeCategory.LineSeparator,
- ["Zp"] = (byte)UnicodeCategory.ParagraphSeparator,
- ["Cc"] = (byte)UnicodeCategory.Control,
- ["Cf"] = (byte)UnicodeCategory.Format,
- ["Cs"] = (byte)UnicodeCategory.Surrogate,
- ["Co"] = (byte)UnicodeCategory.PrivateUse,
- ["Pc"] = (byte)UnicodeCategory.ConnectorPunctuation,
- ["Pd"] = (byte)UnicodeCategory.DashPunctuation,
- ["Ps"] = (byte)UnicodeCategory.OpenPunctuation,
- ["Pe"] = (byte)UnicodeCategory.ClosePunctuation,
- ["Pi"] = (byte)UnicodeCategory.InitialQuotePunctuation,
- ["Pf"] = (byte)UnicodeCategory.FinalQuotePunctuation,
- ["Po"] = (byte)UnicodeCategory.OtherPunctuation,
- ["Sm"] = (byte)UnicodeCategory.MathSymbol,
- ["Sc"] = (byte)UnicodeCategory.CurrencySymbol,
- ["Sk"] = (byte)UnicodeCategory.ModifierSymbol,
- ["So"] = (byte)UnicodeCategory.OtherSymbol,
- ["Cn"] = (byte)UnicodeCategory.OtherNotAssigned,
- };
-
- ///
- /// Map BiDi symbols in UnicodeData.txt to their numeric values stored in the output table.
- ///
- private static readonly Dictionary BiDiCategory = new Dictionary
- {
- ["L"] = 0, // Left-to-Right
- ["LRE"] = 1, // Left-to-Right Embedding
- ["LRO"] = 2, // Left-to-Right Override
- ["R"] = 3, // Right-to-Left
- ["AL"] = 4, // Right-to-Left Arabic
- ["RLE"] = 5, // Right-to-Left Embedding
- ["RLO"] = 6, // Right-to-Left Override
- ["PDF"] = 7, // Pop Directional Format
- ["EN"] = 8, // European Number
- ["ES"] = 9, // European Number Separator
- ["ET"] = 10, // European Number Terminator
- ["AN"] = 11, // Arabic Number
- ["CS"] = 12, // Common Number Separator
- ["NSM"] = 13, // Non-Spacing Mark
- ["BN"] = 14, // Boundary Neutral
- ["B"] = 15, // Paragraph Separator
- ["S"] = 16, // Segment Separator
- ["WS"] = 17, // Whitespace
- ["ON"] = 18, // Other Neutrals
- ["LRI"] = 19, // LeftToRightIsolate
- ["RLI"] = 20, // RightToLeftIsolate
- ["FSI"] = 21, // FirstStrongIsolate
- ["PDI"] = 22, // PopDirectionIsolate
- };
-
- // Store the current combinations of categories (Unicode category, BiDi category)
- private static readonly Dictionary CategoryValues = new Dictionary();
-
- private static readonly Dictionary NumberValues = new Dictionary();
-
- ///
- /// Check if we need to add a new item in the categoriesValueTable. If yes,
- /// add one item and return the new item number. Otherwise, return the existing
- /// item number.
- ///
- ///
- /// The combination of Unicode category and BiDi category.
- /// They should use the original form in UnicodeData.txt (such as "Cn" for not assigned and "L" for Left-To-Right")
- /// and are separated by a comma.
- /// The item number in the categoriesValueTable
- private static byte GetCategoryValueItem(FlatDataTable categoriesValueTable, string allCategoryValues)
- {
- if (!CategoryValues.TryGetValue(allCategoryValues, out var categoryItem))
- {
- // This combination of Unicode category and BiDi category has not shown up before.
- if (CategoryValues.Count >= 255)
- throw new InvalidOperationException("The possible number of values exceeds 255.");
-
- // Get the current element count of the hash table and update the category item
- categoryItem = (byte)CategoryValues.Count;
- CategoryValues.Add(allCategoryValues, categoryItem);
- // Add the category values.
- categoriesValueTable.AddData(categoryItem, allCategoryValues);
- }
- return categoryItem;
- }
+ file.Write("\n // Contains simple culture-invariant titlecase mappings");
+ PrintValueArray("TitlecaseValues", categoryCasingMap, CategoryCasingInfo.ToTitleBytes, file);
- ///
- /// Read UnicodeData.txt and call DataTable.AddData() to add values for codepoints.
- ///
- private static void ReadSourceFile(string sourceFileName, DataTable categoriesIndexTable, FlatDataTable categoriesValueTable, DataTable numericIndexTable, FlatDataTable numericValueTable, FlatDataTable digitValueTable)
- {
- var lineCount = 0; // The line count
- var codePointCount = 0; // The count of the total characters in the file.
-
- Console.Write($"Read {sourceFileName}");
-
- // Field Name in UnicodeData.txt
- // 0 Code value
- // 1 Character name
- // 2 General Category
- //
- // 3 Canonical Combining Classes
- // 4 Bidirectional Category
- // 5 Character Decomposition Mapping
- // 6 Decimal digit value
- // 7 Digit value
- // 8 Numeric value
- // 9 Mirrored
- // 10 Unicode 1.0 Name
- // 11 10646 comment field
- // 12 Uppercase Mapping
- // 13 Lowercase Mapping
- // 14 Titlecase Mapping
-
- using (StreamReader sourceFile = File.OpenText(sourceFileName))
- while (sourceFile.ReadLine() is string line)
- {
- var fields = line.Split(';');
- var code = uint.Parse(fields[0], NumberStyles.HexNumber);
- var comments = fields[1];
- var category = fields[2];
-
- var bidiCategory = fields[4];
- var decimalDigitValue = fields[6];
- var digitValue = fields[7];
- var numericValue = fields[8];
-
- var allCategoryValues = category + "," + bidiCategory;
- var allDigitValue = (decimalDigitValue == "" ? "255" : decimalDigitValue) + "," + (digitValue == "" ? "255" : digitValue);
- var allNumValues = numericValue == "" ? "-1" : numericValue;
- var allValues = allNumValues + "," + allDigitValue;
-
- if (Verbose)
- {
- Console.WriteLine($"[{code:X4}]- Cat: [{category}], BiDi Category: [{bidiCategory}], Numeric: [{numericValue}], Comments: [{comments}]");
- }
-
- if (!NumberValues.TryGetValue(allValues, out var numItem))
- {
- if (NumberValues.Count >= 255)
- throw new InvalidOperationException("The possible number of values exceeds 255.");
- // Get the current element count of the hash table
- numItem = (byte)NumberValues.Count;
- NumberValues[allValues] = numItem;
- numericValueTable.AddData(numItem, allNumValues);
- digitValueTable.AddData(numItem, allDigitValue);
- }
-
- var categoryItem = GetCategoryValueItem(categoriesValueTable, allCategoryValues);
-
- if (comments[0] == '<' && comments.EndsWith(", First>", StringComparison.Ordinal))
- {
- if (Verbose)
- {
- Console.WriteLine($"Range start: {code:X4} [{category}] [{comments}]");
- }
-
- // Read the next line to get the end of the range.
- var endFields = sourceFile.ReadLine().Split(';');
- var codeEndRange = uint.Parse(endFields[0], NumberStyles.HexNumber);
- var valueEndRange = endFields[2];
- var commentsEndRange = endFields[1];
-
- if (Verbose)
- {
- Console.WriteLine($"Range end: {codeEndRange:X4} [{valueEndRange}] [{commentsEndRange}]");
- }
-
- if (category != valueEndRange)
- {
- Console.WriteLine("Different categories in the beginning of the range and the end of the range");
- Environment.Exit(1);
- }
-
- // Add data for a range of code points
- for (var i = code; i <= codeEndRange; i++)
- {
- categoriesIndexTable.AddData(i, categoryItem);
- numericIndexTable.AddData(i, numItem);
- codePointCount++;
- if (Verbose)
- {
- Console.WriteLine($"Read: {i:X8} [{allCategoryValues}]");
- }
- }
- }
- else
- {
- // Add data for a single code point.
- categoriesIndexTable.AddData(code, categoryItem);
- numericIndexTable.AddData(code, numItem);
- codePointCount++;
- if (Verbose)
- {
- Console.WriteLine($"Read: {code:X8} [{allCategoryValues}]");
- }
- }
- lineCount++;
- if (lineCount % 256 == 0)
- {
- Console.Write('.');
- }
+ file.Write("\n // Contains simple culture-invariant case fold mappings");
+ PrintValueArray("CaseFoldValues", categoryCasingMap, CategoryCasingInfo.ToCaseFoldBytes, file);
}
- Console.WriteLine();
- Console.WriteLine();
- Console.WriteLine($" Total lines in the file: {lineCount}");
- Console.WriteLine($" Total characters: {codePointCount}");
-
- var allValueCount = CategoryValues.Count;
- Console.WriteLine($" Total possible categories values: {allValueCount + 1}. Maximum allowed: 256");
-
- allValueCount = NumberValues.Count;
- Console.WriteLine($" Total possible number values: {allValueCount + 1}. Maximum allowed: 256");
- Console.WriteLine($" Finish reading {sourceFileName}.");
- }
-
- private static byte[] GetCategoriesValueBytes(string value)
- {
- if (Verbose)
- Console.WriteLine($"[{value}]");
+ PrintAssertTableLevelsBitCountRoutine("NumericGrapheme", file, numericGraphemeTableLevelBits);
- var values = value.Split(',');
- var unicodeCategoryValue = values[0];
- var bidiCategoryValue = values[1];
+ file.Write($"\n // {numericGraphemeTableLevelBits} index table of the Unicode numeric & text segmentation data.");
+ PrintSourceIndexArray("NumericGraphemeLevel1Index", numericGraphemeTable, file);
- return new[] { GetUnicodeCategoryValue(unicodeCategoryValue), GetBiDiCategoryValue(bidiCategoryValue) };
- }
+ file.Write("\n // Contains decimal digit values in high nibble; digit values in low nibble");
+ PrintValueArray("DigitValues", numericGraphemeMap, NumericGraphemeInfo.ToDigitBytes, file);
- private static byte[] GetNumericValueBytes(string value)
- {
- double d;
- var i = value.IndexOf('/');
- if (i < 0)
- d = double.Parse(value, CultureInfo.InvariantCulture);
- else
- d = double.Parse(value.Substring(0, i), CultureInfo.InvariantCulture) / double.Parse(value.Substring(i + 1), CultureInfo.InvariantCulture);
- return BitConverter.GetBytes(d);
- }
+ file.Write("\n // Contains numeric values");
+ PrintValueArray("NumericValues", numericGraphemeMap, NumericGraphemeInfo.ToNumericBytes, file);
- private static byte[] GetDigitValueBytes(string value)
- {
- if (Verbose)
- Console.WriteLine($"[{value}]");
+ file.Write("\n // Contains grapheme cluster segmentation values");
+ PrintValueArray("GraphemeSegmentationValues", numericGraphemeMap, NumericGraphemeInfo.ToGraphemeBytes, file);
- var values = value.Split(',');
- var decimalDigitValue = values[0];
- var digitValue = values[1];
+ file.Write("\n }\n}\n");
+ }
- return new[] { byte.Parse(decimalDigitValue), byte.Parse(digitValue) };
- }
+ // Quick fixup: replace \n with Environment.NewLine where the two differ (e.g. \r\n on Windows).
- ///
- /// Map a Unicode category symbol in UnicodeData.txt to a numeric value
- ///
- /// A two-letter abbreviation of Unicode category
- /// A numeric value for the corresponding two-letter Unicode category
- private static byte GetUnicodeCategoryValue(string str)
- {
- return UnicodeCategoryMap.TryGetValue(str, out var v) ? v : throw new ArgumentException($"The str [{str}] is not a valid two-letter Unicode category.");
- }
+ if (Environment.NewLine != "\n")
+ {
+ File.WriteAllText(SOURCE_NAME, File.ReadAllText(SOURCE_NAME).Replace("\n", Environment.NewLine));
+ }
- private static byte GetBiDiCategoryValue(string str)
- {
- return BiDiCategory.TryGetValue(str, out var v) ? v : throw new ArgumentException($"The str [{str}] is not a valid BiDi category.");
+ Console.WriteLine("Completed!");
}
private static void PrintSourceIndexArray(string tableName, DataTable d, StreamWriter file)
@@ -369,10 +176,31 @@ private static void PrintSourceIndexArray(string tableName, DataTable d, StreamW
PrintByteArray(tableName.Replace('1', '3'), file, levels[2]);
}
- private static void PrintValueArray(string tableName, FlatDataTable d, StreamWriter file)
+ /// <summary>
+ /// Serializes every key of <paramref name="d"/>, ordered by its assigned byte
+ /// index, and emits the concatenated bytes as a C# array named
+ /// <paramref name="tableName"/>.
+ /// </summary>
+ /// <param name="tableName">Name passed through to PrintByteArray.</param>
+ /// <param name="d">Map from a value to the byte index assigned to it.</param>
+ /// <param name="getBytesCallback">Produces the serialized bytes of one value.</param>
+ /// <param name="file">Destination writer.</param>
+ /// <exception cref="ArgumentException">Two keys share the same byte index.</exception>
+ /// <exception cref="KeyNotFoundException">An index below the highest one seen has no key.</exception>
+ private static void PrintValueArray<T>(string tableName, Dictionary<T, byte> d, Func<T, byte[]> getBytesCallback, StreamWriter file)
{
Console.WriteLine(" ******************************** .");
- PrintByteArray(tableName, file, d.GetBytesFlat());
+
+ // Create reverse mapping of byte -> T,
+ // then dump each T to the response (as binary).
+
+ // Starts at -1 so that an empty map produces an empty array instead of
+ // failing on a lookup of index 0.
+ int highestIndexSeen = -1;
+ Dictionary<byte, T> reverseMap = new Dictionary<byte, T>();
+ foreach (KeyValuePair<T, byte> entry in d)
+ {
+ reverseMap.Add(entry.Value, entry.Key);
+ if (entry.Value > highestIndexSeen)
+ {
+ highestIndexSeen = entry.Value;
+ }
+ }
+
+ // Walk the indices densely; the indexer throws if the map has a gap.
+ List<byte> binaryOutput = new List<byte>();
+ for (int i = 0; i <= highestIndexSeen; i++)
+ {
+ binaryOutput.AddRange(getBytesCallback(reverseMap[(byte)i]));
+ }
+
+ PrintByteArray(tableName, file, binaryOutput.ToArray());
}
private static void PrintByteArray(string tableName, StreamWriter file, byte[] str)
@@ -386,5 +214,18 @@ private static void PrintByteArray(string tableName, StreamWriter file, byte[] s
}
file.Write("\n };\n");
}
+
+ /// <summary>
+ /// Writes the source of a DEBUG-only <c>Assert{tableName}TableLevels</c> helper
+ /// to <paramref name="file"/>; the generated helper asserts that its three
+ /// arguments equal the bit counts held by <paramref name="expectedLevels"/>.
+ /// </summary>
+ private static void PrintAssertTableLevelsBitCountRoutine(string tableName, StreamWriter file, TableLevels expectedLevels)
+ {
+     // Attribute, signature and opening of the generated helper.
+     file.Write("\n"
+         + " [Conditional(\"DEBUG\")]\n"
+         + $" private static void Assert{tableName}TableLevels(int level1BitCount, int level2BitCount, int level3BitCount)\n"
+         + " {\n"
+         + " // Ensures that the caller expects the same L1:L2:L3 count as the actual backing data.\n");
+
+     // One Debug.Assert line per table level, in level order.
+     (string Ordinal, int Bits)[] levels =
+     {
+         ("1", expectedLevels.Level1Bits),
+         ("2", expectedLevels.Level2Bits),
+         ("3", expectedLevels.Level3Bits),
+     };
+
+     foreach ((string ordinal, int bits) in levels)
+     {
+         file.Write($" Debug.Assert(level{ordinal}BitCount == {bits}, \"Unexpected level {ordinal} bit count.\");\n");
+     }
+
+     file.Write(" }\n");
+ }
}
}
diff --git a/src/coreclr/src/System.Private.CoreLib/Tools/GenUnicodeProp/Readme.md b/src/coreclr/src/System.Private.CoreLib/Tools/GenUnicodeProp/Readme.md
index 1fc856e33fb42e..5033103117ad53 100644
--- a/src/coreclr/src/System.Private.CoreLib/Tools/GenUnicodeProp/Readme.md
+++ b/src/coreclr/src/System.Private.CoreLib/Tools/GenUnicodeProp/Readme.md
@@ -1,3 +1,22 @@
This folder contains the program used to generate the Unicode character categories file CharUnicodeInfoData.cs.
-The required UnicodeData.txt file can be obtained from https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
-To generate the file execute: `dotnet run`
+
+Before running this tool, ensure the following are all in sync:
+
+ - The package at https://github.com/dotnet/runtime-assets/tree/master/src/System.Private.Runtime.UnicodeData contains
+ the up-to-date Unicode data you want to process.
+
+ - The `<SystemPrivateRuntimeUnicodeDataVersion>` element in $(REPOROOT)\eng\Versions.props contains the correct version
+ of the package mentioned above.
+
+ - The `<UnicodeUcdVersion>` element in .\GenUnicodeProp.csproj contains the UCD version of the files you wish to process.
+
+Once this has been configured, from this directory, invoke:
+
+> `dotnet run`
+
+If you want to include casing data (simple case mappings + case folding) in the generated file, execute:
+
+> `dotnet run -- -IncludeCasingData`
+
+Then move the generated CharUnicodeInfoData.cs file to $(LIBRARIESROOT)\System.Private.CoreLib\src\System\Globalization,
+overwriting the file in that directory, and commit it. DO NOT commit the file to this directory.
diff --git a/src/coreclr/src/System.Private.CoreLib/Tools/GenUnicodeProp/StrongBidiCategory.cs b/src/coreclr/src/System.Private.CoreLib/Tools/GenUnicodeProp/StrongBidiCategory.cs
new file mode 100644
index 00000000000000..e8c90c46b06b57
--- /dev/null
+++ b/src/coreclr/src/System.Private.CoreLib/Tools/GenUnicodeProp/StrongBidiCategory.cs
@@ -0,0 +1,17 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+namespace GenUnicodeProp
+{
+ // Corresponds to the "strong" categories from https://www.unicode.org/reports/tr44/#Bidi_Class_Values.
+ // For our purposes, each code point is strongly left-to-right ("L"), strongly right-to-left ("R", "AL"),
+ // or other (all remaining code points). This is only used internally by IDN processing, and since our
+ // IDN processing logic only cares about "strong" values we don't carry the rest of the data.
+ internal enum StrongBidiCategory
+ {
+ Other = 0, // every bidi class other than "L", "R", and "AL"
+ StrongLeftToRight = 1, // bidi class "L"
+ StrongRightToLeft = 2, // bidi classes "R" and "AL"
+ }
+}
diff --git a/src/coreclr/src/System.Private.CoreLib/Tools/GenUnicodeProp/TableLevels.cs b/src/coreclr/src/System.Private.CoreLib/Tools/GenUnicodeProp/TableLevels.cs
new file mode 100644
index 00000000000000..66a2693078765e
--- /dev/null
+++ b/src/coreclr/src/System.Private.CoreLib/Tools/GenUnicodeProp/TableLevels.cs
@@ -0,0 +1,32 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+
+namespace GenUnicodeProp
+{
+    // Bit widths of three table levels; Level1Bits + Level2Bits + Level3Bits always equals 20.
+    internal class TableLevels
+    {
+        public readonly int Level1Bits; // derived: whatever remains of the 20 bits
+        public readonly int Level2Bits;
+        public readonly int Level3Bits;
+
+        // Throws ArgumentOutOfRangeException if either argument is outside 0..20, ArgumentException if their sum exceeds 20.
+        public TableLevels(int level2Bits, int level3Bits)
+        {
+            if ((uint)level2Bits > 20) { throw new ArgumentOutOfRangeException(nameof(level2Bits)); }
+            if ((uint)level3Bits > 20) { throw new ArgumentOutOfRangeException(nameof(level3Bits)); }
+
+            Level1Bits = 20 - level2Bits - level3Bits;
+            if (Level1Bits < 0) { throw new ArgumentException("Level2Bits + Level3Bits cannot exceed 20."); }
+
+            Level2Bits = level2Bits;
+            Level3Bits = level3Bits;
+        }
+
+        // Formats as "L1:L2:L3" using the invariant culture.
+        public override string ToString() => FormattableString.Invariant($"{Level1Bits}:{Level2Bits}:{Level3Bits}");
+    }
+}
diff --git a/src/libraries/Common/tests/CoreFx.Private.TestUtilities.Unicode/Configurations.props b/src/libraries/Common/tests/CoreFx.Private.TestUtilities.Unicode/Configurations.props
new file mode 100644
index 00000000000000..a9976accf42453
--- /dev/null
+++ b/src/libraries/Common/tests/CoreFx.Private.TestUtilities.Unicode/Configurations.props
@@ -0,0 +1,7 @@
+
+
+
+ $(NetCoreAppCurrent);
+
+
+
diff --git a/src/libraries/Common/tests/CoreFx.Private.TestUtilities.Unicode/CoreFx.Private.TestUtilities.Unicode.csproj b/src/libraries/Common/tests/CoreFx.Private.TestUtilities.Unicode/CoreFx.Private.TestUtilities.Unicode.csproj
new file mode 100644
index 00000000000000..e1b316f6c4b7e8
--- /dev/null
+++ b/src/libraries/Common/tests/CoreFx.Private.TestUtilities.Unicode/CoreFx.Private.TestUtilities.Unicode.csproj
@@ -0,0 +1,55 @@
+
+
+ true
+ $(NetCoreAppCurrent)-Debug;$(NetCoreAppCurrent)-Release
+ 12.1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ UnicodeData\CaseFolding.txt
+ CaseFolding.txt
+
+
+ UnicodeData\PropList.txt
+ PropList.txt
+
+
+ UnicodeData\UnicodeData.txt
+ UnicodeData.txt
+
+
+ UnicodeData\GraphemeBreakProperty.txt
+ GraphemeBreakProperty.txt
+
+
+ UnicodeData\DerivedBidiClass.txt
+ DerivedBidiClass.txt
+
+
+ UnicodeData\DerivedName.txt
+ DerivedName.txt
+
+
+ UnicodeData\emoji-data.txt
+ emoji-data.txt
+
+
+
+
+
+
+
+
+
diff --git a/src/libraries/Common/tests/CoreFx.Private.TestUtilities.Unicode/System/Text/Unicode/BidiClass.cs b/src/libraries/Common/tests/CoreFx.Private.TestUtilities.Unicode/System/Text/Unicode/BidiClass.cs
new file mode 100644
index 00000000000000..07d389813dcbc6
--- /dev/null
+++ b/src/libraries/Common/tests/CoreFx.Private.TestUtilities.Unicode/System/Text/Unicode/BidiClass.cs
@@ -0,0 +1,40 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+namespace System.Text.Unicode
+{
+ /// <summary>
+ /// Bidi class values from UAX#44.
+ /// </summary>
+ /// <remarks>
+ /// See https://www.unicode.org/reports/tr44/#BC_Values_Table
+ /// and https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt (bc).
+ /// </remarks>
+ public enum BidiClass
+ {
+ Arabic_Letter, // short alias "AL"
+ Arabic_Number, // "AN"
+ Paragraph_Separator, // "B"
+ Boundary_Neutral, // "BN"
+ Common_Separator, // "CS"
+ European_Number, // "EN"
+ European_Separator, // "ES"
+ European_Terminator, // "ET"
+ First_Strong_Isolate, // "FSI"
+ Left_To_Right, // "L"
+ Left_To_Right_Embedding, // "LRE"
+ Left_To_Right_Isolate, // "LRI"
+ Left_To_Right_Override, // "LRO"
+ Nonspacing_Mark, // "NSM"
+ Other_Neutral, // "ON"
+ Pop_Directional_Format, // "PDF"
+ Pop_Directional_Isolate, // "PDI"
+ Right_To_Left, // "R"
+ Right_To_Left_Embedding, // "RLE"
+ Right_To_Left_Isolate, // "RLI"
+ Right_To_Left_Override, // "RLO"
+ Segment_Separator, // "S"
+ White_Space, // "WS"
+ }
+}
diff --git a/src/libraries/Common/tests/CoreFx.Private.TestUtilities.Unicode/System/Text/Unicode/CodePoint.cs b/src/libraries/Common/tests/CoreFx.Private.TestUtilities.Unicode/System/Text/Unicode/CodePoint.cs
new file mode 100644
index 00000000000000..6f8300a594f275
--- /dev/null
+++ b/src/libraries/Common/tests/CoreFx.Private.TestUtilities.Unicode/System/Text/Unicode/CodePoint.cs
@@ -0,0 +1,208 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Globalization;
+using Xunit;
+
+namespace System.Text.Unicode
+{
+    /// <summary>
+    /// Represents a Unicode code point (U+0000..U+10FFFF).
+    /// </summary>
+    public sealed class CodePoint : IEquatable<CodePoint>
+    {
+        /// <summary>
+        /// Captures the properties of one code point from the parsed UCD tables, keeping the documented default wherever a table has no entry.
+        /// </summary>
+        internal CodePoint(int value, ParsedUnicodeData parsedData)
+        {
+            if ((uint)value > 0x10_FFFF)
+            {
+                throw new ArgumentOutOfRangeException(
+                    message: FormattableString.Invariant($"Value U+{(uint)value:X4} is not a valid code point."),
+                    paramName: nameof(value));
+            }
+
+            Assert.NotNull(parsedData);
+
+            Value = value;
+
+            if (parsedData.DerivedBidiClassData.TryGetValue(value, out BidiClass bidiClass))
+            {
+                BidiClass = bidiClass;
+            }
+
+            if (parsedData.PropListData.TryGetValue(value, out CodePointFlags flags))
+            {
+                Flags = flags;
+            }
+
+            // All code points by default case convert to themselves.
+
+            SimpleLowercaseMapping = value;
+            SimpleTitlecaseMapping = value;
+            SimpleUppercaseMapping = value;
+
+            if (parsedData.UnicodeDataData.TryGetValue(value, out UnicodeDataFileEntry entry))
+            {
+                GeneralCategory = entry.GeneralCategory;
+                DecimalDigitValue = entry.DecimalDigitValue;
+                DigitValue = entry.DigitValue;
+                Name = entry.Name;
+                NumericValue = entry.NumericValue;
+                SimpleLowercaseMapping = entry.SimpleLowercaseMapping;
+                SimpleTitlecaseMapping = entry.SimpleTitlecaseMapping;
+                SimpleUppercaseMapping = entry.SimpleUppercaseMapping;
+            }
+
+            // All code points by default case fold to themselves.
+
+            SimpleCaseFoldMapping = value;
+
+            if (parsedData.CaseFoldingData.TryGetValue(value, out int caseFoldsTo))
+            {
+                SimpleCaseFoldMapping = caseFoldsTo;
+            }
+
+            // Can we get a better name for this code point?
+
+            if (parsedData.DerivedNameData.TryGetValue(value, out string preferredName))
+            {
+                Name = preferredName;
+            }
+
+            // Finally, get the grapheme cluster break value.
+
+            if (parsedData.GraphemeBreakPropertyData.TryGetValue(value, out GraphemeClusterBreakProperty graphemeProperty))
+            {
+                GraphemeClusterBreakProperty = graphemeProperty;
+            }
+        }
+
+        /// <summary>
+        /// The bidi class of this code point. Note that even unassigned code points can
+        /// have a non-default bidi class.
+        /// </summary>
+        /// <remarks>
+        /// See https://www.unicode.org/reports/tr44/#Bidi_Class.
+        /// </remarks>
+        public BidiClass BidiClass { get; } = BidiClass.Left_To_Right; // default is "L" (strong left-to-right)
+
+        /// <summary>
+        /// The decimal digit value (0..9) of this code point, or -1 if this code point
+        /// does not have a decimal digit value.
+        /// </summary>
+        /// <remarks>
+        /// See https://www.unicode.org/reports/tr44/#Numeric_Value, field (6).
+        /// </remarks>
+        public int DecimalDigitValue { get; } = -1; // default is "not a decimal digit"
+
+        /// <summary>
+        /// The digit value (0..9) of this code point, or -1 if this code point
+        /// does not have a digit value.
+        /// </summary>
+        /// <remarks>
+        /// See https://www.unicode.org/reports/tr44/#Numeric_Value, field (7).
+        /// </remarks>
+        public int DigitValue { get; } = -1; // default is "not a digit"
+
+        /// <summary>
+        /// Any flags associated with this code point, such as "is white space?"
+        /// </summary>
+        /// <remarks>
+        /// See https://www.unicode.org/reports/tr44/#PropList.txt.
+        /// </remarks>
+        public CodePointFlags Flags { get; } = default; // default is "no flags"
+
+        /// <summary>
+        /// The general Unicode category of this code point.
+        /// </summary>
+        /// <remarks>
+        /// See https://www.unicode.org/reports/tr44/#UnicodeData.txt.
+        /// </remarks>
+        public UnicodeCategory GeneralCategory { get; } = UnicodeCategory.OtherNotAssigned; // default is "Unassigned"
+
+        /// <summary>
+        /// The grapheme cluster break property of this code point.
+        /// </summary>
+        /// <remarks>
+        /// See https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values.
+        /// </remarks>
+        public GraphemeClusterBreakProperty GraphemeClusterBreakProperty { get; } = GraphemeClusterBreakProperty.Other; // default is "Other"
+
+        /// <summary>
+        /// The name of this code point.
+        /// </summary>
+        /// <remarks>
+        /// See https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedName.txt.
+        /// </remarks>
+        public string Name { get; } = "";
+
+        /// <summary>
+        /// The numeric value of this code point, or -1 if this code point
+        /// does not have a numeric value.
+        /// </summary>
+        /// <remarks>
+        /// See https://www.unicode.org/reports/tr44/#Numeric_Value, field (8).
+        /// </remarks>
+        public double NumericValue { get; } = -1; // default is "not a numeric value"
+
+        /// <summary>
+        /// The code point that results from performing a simple case fold mapping of this code point.
+        /// </summary>
+        /// <remarks>
+        /// See https://www.unicode.org/reports/tr44/#CaseFolding.txt
+        /// and https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt.
+        /// </remarks>
+        public int SimpleCaseFoldMapping { get; }
+
+        /// <summary>
+        /// The code point that results from performing a simple lowercase mapping of this code point.
+        /// </summary>
+        /// <remarks>
+        /// See https://www.unicode.org/reports/tr44/#Simple_Lowercase_Mapping.
+        /// </remarks>
+        public int SimpleLowercaseMapping { get; }
+
+        /// <summary>
+        /// The code point that results from performing a simple titlecase mapping of this code point.
+        /// </summary>
+        /// <remarks>
+        /// See https://www.unicode.org/reports/tr44/#Simple_Titlecase_Mapping.
+        /// </remarks>
+        public int SimpleTitlecaseMapping { get; }
+
+        /// <summary>
+        /// The code point that results from performing a simple uppercase mapping of this code point.
+        /// </summary>
+        /// <remarks>
+        /// See https://www.unicode.org/reports/tr44/#Simple_Uppercase_Mapping.
+        /// </remarks>
+        public int SimpleUppercaseMapping { get; }
+
+        /// <summary>
+        /// The value (0000..10FFFF) of this code point.
+        /// </summary>
+        public int Value { get; }
+
+        public override bool Equals(object obj) => Equals(obj as CodePoint);
+
+        // Equality is decided by Value alone; the other properties are not compared.
+        public bool Equals(CodePoint obj)
+        {
+            if (obj is null)
+            {
+                return false;
+            }
+
+            return this.Value == obj.Value;
+        }
+
+        // Consistent with Equals: the hash is the code point value itself.
+        public override int GetHashCode() => Value;
+
+        // Formats as "U+XXXX Name" with at least four uppercase hex digits.
+        public override string ToString() => FormattableString.Invariant($"U+{(uint)Value:X4} {Name}");
+    }
+}
diff --git a/src/libraries/Common/tests/CoreFx.Private.TestUtilities.Unicode/System/Text/Unicode/CodePointFlags.cs b/src/libraries/Common/tests/CoreFx.Private.TestUtilities.Unicode/System/Text/Unicode/CodePointFlags.cs
new file mode 100644
index 00000000000000..63d0a27846b289
--- /dev/null
+++ b/src/libraries/Common/tests/CoreFx.Private.TestUtilities.Unicode/System/Text/Unicode/CodePointFlags.cs
@@ -0,0 +1,52 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+namespace System.Text.Unicode
+{
+ /// <summary>
+ /// Code point properties from UAX#44.
+ /// </summary>
+ /// <remarks>
+ /// See https://www.unicode.org/reports/tr44/#PropList.txt
+ /// and https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt.
+ /// </remarks>
+ [Flags]
+ public enum CodePointFlags : ulong // 34 flags are declared (bits 0..33), which is more than a 32-bit backing type can hold
+ {
+ ASCII_Hex_Digit = 1ul << 0,
+ Bidi_Control = 1ul << 1,
+ Dash = 1ul << 2,
+ Deprecated = 1ul << 3,
+ Diacritic = 1ul << 4,
+ Extender = 1ul << 5,
+ Hex_Digit = 1ul << 6,
+ Hyphen = 1ul << 7,
+ Ideographic = 1ul << 8,
+ IDS_Binary_Operator = 1ul << 9,
+ IDS_Trinary_Operator = 1ul << 10,
+ Join_Control = 1ul << 11,
+ Logical_Order_Exception = 1ul << 12,
+ Noncharacter_Code_Point = 1ul << 13,
+ Other_Alphabetic = 1ul << 14,
+ Other_Default_Ignorable_Code_Point = 1ul << 15,
+ Other_Grapheme_Extend = 1ul << 16,
+ Other_ID_Continue = 1ul << 17,
+ Other_ID_Start = 1ul << 18,
+ Other_Lowercase = 1ul << 19,
+ Other_Math = 1ul << 20,
+ Other_Uppercase = 1ul << 21,
+ Pattern_Syntax = 1ul << 22,
+ Pattern_White_Space = 1ul << 23,
+ Prepended_Concatenation_Mark = 1ul << 24,
+ Quotation_Mark = 1ul << 25,
+ Radical = 1ul << 26,
+ Regional_Indicator = 1ul << 27,
+ Sentence_Terminal = 1ul << 28,
+ Soft_Dotted = 1ul << 29,
+ Terminal_Punctuation = 1ul << 30,
+ Unified_Ideograph = 1ul << 31,
+ Variation_Selector = 1ul << 32,
+ White_Space = 1ul << 33,
+ }
+}
diff --git a/src/libraries/Common/tests/CoreFx.Private.TestUtilities.Unicode/System/Text/Unicode/GraphemeClusterBreakProperty.cs b/src/libraries/Common/tests/CoreFx.Private.TestUtilities.Unicode/System/Text/Unicode/GraphemeClusterBreakProperty.cs
new file mode 100644
index 00000000000000..5ebcbde0fb9be8
--- /dev/null
+++ b/src/libraries/Common/tests/CoreFx.Private.TestUtilities.Unicode/System/Text/Unicode/GraphemeClusterBreakProperty.cs
@@ -0,0 +1,32 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+namespace System.Text.Unicode
+{
+ /// <summary>
+ /// Grapheme cluster break property values from UAX#29.
+ /// </summary>
+ /// <remarks>
+ /// See https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values
+ /// and https://www.unicode.org/Public/emoji/12.1/emoji-data.txt.
+ /// </remarks>
+ public enum GraphemeClusterBreakProperty
+ {
+ Other, // zero value; also what CodePoint reports when the data files have no entry for a code point
+ CR,
+ LF,
+ Control,
+ Extend,
+ ZWJ,
+ Regional_Indicator,
+ Prepend,
+ SpacingMark,
+ L,
+ V,
+ T,
+ LV,
+ LVT,
+ Extended_Pictographic, // NOTE(review): presumably the only value sourced from emoji-data.txt - confirm
+ }
+}
diff --git a/src/libraries/Common/tests/CoreFx.Private.TestUtilities.Unicode/System/Text/Unicode/ParsedUnicodeData.cs b/src/libraries/Common/tests/CoreFx.Private.TestUtilities.Unicode/System/Text/Unicode/ParsedUnicodeData.cs
new file mode 100644
index 00000000000000..12991652135a6e
--- /dev/null
+++ b/src/libraries/Common/tests/CoreFx.Private.TestUtilities.Unicode/System/Text/Unicode/ParsedUnicodeData.cs
@@ -0,0 +1,270 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Collections.Generic;
+using System.Globalization;
+using System.IO;
+using Xunit;
+
+namespace System.Text.Unicode
+{
+    /// <summary>
+    /// Parses the UCD text files stored as manifest resources into per-code-point lookup tables.
+    /// </summary>
+    internal sealed class ParsedUnicodeData
+    {
+        // Mappings from https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt (bc).
+        private static readonly Dictionary<string, BidiClass> BidiClassMap = new Dictionary<string, BidiClass>()
+        {
+            ["AL"] = BidiClass.Arabic_Letter,
+            ["AN"] = BidiClass.Arabic_Number,
+            ["B"] = BidiClass.Paragraph_Separator,
+            ["BN"] = BidiClass.Boundary_Neutral,
+            ["CS"] = BidiClass.Common_Separator,
+            ["EN"] = BidiClass.European_Number,
+            ["ES"] = BidiClass.European_Separator,
+            ["ET"] = BidiClass.European_Terminator,
+            ["FSI"] = BidiClass.First_Strong_Isolate,
+            ["L"] = BidiClass.Left_To_Right,
+            ["LRE"] = BidiClass.Left_To_Right_Embedding,
+            ["LRI"] = BidiClass.Left_To_Right_Isolate,
+            ["LRO"] = BidiClass.Left_To_Right_Override,
+            ["NSM"] = BidiClass.Nonspacing_Mark,
+            ["ON"] = BidiClass.Other_Neutral,
+            ["PDF"] = BidiClass.Pop_Directional_Format,
+            ["PDI"] = BidiClass.Pop_Directional_Isolate,
+            ["R"] = BidiClass.Right_To_Left,
+            ["RLE"] = BidiClass.Right_To_Left_Embedding,
+            ["RLI"] = BidiClass.Right_To_Left_Isolate,
+            ["RLO"] = BidiClass.Right_To_Left_Override,
+            ["S"] = BidiClass.Segment_Separator,
+            ["WS"] = BidiClass.White_Space,
+        };
+
+        internal readonly Dictionary<int, int> CaseFoldingData;
+        internal readonly Dictionary<int, BidiClass> DerivedBidiClassData;
+        internal readonly Dictionary<int, string> DerivedNameData;
+        internal readonly Dictionary<int, GraphemeClusterBreakProperty> GraphemeBreakPropertyData;
+        internal readonly Dictionary<int, CodePointFlags> PropListData;
+        internal readonly Dictionary<int, UnicodeDataFileEntry> UnicodeDataData;
+
+        public ParsedUnicodeData()
+        {
+            CaseFoldingData = ProcessCaseFoldingFile();
+            DerivedBidiClassData = ProcessDerivedBidiClassFile();
+            DerivedNameData = ProcessDerivedNameFile();
+            GraphemeBreakPropertyData = ProcessGraphemeClusterBreakAndEmojiDataFiles();
+            PropListData = ProcessPropListFile();
+            UnicodeDataData = ProcessUnicodeDataFile();
+        }
+
+        /// <summary>
+        /// Reads CaseFolding.txt and parses each entry in that file.
+        /// </summary>
+        private static Dictionary<int, int> ProcessCaseFoldingFile()
+        {
+            using Stream stream = Resources.OpenResource(Resources.CaseFolding);
+            using StreamReader reader = new StreamReader(stream);
+
+            Dictionary<int, int> dict = new Dictionary<int, int>();
+
+            string thisLine;
+            while ((thisLine = reader.ReadLine()) != null)
+            {
+                // Ignore blank lines or comment lines
+
+                if (string.IsNullOrEmpty(thisLine) || thisLine[0] == '#') { continue; }
+
+                // Line should be in format "<code>; <status>; <mapping>; # <name>"
+
+                string[] split = thisLine.Split(';');
+                Assert.Equal(4, split.Length);
+
+                // We only support common and simple case folding; ignore everything else.
+
+                char status = split[1].AsSpan().Trim()[0];
+                if (status != 'C' && status != 'S') { continue; }
+
+                int fromCodePoint = (int)uint.Parse(split[0], NumberStyles.HexNumber, CultureInfo.InvariantCulture);
+                int toCodePoint = (int)uint.Parse(split[2], NumberStyles.HexNumber, CultureInfo.InvariantCulture);
+                dict.Add(fromCodePoint, toCodePoint);
+            }
+
+            return dict;
+        }
+
+        /// <summary>
+        /// Reads DerivedBidiClass.txt and parses each entry in that file.
+        /// </summary>
+        private static Dictionary<int, BidiClass> ProcessDerivedBidiClassFile()
+        {
+            using Stream stream = Resources.OpenResource(Resources.DerivedBidiClass);
+            using StreamReader reader = new StreamReader(stream);
+
+            Dictionary<int, BidiClass> dict = new Dictionary<int, BidiClass>();
+
+            string thisLine;
+            while ((thisLine = reader.ReadLine()) != null)
+            {
+                if (PropsFileEntry.TryParseLine(thisLine, out PropsFileEntry value))
+                {
+                    BidiClass bidiClass = BidiClassMap[value.PropName];
+
+                    for (int i = value.FirstCodePoint; i <= value.LastCodePoint /* inclusive */; i++)
+                    {
+                        dict.Add(i, bidiClass);
+                    }
+                }
+            }
+
+            return dict;
+        }
+
+        /// <summary>
+        /// Reads DerivedName.txt and parses each entry in that file.
+        /// </summary>
+        private static Dictionary<int, string> ProcessDerivedNameFile()
+        {
+            using Stream stream = Resources.OpenResource(Resources.DerivedName);
+            using StreamReader reader = new StreamReader(stream);
+
+            Dictionary<int, string> dict = new Dictionary<int, string>();
+
+            string thisLine;
+            while ((thisLine = reader.ReadLine()) != null)
+            {
+                if (PropsFileEntry.TryParseLine(thisLine, out PropsFileEntry value))
+                {
+                    if (value.IsSingleCodePoint)
+                    {
+                        // Single code point of format "XXXX ; <Name>" (name shouldn't end with '*')
+
+                        Assert.False(value.PropName.EndsWith('*'));
+                        dict.Add(value.FirstCodePoint, value.PropName);
+                    }
+                    else
+                    {
+                        // Range of format "XXXX..YYYY ; <BaseName>-*"
+
+                        Assert.True(value.PropName.EndsWith('*'));
+
+                        string baseName = value.PropName[..^1];
+                        for (int i = value.FirstCodePoint; i <= value.LastCodePoint /* inclusive */; i++)
+                        {
+                            dict.Add(i, baseName + i.ToString("X4", CultureInfo.InvariantCulture));
+                        }
+                    }
+                }
+            }
+
+            return dict;
+        }
+
+        /// <summary>
+        /// Reads GraphemeBreakProperty.txt and emoji-data.txt and parses each entry in those files.
+        /// </summary>
+        private static Dictionary<int, GraphemeClusterBreakProperty> ProcessGraphemeClusterBreakAndEmojiDataFiles()
+        {
+            Dictionary<int, GraphemeClusterBreakProperty> dict = new Dictionary<int, GraphemeClusterBreakProperty>();
+
+            foreach (string resourceName in new[] { Resources.GraphemeBreakProperty, Resources.EmojiData })
+            {
+                using Stream stream = Resources.OpenResource(resourceName);
+                using StreamReader reader = new StreamReader(stream);
+
+                string thisLine;
+                while ((thisLine = reader.ReadLine()) != null)
+                {
+                    if (PropsFileEntry.TryParseLine(thisLine, out PropsFileEntry value))
+                    {
+                        if (Enum.TryParse(value.PropName, out GraphemeClusterBreakProperty property))
+                        {
+                            for (int i = value.FirstCodePoint; i <= value.LastCodePoint /* inclusive */; i++)
+                            {
+                                dict.Add(i, property);
+                            }
+                        }
+                    }
+                }
+            }
+
+            return dict;
+        }
+
+        /// <summary>
+        /// Reads PropList.txt and parses each entry in that file.
+        /// </summary>
+        private static Dictionary<int, CodePointFlags> ProcessPropListFile()
+        {
+            using Stream stream = Resources.OpenResource(Resources.PropList);
+            using StreamReader reader = new StreamReader(stream);
+
+            Dictionary<int, CodePointFlags> dict = new Dictionary<int, CodePointFlags>();
+
+            string thisLine;
+            while ((thisLine = reader.ReadLine()) != null)
+            {
+                // Expect "XXXX[..YYYY] ; <PropName> # <comment>"
+
+                if (PropsFileEntry.TryParseLine(thisLine, out PropsFileEntry value))
+                {
+                    CodePointFlags newFlag = Enum.Parse<CodePointFlags>(value.PropName);
+                    for (int i = value.FirstCodePoint; i <= value.LastCodePoint /* inclusive */; i++)
+                    {
+                        dict.TryGetValue(i, out CodePointFlags flagsForThisCodePoint /* could be default(T) */);
+                        flagsForThisCodePoint |= newFlag;
+                        dict[i] = flagsForThisCodePoint;
+                    }
+                }
+            }
+
+            return dict;
+        }
+
+        /// <summary>
+        /// Reads UnicodeData.txt and parses each entry in that file.
+        /// </summary>
+        private static Dictionary<int, UnicodeDataFileEntry> ProcessUnicodeDataFile()
+        {
+            using Stream stream = Resources.OpenResource(Resources.UnicodeData);
+            using StreamReader reader = new StreamReader(stream);
+
+            Dictionary<int, UnicodeDataFileEntry> dict = new Dictionary<int, UnicodeDataFileEntry>();
+
+            string thisLine;
+            while ((thisLine = reader.ReadLine()) != null)
+            {
+                // Skip blank lines at beginning or end of the file
+
+                if (thisLine.Length == 0) { continue; }
+
+                UnicodeDataFileEntry entry = new UnicodeDataFileEntry(thisLine);
+
+                if (entry.Name.EndsWith(", First>", StringComparison.Ordinal))
+                {
+                    // This is an entry of the form XXXX;<BaseName, First>;...
+                    // We expect it to be followed by YYYY;<BaseName, Last>;... (fail clearly if the file ends instead)
+
+                    UnicodeDataFileEntry nextEntry = new UnicodeDataFileEntry(reader.ReadLine() ?? throw new InvalidDataException("UnicodeData.txt ended before the ', Last>' entry of a range."));
+                    Assert.EndsWith(", Last>", nextEntry.Name, StringComparison.Ordinal);
+                    Assert.Equal(entry.Name[..^", First>".Length], nextEntry.Name[..^", Last>".Length]);
+
+                    string baseName = entry.Name.Remove(entry.Name.Length - ", First>".Length, ", First".Length); // remove the ", First" part of the name
+                    for (int i = entry.CodePoint; i <= nextEntry.CodePoint /* inclusive */; i++)
+                    {
+                        dict.Add(i, new UnicodeDataFileEntry(i, baseName, entry.GeneralCategory));
+                    }
+                }
+                else
+                {
+                    // This is a single code point entry, not a range.
+
+                    dict.Add(entry.CodePoint, entry);
+                }
+            }
+
+            return dict;
+        }
+    }
+}
diff --git a/src/libraries/Common/tests/CoreFx.Private.TestUtilities.Unicode/System/Text/Unicode/PropsFileEntry.cs b/src/libraries/Common/tests/CoreFx.Private.TestUtilities.Unicode/System/Text/Unicode/PropsFileEntry.cs
new file mode 100644
index 00000000000000..ac56ffbe8ca743
--- /dev/null
+++ b/src/libraries/Common/tests/CoreFx.Private.TestUtilities.Unicode/System/Text/Unicode/PropsFileEntry.cs
@@ -0,0 +1,57 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Globalization;
+using System.Security.Policy;
+using System.Text.RegularExpressions;
+using Xunit;
+
+namespace System.Text.Unicode
+{
+    // Represents an entry in a Unicode props file.
+    // The expected format is "XXXX[..YYYY] ; <propName> [# <comment>]".
+    internal sealed class PropsFileEntry
+    {
+        private static readonly Regex _regex = new Regex(@"^\s*(?<firstCodePoint>[0-9a-f]{4,})(\.\.(?<lastCodePoint>[0-9a-f]{4,}))?\s*;\s*(?<propName>.+?)\s*(#\s*(?<comment>.*))?$", RegexOptions.IgnoreCase);
+
+        public readonly int FirstCodePoint;
+        public readonly int LastCodePoint;
+        public readonly string PropName;
+
+        private PropsFileEntry(uint firstCodePoint, uint lastCodePoint, string propName)
+        {
+            Assert.True(firstCodePoint <= 0x10FFFF, "First code point is out of range.");
+            Assert.True(lastCodePoint <= 0x10FFFF, "Last code point is out of range.");
+            Assert.True(firstCodePoint <= lastCodePoint, "First code point is after last code point.");
+
+            FirstCodePoint = (int)firstCodePoint;
+            LastCodePoint = (int)lastCodePoint;
+            PropName = propName;
+        }
+
+        public bool IsSingleCodePoint => (FirstCodePoint == LastCodePoint);
+
+        // Returns false (with a null entry) for any line that does not start with a hex code point,
+        // such as blank and comment-only lines; asserts if the parsed code points are out of range.
+        public static bool TryParseLine(string line, out PropsFileEntry value)
+        {
+            Match match = _regex.Match(line);
+
+            if (!match.Success)
+            {
+                value = default; // no match
+                return false;
+            }
+
+            uint firstCodePoint = uint.Parse(match.Groups["firstCodePoint"].Value, NumberStyles.HexNumber, CultureInfo.InvariantCulture);
+
+            // Without a "..YYYY" segment the entry describes a single code point.
+            Group lastGroup = match.Groups["lastCodePoint"];
+            uint lastCodePoint = lastGroup.Success ? uint.Parse(lastGroup.Value, NumberStyles.HexNumber, CultureInfo.InvariantCulture) : firstCodePoint;
+
+            value = new PropsFileEntry(firstCodePoint, lastCodePoint, match.Groups["propName"].Value);
+            return true;
+        }
+    }
+}
diff --git a/src/libraries/Common/tests/CoreFx.Private.TestUtilities.Unicode/System/Text/Unicode/Resources.cs b/src/libraries/Common/tests/CoreFx.Private.TestUtilities.Unicode/System/Text/Unicode/Resources.cs
new file mode 100644
index 00000000000000..af473c7a3ddf80
--- /dev/null
+++ b/src/libraries/Common/tests/CoreFx.Private.TestUtilities.Unicode/System/Text/Unicode/Resources.cs
@@ -0,0 +1,25 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.IO;
+
+namespace System.Text.Unicode
+{
+    internal static class Resources
+    {
+        public const string CaseFolding = "CaseFolding.txt";
+        public const string DerivedBidiClass = "DerivedBidiClass.txt";
+        public const string DerivedName = "DerivedName.txt";
+        public const string EmojiData = "emoji-data.txt";
+        public const string GraphemeBreakProperty = "GraphemeBreakProperty.txt";
+        public const string PropList = "PropList.txt";
+        public const string UnicodeData = "UnicodeData.txt";
+
+        // Opens the named manifest resource from the assembly that contains this type,
+        // throwing ArgumentException instead of returning null when no such resource exists.
+        public static Stream OpenResource(string resourceName) =>
+            typeof(Resources).Assembly.GetManifestResourceStream(resourceName)
+            ?? throw new ArgumentException($"Resource {resourceName} not found.", nameof(resourceName));
+    }
+}
diff --git a/src/libraries/Common/tests/CoreFx.Private.TestUtilities.Unicode/System/Text/Unicode/UnicodeData.cs b/src/libraries/Common/tests/CoreFx.Private.TestUtilities.Unicode/System/Text/Unicode/UnicodeData.cs
new file mode 100644
index 00000000000000..f5bba3856d6824
--- /dev/null
+++ b/src/libraries/Common/tests/CoreFx.Private.TestUtilities.Unicode/System/Text/Unicode/UnicodeData.cs
@@ -0,0 +1,40 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Threading;
+
+namespace System.Text.Unicode
+{
+    public static class UnicodeData // on-demand, per-code-point cache of CodePoint objects built from the parsed UCD files
+    {
+        private static readonly CodePoint[] _codePointData = new CodePoint[0x11_0000]; // an array for all code points
+        private static readonly Lazy<ParsedUnicodeData> _lazyParsedData = new Lazy<ParsedUnicodeData>();
+
+        public static CodePoint GetData(int codePoint)
+        {
+            if ((uint)codePoint >= _codePointData.Length)
+            {
+                throw new ArgumentOutOfRangeException(
+                    message: FormattableString.Invariant($"Value U+{(uint)codePoint:X4} is not a valid code point."),
+                    paramName: nameof(codePoint));
+            }
+
+            CodePoint data = _codePointData[codePoint];
+
+            if (data is null)
+            {
+                // Generate on demand and publish to the cache. CompareExchange returns the slot's previous
+                // value: null means our instance was stored; otherwise an earlier instance exists and we use it.
+                data = new CodePoint(codePoint, _lazyParsedData.Value);
+                data = Interlocked.CompareExchange(ref _codePointData[codePoint], data, null) ?? data;
+            }
+
+            return data;
+        }
+
+        public static CodePoint GetData(uint codePoint) => GetData((int)codePoint);
+
+        public static CodePoint GetData(Rune rune) => GetData(rune.Value);
+    }
+}
diff --git a/src/libraries/Common/tests/CoreFx.Private.TestUtilities.Unicode/System/Text/Unicode/UnicodeDataFileEntry.cs b/src/libraries/Common/tests/CoreFx.Private.TestUtilities.Unicode/System/Text/Unicode/UnicodeDataFileEntry.cs
new file mode 100644
index 00000000000000..0659769159b065
--- /dev/null
+++ b/src/libraries/Common/tests/CoreFx.Private.TestUtilities.Unicode/System/Text/Unicode/UnicodeDataFileEntry.cs
@@ -0,0 +1,132 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Collections.Generic;
+using System.Globalization;
+using Xunit;
+
+namespace System.Text.Unicode
+{
+    // Represents an entry from UnicodeData.txt.
+    internal sealed class UnicodeDataFileEntry
+    {
+        private static readonly Dictionary<string, UnicodeCategory> UnicodeCategoryMap = new Dictionary<string, UnicodeCategory>
+        {
+            ["Lu"] = UnicodeCategory.UppercaseLetter,
+            ["Ll"] = UnicodeCategory.LowercaseLetter,
+            ["Lt"] = UnicodeCategory.TitlecaseLetter,
+            ["Lm"] = UnicodeCategory.ModifierLetter,
+            ["Lo"] = UnicodeCategory.OtherLetter,
+            ["Mn"] = UnicodeCategory.NonSpacingMark,
+            ["Mc"] = UnicodeCategory.SpacingCombiningMark,
+            ["Me"] = UnicodeCategory.EnclosingMark,
+            ["Nd"] = UnicodeCategory.DecimalDigitNumber,
+            ["Nl"] = UnicodeCategory.LetterNumber,
+            ["No"] = UnicodeCategory.OtherNumber,
+            ["Zs"] = UnicodeCategory.SpaceSeparator,
+            ["Zl"] = UnicodeCategory.LineSeparator,
+            ["Zp"] = UnicodeCategory.ParagraphSeparator,
+            ["Cc"] = UnicodeCategory.Control,
+            ["Cf"] = UnicodeCategory.Format,
+            ["Cs"] = UnicodeCategory.Surrogate,
+            ["Co"] = UnicodeCategory.PrivateUse,
+            ["Pc"] = UnicodeCategory.ConnectorPunctuation,
+            ["Pd"] = UnicodeCategory.DashPunctuation,
+            ["Ps"] = UnicodeCategory.OpenPunctuation,
+            ["Pe"] = UnicodeCategory.ClosePunctuation,
+            ["Pi"] = UnicodeCategory.InitialQuotePunctuation,
+            ["Pf"] = UnicodeCategory.FinalQuotePunctuation,
+            ["Po"] = UnicodeCategory.OtherPunctuation,
+            ["Sm"] = UnicodeCategory.MathSymbol,
+            ["Sc"] = UnicodeCategory.CurrencySymbol,
+            ["Sk"] = UnicodeCategory.ModifierSymbol,
+            ["So"] = UnicodeCategory.OtherSymbol,
+            ["Cn"] = UnicodeCategory.OtherNotAssigned,
+        };
+
+        public readonly int CodePoint; // field 0 (hex)
+        public readonly string Name; // field 1
+        public readonly UnicodeCategory GeneralCategory; // field 2, mapped through UnicodeCategoryMap
+        public readonly int DecimalDigitValue; // field 6, or -1 when absent
+        public readonly int DigitValue; // field 7, or -1 when absent
+        public readonly double NumericValue; // field 8, or -1 when absent
+        public readonly int SimpleUppercaseMapping; // field 12 (hex), or CodePoint when absent
+        public readonly int SimpleLowercaseMapping; // field 13 (hex), or CodePoint when absent
+        public readonly int SimpleTitlecaseMapping; // field 14 (hex), or CodePoint when absent
+
+        // ctor used when UnicodeData.txt contains a range
+        public UnicodeDataFileEntry(int codePoint, string baseName, UnicodeCategory generalCategory)
+        {
+            CodePoint = codePoint;
+            Name = baseName;
+            GeneralCategory = generalCategory;
+
+            DecimalDigitValue = -1;
+            DigitValue = -1;
+            NumericValue = -1;
+
+            SimpleUppercaseMapping = codePoint;
+            SimpleLowercaseMapping = codePoint;
+            SimpleTitlecaseMapping = codePoint;
+        }
+
+        public UnicodeDataFileEntry(string line)
+        {
+            // The format of each line is listed at https://www.unicode.org/reports/tr44/#UnicodeData.txt.
+            // ';' is used as a separator, and we should have exactly 15 entries per line.
+
+            string[] split = line.Split(';');
+            Assert.Equal(15, split.Length);
+
+            CodePoint = (int)uint.Parse(split[0], NumberStyles.HexNumber, CultureInfo.InvariantCulture);
+            Name = split[1];
+            GeneralCategory = UnicodeCategoryMap[split[2]];
+
+            // Fields 6 and 7 hold decimal digits (0..9), so parse them as decimal integers rather than as hex.
+            if (!int.TryParse(split[6], NumberStyles.Integer, CultureInfo.InvariantCulture, out DecimalDigitValue))
+            {
+                DecimalDigitValue = -1;
+            }
+
+            if (!int.TryParse(split[7], NumberStyles.Integer, CultureInfo.InvariantCulture, out DigitValue))
+            {
+                DigitValue = -1;
+            }
+
+            NumericValue = -1;
+            if (!string.IsNullOrEmpty(split[8]))
+            {
+                // Data is in the format "[-]M[/N]"
+                string numericValue = split[8];
+                int indexOfSlash = numericValue.IndexOf('/');
+
+                if (indexOfSlash < 0)
+                {
+                    NumericValue = double.Parse(numericValue, NumberStyles.Integer, CultureInfo.InvariantCulture);
+                }
+                else
+                {
+                    double numerator = double.Parse(numericValue.AsSpan(0, indexOfSlash), NumberStyles.Integer, CultureInfo.InvariantCulture);
+                    double denominator = double.Parse(numericValue.AsSpan(indexOfSlash + 1), NumberStyles.Integer, CultureInfo.InvariantCulture);
+                    NumericValue = numerator / denominator;
+                }
+            }
+
+            if (!int.TryParse(split[12], NumberStyles.HexNumber, CultureInfo.InvariantCulture, out SimpleUppercaseMapping))
+            {
+                SimpleUppercaseMapping = CodePoint;
+            }
+
+            if (!int.TryParse(split[13], NumberStyles.HexNumber, CultureInfo.InvariantCulture, out SimpleLowercaseMapping))
+            {
+                SimpleLowercaseMapping = CodePoint;
+            }
+
+            if (!int.TryParse(split[14], NumberStyles.HexNumber, CultureInfo.InvariantCulture, out SimpleTitlecaseMapping))
+            {
+                SimpleTitlecaseMapping = CodePoint;
+            }
+        }
+    }
+}
diff --git a/src/libraries/Microsoft.VisualBasic.Core/src/Microsoft/VisualBasic/Strings.vb b/src/libraries/Microsoft.VisualBasic.Core/src/Microsoft/VisualBasic/Strings.vb
index e49fed8a11222b..44f7678cbdcf67 100644
--- a/src/libraries/Microsoft.VisualBasic.Core/src/Microsoft/VisualBasic/Strings.vb
+++ b/src/libraries/Microsoft.VisualBasic.Core/src/Microsoft/VisualBasic/Strings.vb
@@ -1008,90 +1008,35 @@ RedimAndExit:
If (Expression Is Nothing) Then
Return ""
+ ElseIf Expression.Length <= 1 Then
+ Return Expression
End If
- Dim chars As Char()
- Dim uc As UnicodeCategory
- Dim ch As Char
- Dim SrcIndex, Length As Integer
-
- Length = Expression.Length
- If Length = 0 Then
- Return ""
- End If
-
- 'Detect if there are any graphemes that need special handling
- For SrcIndex = 0 To Length - 1
- ch = Expression.Chars(SrcIndex)
- uc = Char.GetUnicodeCategory(ch)
- If uc = UnicodeCategory.Surrogate OrElse
- uc = UnicodeCategory.NonSpacingMark OrElse
- uc = UnicodeCategory.SpacingCombiningMark OrElse
- uc = UnicodeCategory.EnclosingMark Then
- 'Need to use special handling
- Return InternalStrReverse(Expression, SrcIndex, Length)
- End If
- Next SrcIndex
-
- chars = Expression.ToCharArray()
- System.Array.Reverse(chars)
- Return New String(chars)
-
- End Function
-
- 'This routine handles reversing Strings containing graphemes
- ' GRAPHEME: a text element that is displayed as a single character
- '
- Private Function InternalStrReverse(ByVal Expression As String, ByVal SrcIndex As Integer, ByVal Length As Integer) As String
-
- Dim TextEnum As TextElementEnumerator
- Dim DestIndex, LastSrcIndex, NextSrcIndex As Integer
- Dim sb As StringBuilder
-
- 'This code can only be hit one time
- sb = New StringBuilder(Length)
- sb.Length = Length
-
- TextEnum = StringInfo.GetTextElementEnumerator(Expression, SrcIndex)
-
- 'Init enumerator position
- If Not TextEnum.MoveNext() Then
- Return ""
- End If
+ 'Use TextElementEnumerator to iterate through the grapheme clusters, then
+ 'add them to the destination array in reverse order. A grapheme cluster
+ 'is a text element that displays as a single character; it might consist
+ 'of multiple chars.
+ Dim TextEnum As TextElementEnumerator = StringInfo.GetTextElementEnumerator(Expression)
+ Dim Output(Expression.Length - 1) As Char
+ Dim LastSrcIndex As Integer
+ 'Initialize the enumerator
+ TextEnum.MoveNext()
LastSrcIndex = 0
- DestIndex = Length - 1
- 'Copy up the first surrogate found
- Do While LastSrcIndex < SrcIndex
- sb.Chars(DestIndex) = Expression.Chars(LastSrcIndex)
- DestIndex -= 1
- LastSrcIndex += 1
+ 'Iterate through the enumerator, forward-copying each text element of the
+ 'source expression to the matching position counted from the end of the Output array.
+ ' Example: input = [ (ABC) (D) (EFGH) (IJ) (K) ]
+ ' output = [ (K) (IJ) (EFGH) (D) (ABC) ]
+ Do While TextEnum.MoveNext()
+ Expression.CopyTo(LastSrcIndex, Output, Output.Length - TextEnum.ElementIndex, TextEnum.ElementIndex - LastSrcIndex)
+ LastSrcIndex = TextEnum.ElementIndex
Loop
- 'Now iterate through the text elements and copy them to the reversed string
- NextSrcIndex = TextEnum.ElementIndex
-
- Do While DestIndex >= 0
- SrcIndex = NextSrcIndex
+ 'The above loop won't hit the last element - we need to copy the remaining data manually
+ Expression.CopyTo(LastSrcIndex, Output, 0, Expression.Length - LastSrcIndex)
- 'Move to next element
- If (TextEnum.MoveNext()) Then
- NextSrcIndex = TextEnum.ElementIndex
- Else
- 'Point NextSrcIndex to end of string
- NextSrcIndex = Length
- End If
- LastSrcIndex = NextSrcIndex - 1
-
- Do While LastSrcIndex >= SrcIndex
- sb.Chars(DestIndex) = Expression.Chars(LastSrcIndex)
- DestIndex -= 1
- LastSrcIndex -= 1
- Loop
- Loop
-
- Return sb.ToString()
+ Return New String(Output)
End Function
diff --git a/src/libraries/System.Globalization/tests/System.Globalization.Tests.csproj b/src/libraries/System.Globalization/tests/System.Globalization.Tests.csproj
index 888ebb272e3036..04fab88cbfab25 100644
--- a/src/libraries/System.Globalization/tests/System.Globalization.Tests.csproj
+++ b/src/libraries/System.Globalization/tests/System.Globalization.Tests.csproj
@@ -101,7 +101,9 @@
+
+
@@ -114,17 +116,17 @@
-
- CharUnicodeInfo\UnicodeData.11.0.txt
- UnicodeData.11.0.txt
+
+
+ CharUnicodeInfo\UnicodeData.12.1.txt
+ UnicodeData.12.1.txt
-
- CharUnicodeInfo\UnicodeData8.0.txt
- UnicodeData.8.0.txt
-
-
- CharUnicodeInfo\UnicodeData6.3.txt
- UnicodeData6.3.txt
+
+ CharUnicodeInfo\GraphemeBreakTest-12.1.0.txt
+ GraphemeBreakTest-12.1.0.txt
+
+
+
diff --git a/src/libraries/System.Globalization/tests/System/Globalization/CharUnicodeInfoTestData.cs b/src/libraries/System.Globalization/tests/System/Globalization/CharUnicodeInfoTestData.cs
index 24b90192c32f9b..0ee520683008a0 100644
--- a/src/libraries/System.Globalization/tests/System/Globalization/CharUnicodeInfoTestData.cs
+++ b/src/libraries/System.Globalization/tests/System/Globalization/CharUnicodeInfoTestData.cs
@@ -13,10 +13,8 @@ public static class CharUnicodeInfoTestData
private static readonly Lazy> s_testCases = new Lazy>(() =>
{
List testCases = new List();
- string fileName =
- CharUnicodeInfo.GetUnicodeCategory('\u10D0') == UnicodeCategory.LowercaseLetter ? "UnicodeData.11.0.txt" :
- CharUnicodeInfo.GetUnicodeCategory('\u037f') == UnicodeCategory.OtherNotAssigned ? "UnicodeData6.3.txt" : "UnicodeData.8.0.txt";
- Stream stream = typeof(CharUnicodeInfoGetUnicodeCategoryTests).GetTypeInfo().Assembly.GetManifestResourceStream(fileName);
+ string fileName = "UnicodeData.12.1.txt";
+ Stream stream = typeof(CharUnicodeInfoTestData).GetTypeInfo().Assembly.GetManifestResourceStream(fileName);
using (StreamReader reader = new StreamReader(stream))
{
while (!reader.EndOfStream)
diff --git a/src/libraries/System.Globalization/tests/System/Globalization/CharUnicodeInfoTests.Generated.cs b/src/libraries/System.Globalization/tests/System/Globalization/CharUnicodeInfoTests.Generated.cs
new file mode 100644
index 00000000000000..d8c34a581b727e
--- /dev/null
+++ b/src/libraries/System.Globalization/tests/System/Globalization/CharUnicodeInfoTests.Generated.cs
@@ -0,0 +1,93 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Collections.Generic;
+using System.Text.Unicode;
+using Xunit;
+using Xunit.Sdk;
+
+namespace System.Globalization.Tests
+{
+ /// <summary>
+ /// Compares the values returned by <see cref="CharUnicodeInfo"/> against the
+ /// reference data returned by <c>UnicodeData.GetData</c>.
+ /// </summary>
+ public partial class CharUnicodeInfoTests
+ {
+     /// <summary>
+     /// Checks <c>CharUnicodeInfo.GetDecimalDigitValue(char)</c> for every value in [0, char.MaxValue].
+     /// </summary>
+     [Fact]
+     public void GetDecimalDigitValue_Char()
+     {
+         // The counter is an int so that "i <= char.MaxValue" can terminate;
+         // a char counter would wrap around to 0 instead of exceeding the bound.
+         for (int i = 0; i <= char.MaxValue; i++)
+         {
+             char ch = (char)i;
+
+             CodePoint knownGoodData = UnicodeData.GetData(ch);
+             int actualValue = CharUnicodeInfo.GetDecimalDigitValue(ch);
+
+             AssertEqual(knownGoodData.DecimalDigitValue, actualValue, nameof(CharUnicodeInfo.GetDecimalDigitValue), knownGoodData);
+         }
+     }
+
+     /// <summary>
+     /// Checks <c>CharUnicodeInfo.GetDigitValue(char)</c> for every value in [0, char.MaxValue].
+     /// </summary>
+     [Fact]
+     public void GetDigitValue_Char()
+     {
+         for (int i = 0; i <= char.MaxValue; i++)
+         {
+             char ch = (char)i;
+
+             CodePoint knownGoodData = UnicodeData.GetData(ch);
+             int actualValue = CharUnicodeInfo.GetDigitValue(ch);
+
+             AssertEqual(knownGoodData.DigitValue, actualValue, nameof(CharUnicodeInfo.GetDigitValue), knownGoodData);
+         }
+     }
+
+     /// <summary>
+     /// Checks <c>CharUnicodeInfo.GetNumericValue(char)</c> for every value in [0, char.MaxValue].
+     /// </summary>
+     [Fact]
+     public void GetNumericValue_Char()
+     {
+         for (int i = 0; i <= char.MaxValue; i++)
+         {
+             char ch = (char)i;
+
+             CodePoint knownGoodData = UnicodeData.GetData(ch);
+             double actualValue = CharUnicodeInfo.GetNumericValue(ch);
+
+             AssertEqual(knownGoodData.NumericValue, actualValue, nameof(CharUnicodeInfo.GetNumericValue), knownGoodData);
+         }
+     }
+
+     /// <summary>
+     /// Checks <c>CharUnicodeInfo.GetUnicodeCategory(char)</c> for every value in [0, char.MaxValue].
+     /// </summary>
+     [Fact]
+     public void GetUnicodeCategory_Char()
+     {
+         for (int i = 0; i <= char.MaxValue; i++)
+         {
+             char ch = (char)i;
+
+             CodePoint knownGoodData = UnicodeData.GetData(ch);
+             UnicodeCategory actualCategory = CharUnicodeInfo.GetUnicodeCategory(ch);
+
+             AssertEqual(knownGoodData.GeneralCategory, actualCategory, nameof(CharUnicodeInfo.GetUnicodeCategory), knownGoodData);
+         }
+     }
+
+     /// <summary>
+     /// Checks <c>CharUnicodeInfo.GetUnicodeCategory(int)</c> for every value in [0, HIGHEST_CODE_POINT].
+     /// </summary>
+     [Fact]
+     public void GetUnicodeCategory_Int32()
+     {
+         for (int i = 0; i <= HIGHEST_CODE_POINT; i++)
+         {
+             CodePoint knownGoodData = UnicodeData.GetData(i);
+             UnicodeCategory actualCategory = CharUnicodeInfo.GetUnicodeCategory(i);
+
+             AssertEqual(knownGoodData.GeneralCategory, actualCategory, nameof(CharUnicodeInfo.GetUnicodeCategory), knownGoodData);
+         }
+     }
+
+     /// <summary>
+     /// Throws an <see cref="AssertActualExpectedException"/> when <paramref name="expected"/> and
+     /// <paramref name="actual"/> differ according to the default equality comparer for
+     /// <typeparamref name="T"/>.
+     /// </summary>
+     /// <typeparam name="T">Type of the values being compared.</typeparam>
+     /// <param name="expected">Value taken from the reference data.</param>
+     /// <param name="actual">Value returned by the API under test.</param>
+     /// <param name="methodName">Name of the <see cref="CharUnicodeInfo"/> method, used in the failure message.</param>
+     /// <param name="codePoint">Code point under test, formatted into the failure message.</param>
+     private static void AssertEqual<T>(T expected, T actual, string methodName, CodePoint codePoint)
+     {
+         if (!EqualityComparer<T>.Default.Equals(expected, actual))
+         {
+             // Build the message with the invariant culture so failures read the same on every machine.
+             throw new AssertActualExpectedException(
+                 expected: expected,
+                 actual: actual,
+                 userMessage: FormattableString.Invariant($"CharUnicodeInfo.{methodName}({codePoint}) returned unexpected value."));
+         }
+     }
+ }
+}
diff --git a/src/libraries/System.Globalization/tests/System/Globalization/CharUnicodeInfoTests.cs b/src/libraries/System.Globalization/tests/System/Globalization/CharUnicodeInfoTests.cs
index 1abde82c1f817b..a4e49945a1095d 100644
--- a/src/libraries/System.Globalization/tests/System/Globalization/CharUnicodeInfoTests.cs
+++ b/src/libraries/System.Globalization/tests/System/Globalization/CharUnicodeInfoTests.cs
@@ -6,8 +6,10 @@
namespace System.Globalization.Tests
{
- public class CharUnicodeInfoGetUnicodeCategoryTests
+ public partial class CharUnicodeInfoTests
{
+ private const int HIGHEST_CODE_POINT = 0x10_FFFF;
+
[Fact]
public void GetUnicodeCategory()
{
@@ -20,9 +22,7 @@ public void GetUnicodeCategory()
}
// Test the string overload for a surrogate pair or a single char
GetUnicodeCategory(testCase.Utf32CodeValue, new UnicodeCategory[] { testCase.GeneralCategory });
-#if NETCOREAPP
Assert.Equal(testCase.GeneralCategory, CharUnicodeInfo.GetUnicodeCategory(testCase.CodePoint));
-#endif
}
}
@@ -104,10 +104,10 @@ public void GetNumericValue(string s, double[] expected)
public static readonly object[][] s_GetNumericValueData =
{
new object[] {"aA1!", new double[] { -1, -1, 1, -1 }},
- // Numeric surrogate (CUNEIFORM NUMERIC SIGN FIVE BAN2 VARIANT FORM)
+ // Numeric supplementary plane code point (U+12455 CUNEIFORM NUMERIC SIGN FIVE BAN2 VARIANT FORM)
new object[] {"\uD809\uDC55", new double[] { 5, -1 }},
new object[] {"a\uD809\uDC55a", new double[] { -1, 5, -1 , -1 }},
- // Numeric surrogate (CUNEIFORM NUMERIC SIGN FIVE BAN2 VARIANT FORM)
+ // Non-numeric supplementary plane code point (U+1236C CUNEIFORM SIGN ZU5 TIMES A)
new object[] {"\uD808\uDF6C", new double[] { -1, -1 }},
new object[] {"a\uD808\uDF6Ca", new double[] { -1, -1, -1, -1 }},
};
diff --git a/src/libraries/System.Globalization/tests/System/Globalization/GraphemeBreakTest.cs b/src/libraries/System.Globalization/tests/System/Globalization/GraphemeBreakTest.cs
new file mode 100644
index 00000000000000..b0400026acbb9c
--- /dev/null
+++ b/src/libraries/System.Globalization/tests/System/Globalization/GraphemeBreakTest.cs
@@ -0,0 +1,168 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using System.Text.Unicode;
+using Microsoft.VisualBasic;
+using Xunit;
+using Xunit.Sdk;
+
+namespace System.Globalization.Tests
+{
+ /// <summary>
+ /// Tests grapheme cluster segmentation against the embedded
+ /// "GraphemeBreakTest-12.1.0.txt" test data.
+ /// </summary>
+ public class GraphemeBreakTest
+ {
+     private const char BREAK_REQUIRED = '\u00F7'; // DIVISION SIGN
+     private const char BREAK_FORBIDDEN = '\u00D7'; // MULTIPLICATION SIGN
+
+     /// <summary>
+     /// For each test line, checks that <see cref="StringInfo.ParseCombiningCharacters(string)"/>
+     /// returns exactly the UTF-16 start index of every expected cluster.
+     /// </summary>
+     [Fact]
+     public void CompareRuntimeImplementationAgainstUnicodeTestData()
+     {
+         foreach ((Rune[][] clusters, string line) in GetGraphemeBreakTestData())
+         {
+             // Arrange
+
+             List<int> expected = new List<int>();
+             StringBuilder input = new StringBuilder();
+
+             foreach (Rune[] cluster in clusters)
+             {
+                 expected.Add(input.Length); // we're about to start a new cluster
+                 foreach (Rune scalar in cluster)
+                 {
+                     input.Append(scalar);
+                 }
+             }
+
+             // Act
+
+             int[] actual = StringInfo.ParseCombiningCharacters(input.ToString());
+
+             // Assert
+
+             if (!expected.SequenceEqual(actual))
+             {
+                 throw new AssertActualExpectedException(
+                     expected: expected.ToArray(),
+                     actual: actual,
+                     userMessage: "Grapheme break test failed on test case: " + line);
+             }
+         }
+     }
+
+     /// <summary>
+     /// For each test line, checks that <c>Strings.StrReverse</c> reverses the order of the
+     /// clusters while keeping the scalars inside each cluster in their original order.
+     /// </summary>
+     [Fact]
+     public void VisualBasicReverseString()
+     {
+         foreach ((Rune[][] clusters, string line) in GetGraphemeBreakTestData())
+         {
+             // Arrange
+
+             string forwardActual = string.Concat(clusters.SelectMany(cluster => cluster).Select(rune => rune.ToString()));
+             string reverseExpected = string.Concat(clusters.Reverse().SelectMany(cluster => cluster).Select(rune => rune.ToString()));
+
+             // Act
+
+             string reverseActual = Strings.StrReverse(forwardActual);
+
+             // Assert
+
+             if (reverseExpected != reverseActual)
+             {
+                 throw new AssertActualExpectedException(
+                     expected: PrintCodePointsForDebug(reverseExpected),
+                     actual: PrintCodePointsForDebug(reverseActual),
+                     userMessage: "Grapheme break test failed on test case: " + line);
+             }
+         }
+     }
+
+     /// <summary>
+     /// Guards the assumption that U+FFFD REPLACEMENT CHARACTER has the grapheme
+     /// cluster break property "Other".
+     /// </summary>
+     [Fact]
+     public void ReplacementCharHasTypeOther()
+     {
+         // We rely on U+FFFD REPLACEMENT CHARACTER having certain properties
+         // (such as a grapheme boundary property of "Other-XX"), since unpaired
+         // UTF-16 surrogate code points and other ill-formed subsequences are normalized
+         // to this character. If we ingest a new version of the UCD files where this
+         // has been changed, we probably need to update the logic in StringInfo
+         // and related types.
+
+         // Query U+FFFD itself: that is the character the comment above (and the test name) is about.
+         Assert.Equal(GraphemeClusterBreakProperty.Other, UnicodeData.GetData('\uFFFD').GraphemeClusterBreakProperty);
+     }
+
+     /// <summary>
+     /// Reads the embedded "GraphemeBreakTest-12.1.0.txt" resource and yields one entry per
+     /// non-blank, non-comment line.
+     /// </summary>
+     /// <returns>
+     /// For each line, the clusters it describes (each cluster being an array of scalars)
+     /// together with the raw line text, which is used in failure messages.
+     /// </returns>
+     private static IEnumerable<(Rune[][] clusters, string line)> GetGraphemeBreakTestData()
+     {
+         using Stream stream = typeof(GraphemeBreakTest).Assembly.GetManifestResourceStream("GraphemeBreakTest-12.1.0.txt");
+         using StreamReader reader = new StreamReader(stream);
+
+         string line;
+         while ((line = reader.ReadLine()) != null)
+         {
+             // Skip blank or comment-only lines
+
+             if (string.IsNullOrEmpty(line) || line[0] == '#')
+             {
+                 continue;
+             }
+
+             // Line has format "÷ (XXXX (× YYYY)* ÷)+ # <comment>"
+             // We'll yield return a Rune[][], representing a collection of clusters, where each cluster contains a collection of Runes.
+             //
+             // Example: "÷ AAAA ÷ BBBB × CCCC × DDDD ÷ EEEE × FFFF ÷ # <comment>"
+             // -> [ [ AAAA ], [ BBBB, CCCC, DDDD ], [ EEEE, FFFF ] ]
+             //
+             // We also return the line for ease of debugging any test failures.
+
+             // Everything from the first '#' onward is dropped before splitting on the break markers.
+             string[] clusters = line[..line.IndexOf('#')].Trim().Split(BREAK_REQUIRED, StringSplitOptions.RemoveEmptyEntries);
+
+             yield return (Array.ConvertAll(clusters, cluster =>
+             {
+                 string[] scalarsWithinClusterAsStrings = cluster.Split(BREAK_FORBIDDEN, StringSplitOptions.RemoveEmptyEntries);
+                 uint[] scalarsWithinClusterAsUInt32s = Array.ConvertAll(scalarsWithinClusterAsStrings, scalar => uint.Parse(scalar, NumberStyles.HexNumber, CultureInfo.InvariantCulture));
+                 Rune[] scalarsWithinClusterAsRunes = Array.ConvertAll(scalarsWithinClusterAsUInt32s, scalar => new Rune(scalar));
+                 return scalarsWithinClusterAsRunes;
+             }), line);
+         }
+     }
+
+     /// <summary>
+     /// Given a sequence of UTF-16 code units, prints them in "[ XXXX YYYY ZZZZ ]" form,
+     /// combining surrogate pairs into a single supplementary code point where possible.
+     /// </summary>
+     /// <param name="input">The UTF-16 code units to format.</param>
+     /// <returns>The formatted, space-separated uppercase hexadecimal values.</returns>
+     private static string PrintCodePointsForDebug(IEnumerable<char> input)
+     {
+         StringBuilder sb = new StringBuilder();
+         sb.Append("[ ");
+
+         // IEnumerator<char> is IDisposable, so release it when we're done.
+         using IEnumerator<char> enumerator = input.GetEnumerator();
+
+         bool hasCurrent = enumerator.MoveNext();
+         while (hasCurrent)
+         {
+             char thisChar = enumerator.Current;
+
+             // Advance first so that enumerator.Current is the look-ahead char (if any).
+             hasCurrent = enumerator.MoveNext();
+
+             if (hasCurrent && char.IsHighSurrogate(thisChar) && char.IsLowSurrogate(enumerator.Current))
+             {
+                 // surrogate pair - extract supplementary code point and consume the low surrogate
+                 sb.AppendFormat(CultureInfo.InvariantCulture, "{0:X4} ", (uint)char.ConvertToUtf32(thisChar, enumerator.Current));
+                 hasCurrent = enumerator.MoveNext();
+             }
+             else
+             {
+                 // not a high surrogate, or a high surrogate with no low surrogate after it - it goes as-is;
+                 // the look-ahead char (if any) is handled by the next iteration
+                 sb.AppendFormat(CultureInfo.InvariantCulture, "{0:X4} ", (uint)thisChar);
+             }
+         }
+
+         sb.Append("]");
+         return sb.ToString();
+     }
+ }
+}
diff --git a/src/libraries/System.Globalization/tests/System/Globalization/StringInfoTests.cs b/src/libraries/System.Globalization/tests/System/Globalization/StringInfoTests.cs
index e97a71300b8ca7..070d8dd4853d8e 100644
--- a/src/libraries/System.Globalization/tests/System/Globalization/StringInfoTests.cs
+++ b/src/libraries/System.Globalization/tests/System/Globalization/StringInfoTests.cs
@@ -3,7 +3,6 @@
// See the LICENSE file in the project root for more information.
using System.Collections.Generic;
-using System.Reflection;
using Xunit;
namespace System.Globalization.Tests
@@ -143,7 +142,7 @@ public static IEnumerable