Skip to content

Commit 422fef8

Browse files
Update CharUnicodeInfo to Unicode 12.1; update StringInfo to UAX29 (#328)
- Updates CharUnicodeInfo.GetCategory from Unicode 11.0 to Unicode 12.1 - Optimizes some methods in Char and CharUnicodeInfo to have better codegen - Updates StringInfo to use UAX29 extended grapheme cluster segmentation logic
1 parent 7b92525 commit 422fef8

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+4575
-2332
lines changed

eng/Versions.props

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161
<SystemIOCompressionTestDataVersion>5.0.0-beta.19608.5</SystemIOCompressionTestDataVersion>
6262
<SystemIOPackagingTestDataVersion>5.0.0-beta.19608.5</SystemIOPackagingTestDataVersion>
6363
<SystemNetTestDataVersion>5.0.0-beta.19608.5</SystemNetTestDataVersion>
64+
<SystemPrivateRuntimeUnicodeDataVersion>5.0.0-beta.19610.1</SystemPrivateRuntimeUnicodeDataVersion>
6465
<SystemSecurityCryptographyX509CertificatesTestDataVersion>5.0.0-beta.19608.5</SystemSecurityCryptographyX509CertificatesTestDataVersion>
6566
<SystemWindowsExtensionsTestDataVersion>5.0.0-beta.19608.5</SystemWindowsExtensionsTestDataVersion>
6667
<!-- Standard dependencies -->
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
// See the LICENSE file in the project root for more information.
4+
5+
using System;
6+
using System.Buffers.Binary;
7+
using System.Globalization;
8+
using System.Text;
9+
using System.Text.Unicode;
10+
11+
namespace GenUnicodeProp
12+
{
13+
/// <summary>
/// Immutable snapshot of the per-code-point data this tool serializes:
/// the Unicode general category, a reduced ("strong") bidi classification,
/// the simple upper / lower / title / case-fold mapping offsets, and the
/// White_Space flag.
/// </summary>
internal sealed class CategoryCasingInfo : IEquatable<CategoryCasingInfo>
{
    // All state is kept in one value tuple so that equality and hashing can
    // be delegated to the tuple's element-wise implementation.
    private readonly (UnicodeCategory generalCategory,
        StrongBidiCategory strongBidiCategory,
        ushort offsetToSimpleUppercase,
        ushort offsetToSimpleLowercase,
        ushort offsetToSimpleTitlecase,
        ushort offsetToSimpleCasefold,
        bool isWhitespace) _data;

    /// <summary>
    /// Captures the category, bidi, casing and whitespace data of <paramref name="codePoint"/>.
    /// </summary>
    /// <param name="codePoint">The code point whose properties are recorded.</param>
    public CategoryCasingInfo(CodePoint codePoint)
    {
        UnicodeCategory category = codePoint.GeneralCategory;
        StrongBidiCategory bidi = ToStrongBidiCategory(codePoint.BidiClass);

        // Casing offsets stay at zero unless the tool was asked to emit casing data.
        ushort upperOffset = default;
        ushort lowerOffset = default;
        ushort titleOffset = default;
        ushort foldOffset = default;

        if (Program.IncludeCasingData)
        {
            // Each offset is (mapped value - code point value) narrowed to 16 bits.
            // The casts are not wrapped in 'checked', so under default compiler
            // settings the difference is truncated rather than range-checked.
            upperOffset = (ushort)(codePoint.SimpleUppercaseMapping - codePoint.Value);
            lowerOffset = (ushort)(codePoint.SimpleLowercaseMapping - codePoint.Value);
            titleOffset = (ushort)(codePoint.SimpleTitlecaseMapping - codePoint.Value);
            foldOffset = (ushort)(codePoint.SimpleCaseFoldMapping - codePoint.Value);
        }

        bool whitespace = codePoint.Flags.HasFlag(CodePointFlags.White_Space);

        _data = (category, bidi, upperOffset, lowerOffset, titleOffset, foldOffset, whitespace);
    }

    /// <summary>
    /// Reduces a full bidi class to the three-way classification stored by this type:
    /// Left_To_Right is strong left-to-right, Right_To_Left and Arabic_Letter are
    /// strong right-to-left, and every other class is "other".
    /// </summary>
    private static StrongBidiCategory ToStrongBidiCategory(BidiClass bidiClass)
    {
        if (bidiClass == BidiClass.Left_To_Right)
        {
            return StrongBidiCategory.StrongLeftToRight;
        }

        if (bidiClass == BidiClass.Right_To_Left || bidiClass == BidiClass.Arabic_Letter)
        {
            return StrongBidiCategory.StrongRightToLeft;
        }

        return StrongBidiCategory.Other;
    }

    /// <summary>
    /// Writes a 16-bit value into a freshly allocated two-byte array, low byte first.
    /// </summary>
    private static byte[] ToLittleEndianBytes(ushort value)
    {
        byte[] buffer = new byte[sizeof(ushort)];
        BinaryPrimitives.WriteUInt16LittleEndian(buffer, value);
        return buffer;
    }

    /// <summary>
    /// Returns true when <paramref name="obj"/> is a <see cref="CategoryCasingInfo"/>
    /// holding the same data as this instance.
    /// </summary>
    public override bool Equals(object obj)
    {
        return Equals(obj as CategoryCasingInfo);
    }

    /// <summary>
    /// Returns true when <paramref name="other"/> is non-null and all of its
    /// recorded fields equal this instance's fields.
    /// </summary>
    public bool Equals(CategoryCasingInfo other)
    {
        if (other is null)
        {
            return false;
        }

        return _data.Equals(other._data);
    }

    /// <summary>
    /// Returns the hash code of the underlying data tuple.
    /// </summary>
    public override int GetHashCode() => _data.GetHashCode();

    /// <summary>
    /// Packs the whitespace flag, strong bidi category and general category of
    /// <paramref name="input"/> into a single byte.
    /// </summary>
    /// <returns>A one-element array containing the packed byte.</returns>
    /// <exception cref="OverflowException">The combined value does not fit in a byte.</exception>
    public static byte[] ToCategoryBytes(CategoryCasingInfo input)
    {
        // Layout of the packed byte:
        //   bit 7 (high bit) = isWhitespace?
        //   bits 6..5        = restricted bidi class
        //   bits 4..0        = Unicode category

        int whitespaceBit = (input._data.isWhitespace ? 1 : 0) << 7;
        int bidiBits = (int)input._data.strongBidiCategory << 5;
        int categoryBits = (int)input._data.generalCategory;

        // The parts are summed (not OR-ed) and the final narrowing is checked,
        // so a total above 0xFF throws instead of being truncated.
        int packed = whitespaceBit + bidiBits + categoryBits;

        return new byte[] { checked((byte)packed) };
    }

    /// <summary>
    /// Returns the simple-uppercase offset of <paramref name="input"/> as two little-endian bytes.
    /// </summary>
    public static byte[] ToUpperBytes(CategoryCasingInfo input)
    {
        return ToLittleEndianBytes(input._data.offsetToSimpleUppercase);
    }

    /// <summary>
    /// Returns the simple-lowercase offset of <paramref name="input"/> as two little-endian bytes.
    /// </summary>
    public static byte[] ToLowerBytes(CategoryCasingInfo input)
    {
        return ToLittleEndianBytes(input._data.offsetToSimpleLowercase);
    }

    /// <summary>
    /// Returns the simple-titlecase offset of <paramref name="input"/> as two little-endian bytes.
    /// </summary>
    public static byte[] ToTitleBytes(CategoryCasingInfo input)
    {
        return ToLittleEndianBytes(input._data.offsetToSimpleTitlecase);
    }

    /// <summary>
    /// Returns the simple-case-fold offset of <paramref name="input"/> as two little-endian bytes.
    /// </summary>
    public static byte[] ToCaseFoldBytes(CategoryCasingInfo input)
    {
        return ToLittleEndianBytes(input._data.offsetToSimpleCasefold);
    }
}
119+
}

src/coreclr/src/System.Private.CoreLib/Tools/GenUnicodeProp/DataTable.cs

Lines changed: 0 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -170,30 +170,4 @@ public byte[][] GetBytes()
170170
return new[] { Level1Index.ToArray(), level2.ToArray(), Level3Data.ToArray() };
171171
}
172172
}
173-
174-
/// <summary>
/// Collects string values keyed by code point and serializes them, in ascending
/// code point order, into one contiguous byte array.
/// </summary>
internal sealed class FlatDataTable
{
    // Used in place of any stored value that is null at serialization time.
    private readonly string _fallbackValue;

    // Converts a single value into its byte representation.
    private readonly Func<string, byte[]> _encodeValue;

    // Code point -> value. The sorted dictionary enumerates in ascending key order.
    private readonly SortedDictionary<uint, string> _valuesByCodePoint = new SortedDictionary<uint, string>();

    /// <summary>
    /// Creates an empty table.
    /// </summary>
    /// <param name="defaultValue">Value substituted for entries whose stored value is null.</param>
    /// <param name="getValueBytesCallback">Callback that turns a value into the bytes to emit for it.</param>
    public FlatDataTable(string defaultValue, Func<string, byte[]> getValueBytesCallback)
    {
        _fallbackValue = defaultValue;
        _encodeValue = getValueBytesCallback;
    }

    /// <summary>
    /// Stores <paramref name="value"/> for <paramref name="codepoint"/>, replacing
    /// any value previously stored for the same code point.
    /// </summary>
    public void AddData(uint codepoint, string value)
    {
        _valuesByCodePoint[codepoint] = value;
    }

    /// <summary>
    /// Encodes every stored entry with the callback and concatenates the results
    /// in ascending code point order.
    /// </summary>
    /// <returns>The concatenated bytes; an empty array when nothing has been added.</returns>
    public byte[] GetBytesFlat()
    {
        var flattened = new List<byte>();

        foreach (KeyValuePair<uint, string> entry in _valuesByCodePoint)
        {
            string effectiveValue = entry.Value ?? _fallbackValue;
            flattened.AddRange(_encodeValue(effectiveValue));
        }

        return flattened.ToArray();
    }
}
199173
}

src/coreclr/src/System.Private.CoreLib/Tools/GenUnicodeProp/GenUnicodeProp.csproj

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,51 @@
22

33
<PropertyGroup>
44
<OutputType>Exe</OutputType>
5-
<TargetFramework>$(NetCoreAppCurrent)</TargetFramework>
5+
<TargetFramework>netcoreapp3.0</TargetFramework>
6+
<UnicodeUcdVersion>12.1</UnicodeUcdVersion>
67
</PropertyGroup>
78

9+
<ItemGroup>
10+
<Compile Include="$(LibrariesProjectRoot)\Common\tests\CoreFx.Private.TestUtilities.Unicode\**\*.cs" />
11+
</ItemGroup>
12+
13+
<ItemGroup>
14+
<PackageReference Include="System.Private.Runtime.UnicodeData" Version="$(SystemPrivateRuntimeUnicodeDataVersion)" ExcludeAssets="contentFiles" GeneratePathProperty="true" />
15+
<EmbeddedResource Include="$(PkgSystem_Private_Runtime_UnicodeData)\contentFiles\any\any\$(UnicodeUcdVersion).0\ucd\CaseFolding.txt">
16+
<Link>UnicodeData\CaseFolding.txt</Link>
17+
<LogicalName>CaseFolding.txt</LogicalName>
18+
</EmbeddedResource>
19+
<EmbeddedResource Include="$(PkgSystem_Private_Runtime_UnicodeData)\contentFiles\any\any\$(UnicodeUcdVersion).0\ucd\PropList.txt">
20+
<Link>UnicodeData\PropList.txt</Link>
21+
<LogicalName>PropList.txt</LogicalName>
22+
</EmbeddedResource>
23+
<EmbeddedResource Include="$(PkgSystem_Private_Runtime_UnicodeData)\contentFiles\any\any\$(UnicodeUcdVersion).0\ucd\UnicodeData.txt">
24+
<Link>UnicodeData\UnicodeData.txt</Link>
25+
<LogicalName>UnicodeData.txt</LogicalName>
26+
</EmbeddedResource>
27+
<EmbeddedResource Include="$(PkgSystem_Private_Runtime_UnicodeData)\contentFiles\any\any\$(UnicodeUcdVersion).0\ucd\auxiliary\GraphemeBreakProperty.txt">
28+
<Link>UnicodeData\GraphemeBreakProperty.txt</Link>
29+
<LogicalName>GraphemeBreakProperty.txt</LogicalName>
30+
</EmbeddedResource>
31+
<EmbeddedResource Include="$(PkgSystem_Private_Runtime_UnicodeData)\contentFiles\any\any\$(UnicodeUcdVersion).0\ucd\extracted\DerivedBidiClass.txt">
32+
<Link>UnicodeData\DerivedBidiClass.txt</Link>
33+
<LogicalName>DerivedBidiClass.txt</LogicalName>
34+
</EmbeddedResource>
35+
<EmbeddedResource Include="$(PkgSystem_Private_Runtime_UnicodeData)\contentFiles\any\any\$(UnicodeUcdVersion).0\ucd\extracted\DerivedName.txt">
36+
<Link>UnicodeData\DerivedName.txt</Link>
37+
<LogicalName>DerivedName.txt</LogicalName>
38+
</EmbeddedResource>
39+
<EmbeddedResource Include="$(PkgSystem_Private_Runtime_UnicodeData)\contentFiles\any\any\emoji\$(UnicodeUcdVersion)\emoji-data.txt">
40+
<Link>UnicodeData\emoji-data.txt</Link>
41+
<LogicalName>emoji-data.txt</LogicalName>
42+
</EmbeddedResource>
43+
</ItemGroup>
44+
45+
<ItemGroup>
46+
<PackageReference Include="Microsoft.DotNet.PlatformAbstractions" Version="$(MicrosoftDotNetPlatformAbstractionsVersion)" />
47+
<PackageReference Include="Microsoft.DotNet.XUnitExtensions" Version="$(MicrosoftDotNetXUnitExtensionsVersion)" />
48+
<PackageReference Include="xunit.core" Version="$(XUnitVersion)" ExcludeAssets="build" />
49+
<PackageReference Include="xunit.assert" Version="$(XUnitVersion)" />
50+
</ItemGroup>
51+
852
</Project>
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
// See the LICENSE file in the project root for more information.
4+
5+
using System;
6+
using System.Buffers.Binary;
7+
using System.Runtime.CompilerServices;
8+
using System.Text.Unicode;
9+
10+
namespace GenUnicodeProp
11+
{
12+
/// <summary>
/// Contains information about a code point's numeric representation
/// (decimal digit value, digit value, numeric value) and the manner in
/// which it's treated for grapheme cluster segmentation purposes.
/// </summary>
internal sealed class NumericGraphemeInfo : IEquatable<NumericGraphemeInfo>
{
    // All state is kept in one value tuple so that equality and hashing can
    // be delegated to the tuple's element-wise implementation.
    public readonly (int decimalDigitValue,
        int digitValue,
        double numericValue,
        GraphemeClusterBreakProperty graphemeClusterBreakProperty) _data;

    /// <summary>
    /// Captures the numeric and grapheme-cluster-break data of <paramref name="codePoint"/>.
    /// </summary>
    /// <param name="codePoint">The code point whose properties are recorded.</param>
    public NumericGraphemeInfo(CodePoint codePoint)
    {
        _data.decimalDigitValue = codePoint.DecimalDigitValue;
        _data.digitValue = codePoint.DigitValue;
        _data.numericValue = codePoint.NumericValue;
        _data.graphemeClusterBreakProperty = codePoint.GraphemeClusterBreakProperty;
    }

    /// <summary>
    /// Returns true when <paramref name="obj"/> is a <see cref="NumericGraphemeInfo"/>
    /// holding the same data as this instance.
    /// </summary>
    public override bool Equals(object obj) => Equals(obj as NumericGraphemeInfo);

    /// <summary>
    /// Returns true when <paramref name="other"/> is non-null and all of its
    /// recorded fields equal this instance's fields.
    /// </summary>
    public bool Equals(NumericGraphemeInfo other)
    {
        return !(other is null) && this._data.Equals(other._data);
    }

    /// <summary>
    /// Returns the hash code of the underlying data tuple.
    /// </summary>
    public override int GetHashCode()
    {
        return _data.GetHashCode();
    }

    /// <summary>
    /// Packs the decimal digit value and the digit value of <paramref name="input"/>
    /// into a single byte, one value per nibble.
    /// </summary>
    /// <returns>A one-element array containing the packed byte.</returns>
    /// <exception cref="OverflowException">
    /// Either value, after the +1 adjustment, does not fit in four bits.
    /// </exception>
    public static byte[] ToDigitBytes(NumericGraphemeInfo input)
    {
        // Bits 4 .. 7 contain (decimalDigitValue + 1).
        // Bits 0 .. 3 contain (digitValue + 1).
        // This means that each nibble will have a value 0x0 .. 0xa, inclusive.
        // (A stored value of -1 is therefore written as 0; presumably -1 means
        // "no digit value" - confirm against the CodePoint definition.)

        int adjustedDecimalDigitValue = input._data.decimalDigitValue + 1;
        int adjustedDigitValue = input._data.digitValue + 1;

        // Fail loudly instead of letting an out-of-range value spill into the
        // neighboring nibble or be silently truncated by the byte cast. This
        // mirrors the checked casts used by the other single-byte serializers.
        if (!FitsInNibble(adjustedDecimalDigitValue) || !FitsInNibble(adjustedDigitValue))
        {
            throw new OverflowException(
                $"Digit values ({input._data.decimalDigitValue}, {input._data.digitValue}) cannot be packed into two nibbles.");
        }

        return new byte[] { (byte)((adjustedDecimalDigitValue << 4) | adjustedDigitValue) };
    }

    /// <summary>
    /// Returns the numeric value of <paramref name="input"/> as the eight bytes of
    /// its 64-bit floating-point bit pattern, in little-endian order.
    /// </summary>
    public static byte[] ToNumericBytes(NumericGraphemeInfo input)
    {
        byte[] bytes = new byte[sizeof(double)];

        // BitConverter yields the same bit pattern as reinterpreting the double
        // in place, without requiring an unsafe reinterpret cast.
        long bits = BitConverter.DoubleToInt64Bits(input._data.numericValue);
        BinaryPrimitives.WriteInt64LittleEndian(bytes, bits);
        return bytes;
    }

    /// <summary>
    /// Returns the grapheme cluster break property of <paramref name="input"/> as a single byte.
    /// </summary>
    /// <exception cref="OverflowException">The property's numeric value does not fit in a byte.</exception>
    public static byte[] ToGraphemeBytes(NumericGraphemeInfo input)
    {
        return new byte[] { checked((byte)input._data.graphemeClusterBreakProperty) };
    }

    /// <summary>
    /// Returns true when <paramref name="value"/> is representable in four bits (0x0 .. 0xF).
    /// </summary>
    private static bool FitsInNibble(int value)
    {
        return value >= 0 && value <= 0xF;
    }
}
69+
}

0 commit comments

Comments
 (0)