Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

namespace System.Text.Unicode
{
/// <summary>
/// Bidi class values from UAX#44.
/// </summary>
/// <remarks>
/// See https://www.unicode.org/reports/tr44/#BC_Values_Table
/// and https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt (bc).
/// Each member uses the long property value name; the trailing comment gives the
/// corresponding short alias. Members are listed alphabetically by that short alias,
/// and since no explicit values are assigned, a member's numeric value is simply its
/// position in this list (starting at 0).
/// </remarks>
public enum BidiClass
{
Arabic_Letter, // AL
Arabic_Number, // AN
Paragraph_Separator, // B
Boundary_Neutral, // BN
Common_Separator, // CS
European_Number, // EN
European_Separator, // ES
European_Terminator, // ET
First_Strong_Isolate, // FSI
Left_To_Right, // L
Left_To_Right_Embedding, // LRE
Left_To_Right_Isolate, // LRI
Left_To_Right_Override, // LRO
Nonspacing_Mark, // NSM
Other_Neutral, // ON
Pop_Directional_Format, // PDF
Pop_Directional_Isolate, // PDI
Right_To_Left, // R
Right_To_Left_Embedding, // RLE
Right_To_Left_Isolate, // RLI
Right_To_Left_Override, // RLO
Segment_Separator, // S
White_Space, // WS
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Buffers.Binary;
using System.Globalization;
using System.Text;
using System.Text.Unicode;

namespace GenUnicodeProp
{
/// <summary>
/// Immutable bundle of the per-code-point data this tool serializes: the Unicode
/// general category, a reduced "strong" bidi category, the white-space flag, and
/// the 16-bit offsets to the simple upper / lower / title / case-fold mappings.
/// </summary>
/// <remarks>
/// Equality and hashing are delegated to the underlying value tuple, so two
/// instances compare equal whenever every stored field matches, regardless of
/// which code point they were built from.
/// </remarks>
internal sealed class CategoryCasingInfo : IEquatable<CategoryCasingInfo>
{
    private readonly (UnicodeCategory generalCategory,
        StrongBidiCategory strongBidiCategory,
        ushort offsetToSimpleUppercase,
        ushort offsetToSimpleLowercase,
        ushort offsetToSimpleTitlecase,
        ushort offsetToSimpleCasefold,
        bool isWhitespace) _data;

    /// <summary>
    /// Captures the category, bidi, casing and white-space data of <paramref name="codePoint"/>.
    /// </summary>
    /// <param name="codePoint">The code point whose properties are recorded.</param>
    public CategoryCasingInfo(CodePoint codePoint)
    {
        UnicodeCategory category = codePoint.GeneralCategory;
        StrongBidiCategory strongBidi = ReduceToStrongBidiCategory(codePoint.BidiClass);

        // Casing offsets stay at zero unless the tool was asked to emit casing data.
        ushort toUpper = 0;
        ushort toLower = 0;
        ushort toTitle = 0;
        ushort toFold = 0;

        if (Program.IncludeCasingData)
        {
            // Each offset is (mapping - code point), truncated to its low 16 bits.
            int origin = codePoint.Value;
            toUpper = (ushort)(codePoint.SimpleUppercaseMapping - origin);
            toLower = (ushort)(codePoint.SimpleLowercaseMapping - origin);
            toTitle = (ushort)(codePoint.SimpleTitlecaseMapping - origin);
            toFold = (ushort)(codePoint.SimpleCaseFoldMapping - origin);
        }

        bool whitespace = codePoint.Flags.HasFlag(CodePointFlags.White_Space);

        _data = (category, strongBidi, toUpper, toLower, toTitle, toFold, whitespace);
    }

    /// <summary>
    /// Collapses a full bidi class into the three-way strong category:
    /// L is strong left-to-right, R and AL are strong right-to-left,
    /// and every other class is "other".
    /// </summary>
    private static StrongBidiCategory ReduceToStrongBidiCategory(BidiClass bidiClass)
    {
        if (bidiClass == BidiClass.Left_To_Right)
        {
            return StrongBidiCategory.StrongLeftToRight;
        }

        if (bidiClass == BidiClass.Right_To_Left || bidiClass == BidiClass.Arabic_Letter)
        {
            return StrongBidiCategory.StrongRightToLeft;
        }

        return StrongBidiCategory.Other;
    }

    /// <summary>
    /// Serializes a 16-bit offset as two bytes, least significant byte first.
    /// </summary>
    private static byte[] EncodeOffset(ushort offset)
    {
        byte[] buffer = new byte[sizeof(ushort)];
        BinaryPrimitives.WriteUInt16LittleEndian(buffer, offset);
        return buffer;
    }

    /// <inheritdoc/>
    public override bool Equals(object obj) => obj is CategoryCasingInfo other && Equals(other);

    /// <summary>
    /// Returns true when <paramref name="other"/> is non-null and holds identical data.
    /// </summary>
    public bool Equals(CategoryCasingInfo other)
    {
        if (other is null)
        {
            return false;
        }

        return _data.Equals(other._data);
    }

    /// <inheritdoc/>
    public override int GetHashCode() => _data.GetHashCode();

    /// <summary>
    /// Packs the white-space flag, strong bidi category and general category of
    /// <paramref name="input"/> into a single byte.
    /// </summary>
    /// <returns>A one-element array containing the packed byte.</returns>
    public static byte[] ToCategoryBytes(CategoryCasingInfo input)
    {
        // Layout of the packed byte:
        //   bit 7 (high bit) = isWhitespace?
        //   bits 6..5        = restricted bidi class
        //   bits 4..0        = Unicode category
        int whitespaceBit = input._data.isWhitespace ? 1 : 0;
        int packed = (whitespaceBit << 7)
            + ((int)input._data.strongBidiCategory << 5)
            + (int)input._data.generalCategory;

        // The checked conversion throws if the combined value does not fit in a byte.
        byte single = checked((byte)packed);
        return new byte[] { single };
    }

    /// <summary>
    /// Returns the simple-uppercase offset of <paramref name="input"/> as two little-endian bytes.
    /// </summary>
    public static byte[] ToUpperBytes(CategoryCasingInfo input) => EncodeOffset(input._data.offsetToSimpleUppercase);

    /// <summary>
    /// Returns the simple-lowercase offset of <paramref name="input"/> as two little-endian bytes.
    /// </summary>
    public static byte[] ToLowerBytes(CategoryCasingInfo input) => EncodeOffset(input._data.offsetToSimpleLowercase);

    /// <summary>
    /// Returns the simple-titlecase offset of <paramref name="input"/> as two little-endian bytes.
    /// </summary>
    public static byte[] ToTitleBytes(CategoryCasingInfo input) => EncodeOffset(input._data.offsetToSimpleTitlecase);

    /// <summary>
    /// Returns the simple case-fold offset of <paramref name="input"/> as two little-endian bytes.
    /// </summary>
    public static byte[] ToCaseFoldBytes(CategoryCasingInfo input) => EncodeOffset(input._data.offsetToSimpleCasefold);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Globalization;
using Xunit;

namespace System.Text.Unicode
{
/// <summary>
/// A single Unicode code point (U+0000..U+10FFFF) together with the properties
/// looked up for it in the parsed Unicode data tables.
/// </summary>
/// <remarks>
/// Two instances are considered equal when their <see cref="Value"/> is the same.
/// </remarks>
public sealed class CodePoint : IEquatable<CodePoint>
{
    /// <summary>
    /// Creates the code point <paramref name="value"/>, filling each property from the
    /// matching table in <paramref name="parsedData"/>. A property whose table has no
    /// entry for the code point keeps its default.
    /// </summary>
    /// <param name="value">The scalar value of the code point, 0..0x10FFFF.</param>
    /// <param name="parsedData">The parsed data tables to consult; asserted to be non-null.</param>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="value"/> is negative or greater than 0x10FFFF.
    /// </exception>
    internal CodePoint(int value, ParsedUnicodeData parsedData)
    {
        // The unsigned comparison rejects negative inputs as well as those past U+10FFFF.
        if ((uint)value > 0x10_FFFF)
        {
            string message = FormattableString.Invariant($"Value U+{(uint)value:X4} is not a valid code point.");
            throw new ArgumentOutOfRangeException(nameof(value), message);
        }

        Assert.NotNull(parsedData);

        Value = value;

        if (parsedData.DerivedBidiClassData.TryGetValue(value, out BidiClass bidiClass))
        {
            BidiClass = bidiClass;
        }

        if (parsedData.PropListData.TryGetValue(value, out CodePointFlags flags))
        {
            Flags = flags;
        }

        // Unless the data says otherwise, a code point is its own lower / title / upper form.
        int lower = value;
        int title = value;
        int upper = value;

        if (parsedData.UnicodeDataData.TryGetValue(value, out UnicodeDataFileEntry entry))
        {
            GeneralCategory = entry.GeneralCategory;
            DecimalDigitValue = entry.DecimalDigitValue;
            DigitValue = entry.DigitValue;
            Name = entry.Name;
            NumericValue = entry.NumericValue;

            lower = entry.SimpleLowercaseMapping;
            title = entry.SimpleTitlecaseMapping;
            upper = entry.SimpleUppercaseMapping;
        }

        SimpleLowercaseMapping = lower;
        SimpleTitlecaseMapping = title;
        SimpleUppercaseMapping = upper;

        // Likewise, a code point with no case folding entry folds to itself.
        if (!parsedData.CaseFoldingData.TryGetValue(value, out int foldsTo))
        {
            foldsTo = value;
        }

        SimpleCaseFoldMapping = foldsTo;

        // A derived name, when present, takes precedence over any name set above.
        if (parsedData.DerivedNameData.TryGetValue(value, out string preferredName))
        {
            Name = preferredName;
        }

        // Last of all, pick up the grapheme cluster break value.
        if (parsedData.GraphemeBreakPropertyData.TryGetValue(value, out GraphemeClusterBreakProperty graphemeProperty))
        {
            GraphemeClusterBreakProperty = graphemeProperty;
        }
    }

    /// <summary>
    /// This code point's bidi class. Unassigned code points may still carry
    /// a non-default bidi class.
    /// </summary>
    /// <remarks>
    /// See https://www.unicode.org/reports/tr44/#Bidi_Class.
    /// </remarks>
    public BidiClass BidiClass { get; } = BidiClass.Left_To_Right; // falls back to "L" (strong left-to-right)

    /// <summary>
    /// This code point's decimal digit value (0..9), or -1 when it has none.
    /// </summary>
    /// <remarks>
    /// See https://www.unicode.org/reports/tr44/#Numeric_Value, field (6).
    /// </remarks>
    public int DecimalDigitValue { get; } = -1; // falls back to "not a decimal digit"

    /// <summary>
    /// This code point's digit value (0..9), or -1 when it has none.
    /// </summary>
    /// <remarks>
    /// See https://www.unicode.org/reports/tr44/#Numeric_Value, field (7).
    /// </remarks>
    public int DigitValue { get; } = -1; // falls back to "not a digit"

    /// <summary>
    /// The flags attached to this code point (for example "is white space?").
    /// </summary>
    /// <remarks>
    /// See https://www.unicode.org/reports/tr44/#PropList.txt.
    /// </remarks>
    public CodePointFlags Flags { get; } = default; // falls back to "no flags"

    /// <summary>
    /// This code point's general Unicode category.
    /// </summary>
    /// <remarks>
    /// See https://www.unicode.org/reports/tr44/#UnicodeData.txt.
    /// </remarks>
    public UnicodeCategory GeneralCategory { get; } = UnicodeCategory.OtherNotAssigned; // falls back to "Unassigned"

    /// <summary>
    /// This code point's grapheme cluster break property.
    /// </summary>
    /// <remarks>
    /// See https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values.
    /// </remarks>
    public GraphemeClusterBreakProperty GraphemeClusterBreakProperty { get; } = GraphemeClusterBreakProperty.Other; // falls back to "Other"

    /// <summary>
    /// This code point's name.
    /// </summary>
    /// <remarks>
    /// See https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedName.txt.
    /// </remarks>
    public string Name { get; } = "<Unassigned>";

    /// <summary>
    /// This code point's numeric value, or -1 when it has none.
    /// </summary>
    /// <remarks>
    /// See https://www.unicode.org/reports/tr44/#Numeric_Value, field (8).
    /// </remarks>
    public double NumericValue { get; } = -1; // falls back to "not a numeric value"

    /// <summary>
    /// The code point produced by applying the simple case fold mapping to this code point.
    /// </summary>
    /// <remarks>
    /// See https://www.unicode.org/reports/tr44/#CaseFolding.txt
    /// and https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt.
    /// </remarks>
    public int SimpleCaseFoldMapping { get; }

    /// <summary>
    /// The code point produced by applying the simple lowercase mapping to this code point.
    /// </summary>
    /// <remarks>
    /// See https://www.unicode.org/reports/tr44/#Simple_Lowercase_Mapping.
    /// </remarks>
    public int SimpleLowercaseMapping { get; }

    /// <summary>
    /// The code point produced by applying the simple titlecase mapping to this code point.
    /// </summary>
    /// <remarks>
    /// See https://www.unicode.org/reports/tr44/#Simple_Titlecase_Mapping.
    /// </remarks>
    public int SimpleTitlecaseMapping { get; }

    /// <summary>
    /// The code point produced by applying the simple uppercase mapping to this code point.
    /// </summary>
    /// <remarks>
    /// See https://www.unicode.org/reports/tr44/#Simple_Uppercase_Mapping.
    /// </remarks>
    public int SimpleUppercaseMapping { get; }

    /// <summary>
    /// The scalar value (0000..10FFFF) of this code point.
    /// </summary>
    public int Value { get; }

    /// <inheritdoc/>
    public override bool Equals(object obj) => Equals(obj as CodePoint);

    /// <summary>
    /// Returns true when <paramref name="obj"/> is non-null and has the same <see cref="Value"/>.
    /// </summary>
    public bool Equals(CodePoint obj) => !(obj is null) && Value == obj.Value;

    /// <summary>
    /// Uses the code point value itself as the hash code.
    /// </summary>
    public override int GetHashCode() => Value;

    /// <summary>
    /// Formats this code point as "U+XXXX Name", with at least four uppercase hex digits.
    /// </summary>
    public override string ToString() => FormattableString.Invariant($"U+{(uint)Value:X4} {Name}");
}
}
Loading