Skip to content
This repository has been archived by the owner on Aug 2, 2023. It is now read-only.

Started to refactor Utf8 encoder #979

Merged
merged 1 commit into from
Nov 15, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions src/System.Text.Utf8/System/Text/Utf8/Utf8Encoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// Licensed under the MIT license. See LICENSE file in the project root for full license information.

using System.Runtime.CompilerServices;
using System.Text.Utf16;

namespace System.Text.Utf8
{
Expand Down Expand Up @@ -243,6 +244,61 @@ public static bool TryEncodeCodePoint(UnicodeCodePoint codePoint, Span<byte> buf
return false;
}
}

/// <summary>
/// Transcodes UTF-16 text to UTF-8, one code point at a time, writing the
/// result into <paramref name="utf8Destination"/>.
/// </summary>
/// <param name="utf16source">The UTF-16 characters to transcode.</param>
/// <param name="utf8Destination">The buffer that receives the UTF-8 bytes.</param>
/// <param name="encodedBytes">
/// Running total of bytes written to the destination. When the method returns
/// <c>false</c> it holds the count for the code points written before the failure.
/// </param>
/// <returns>
/// <c>true</c> when every code point was encoded; <c>false</c> as soon as
/// <c>TryEncodeCodePoint</c> reports failure (presumably because the remaining
/// destination is too small - confirm against that method).
/// </returns>
/// <exception cref="ArgumentOutOfRangeException">
/// The UTF-16 decoder could not decode a code point from the remaining input.
/// </exception>
// TODO: this routine needs to be optimized.
public static bool TryEncode(ReadOnlySpan<char> utf16source, Span<byte> utf8Destination, out int encodedBytes)
{
    // The UTF-16 decoder works on raw bytes, so view the chars as bytes.
    var sourceBytes = utf16source.Cast<char, byte>();
    encodedBytes = 0;

    int offset = 0;
    while (offset < sourceBytes.Length) {
        UnicodeCodePoint scalar;
        int sourceBytesRead;
        if (!Utf16LittleEndianEncoder.TryDecodeCodePoint(sourceBytes.Slice(offset), out scalar, out sourceBytesRead)) {
            throw new ArgumentOutOfRangeException(nameof(utf16source));
        }
        offset += sourceBytesRead;

        int written;
        if (!TryEncodeCodePoint(scalar, utf8Destination, out written)) {
            return false;
        }

        // Advance the destination window past the bytes just written.
        utf8Destination = utf8Destination.Slice(written);
        encodedBytes += written;
    }

    return true;
}

/// <summary>
/// Computes how many UTF-8 bytes are produced when <paramref name="utf16source"/>
/// is transcoded, without keeping the encoded output.
/// </summary>
/// <param name="utf16source">The UTF-16 characters to measure.</param>
/// <returns>The sum of the byte counts reported for each encoded code point.</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// The UTF-16 decoder could not decode a code point from the remaining input.
/// </exception>
/// <exception cref="NotImplementedException">
/// <c>TryEncodeCodePoint</c> reported failure when writing into the scratch buffer.
/// </exception>
public static int ComputeEncodedBytes(ReadOnlySpan<char> utf16source)
{
    const int scratchSize = 32;

    // Throwaway stack buffer: each code point is encoded into its start and
    // overwritten by the next one; only the reported byte count is kept.
    Span<byte> scratch;
    unsafe {
        byte* stackBuffer = stackalloc byte[scratchSize];
        scratch = new Span<byte>(stackBuffer, scratchSize);
    }

    // The UTF-16 decoder works on raw bytes, so view the chars as bytes.
    var sourceBytes = utf16source.Cast<char, byte>();
    int total = 0;

    int offset = 0;
    while (offset < sourceBytes.Length) {
        UnicodeCodePoint scalar;
        int sourceBytesRead;
        if (!Utf16LittleEndianEncoder.TryDecodeCodePoint(sourceBytes.Slice(offset), out scalar, out sourceBytesRead)) {
            throw new ArgumentOutOfRangeException(nameof(utf16source));
        }
        offset += sourceBytesRead;

        int size;
        if (!TryEncodeCodePoint(scalar, scratch, out size)) {
            // NOTE(review): the scratch buffer is never sliced, so this is presumably
            // only reachable if the code point itself is rejected - confirm.
            throw new NotImplementedException("this should resize the buffer");
        }
        total += size;
    }

    return total;
}
#endregion
}
}
63 changes: 9 additions & 54 deletions src/System.Text.Utf8/System/Text/Utf8/Utf8String.cs
Original file line number Diff line number Diff line change
Expand Up @@ -607,64 +607,19 @@ private static int GetUtf8LengthInBytes(IEnumerable<UnicodeCodePoint> codePoints
}

// TODO: This should return Utf16CodeUnits which should wrap byte[]/Span<byte>, same for other encoders
private static byte[] GetUtf8BytesFromString(string s)
private static byte[] GetUtf8BytesFromString(string str)
{
int len = 0;
for (int i = 0; i < s.Length; /* intentionally no increment */)
{
UnicodeCodePoint codePoint;
int encodedChars;
if (!Utf16LittleEndianEncoder.TryDecodeCodePointFromString(s, i, out codePoint, out encodedChars))
{
throw new ArgumentException("s", "Invalid surrogate pair in the string.");
}
ReadOnlySpan<char> characters = str.Slice();
int utf8Length = Utf8Encoder.ComputeEncodedBytes(characters);

if (encodedChars <= 0)
{
// TODO: Fix exception type
throw new Exception("internal error");
}

int encodedBytes = Utf8Encoder.GetNumberOfEncodedBytes(codePoint);
if (encodedBytes == 0)
{
// TODO: Fix exception type
throw new Exception("Internal error: Utf16Decoder somehow got CodePoint out of range");
}
len += encodedBytes;

i += encodedChars;
}
byte[] utf8Buffer = new byte[utf8Length];

byte[] bytes = new byte[len];

var p = new Span<byte>(bytes);
for (int i = 0; i < s.Length; /* intentionally no increment */)
{
UnicodeCodePoint codePoint;
int encodedChars;
if (Utf16LittleEndianEncoder.TryDecodeCodePointFromString(s, i, out codePoint, out encodedChars))
{
i += encodedChars;
int encodedBytes;
if (Utf8Encoder.TryEncodeCodePoint(codePoint, p, out encodedBytes))
{
p = p.Slice(encodedBytes);
}
else
{
// TODO: Fix exception type
throw new Exception("Internal error: Utf16Decoder somehow got CodePoint out of range or the buffer is too small");
}
}
else
{
// TODO: Fix exception type
throw new Exception("Internal error: we did pre-validation of the string, nothing should go wrong");
}
int encodedBytes;
if(!Utf8Encoder.TryEncode(characters, utf8Buffer, out encodedBytes)) {
throw new Exception(); // this should not happen
}

return bytes;
return utf8Buffer;
}

public Utf8String TrimStart()
Expand Down