Skip to content
This repository has been archived by the owner on Aug 2, 2023. It is now read-only.

Commit

Permalink
Started to refactor Utf8 encoder
Browse files Browse the repository at this point in the history
  • Loading branch information
KrzysztofCwalina committed Nov 15, 2016
1 parent d0b33e6 commit af1ed5d
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 54 deletions.
56 changes: 56 additions & 0 deletions src/System.Text.Utf8/System/Text/Utf8/Utf8Encoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// Licensed under the MIT license. See LICENSE file in the project root for full license information.

using System.Runtime.CompilerServices;
using System.Text.Utf16;

namespace System.Text.Utf8
{
Expand Down Expand Up @@ -243,6 +244,61 @@ public static bool TryEncodeCodePoint(UnicodeCodePoint codePoint, Span<byte> buf
return false;
}
}

// TODO: this routine needs to be optimized.
/// <summary>
/// Transcodes UTF-16 text into UTF-8, one code point at a time, writing the
/// result into <paramref name="utf8Destination"/>.
/// </summary>
/// <param name="utf16source">The UTF-16 characters to transcode. They are reinterpreted as raw bytes before decoding.</param>
/// <param name="utf8Destination">The buffer that receives the UTF-8 bytes.</param>
/// <param name="encodedBytes">
/// The total number of bytes written to <paramref name="utf8Destination"/>.
/// When the method returns <c>false</c>, this holds the count written before the failing code point.
/// </param>
/// <returns>
/// <c>true</c> when every code point was encoded; <c>false</c> as soon as
/// <c>TryEncodeCodePoint</c> fails (presumably because the destination is too small - confirm against that method).
/// </returns>
/// <exception cref="ArgumentOutOfRangeException">
/// A code point could not be decoded from <paramref name="utf16source"/>.
/// </exception>
public static bool TryEncode(ReadOnlySpan<char> utf16source, Span<byte> utf8Destination, out int encodedBytes)
{
    encodedBytes = 0;

    // The decoder works on bytes, so view the characters as a byte sequence.
    var sourceBytes = utf16source.Cast<char, byte>();

    int offset = 0;
    while (offset < sourceBytes.Length) {
        UnicodeCodePoint decoded;
        int bytesRead;
        if (!Utf16LittleEndianEncoder.TryDecodeCodePoint(sourceBytes.Slice(offset), out decoded, out bytesRead)) {
            throw new ArgumentOutOfRangeException(nameof(utf16source));
        }

        // Advance by however many source bytes the decoder reported consuming.
        offset += bytesRead;

        int bytesWritten;
        if (!TryEncodeCodePoint(decoded, utf8Destination, out bytesWritten)) {
            return false;
        }

        // Move the write window past the bytes that were just produced.
        utf8Destination = utf8Destination.Slice(bytesWritten);
        encodedBytes += bytesWritten;
    }

    return true;
}

/// <summary>
/// Counts how many UTF-8 bytes are produced when <paramref name="utf16source"/>
/// is transcoded, without keeping the encoded output.
/// </summary>
/// <remarks>
/// Every code point is encoded into the same 32-byte stack-allocated scratch
/// buffer (the buffer is never advanced), and only the reported byte count is accumulated.
/// </remarks>
/// <param name="utf16source">The UTF-16 characters to measure. They are reinterpreted as raw bytes before decoding.</param>
/// <returns>The sum of the byte counts reported by <c>TryEncodeCodePoint</c> for each decoded code point.</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// A code point could not be decoded from <paramref name="utf16source"/>.
/// </exception>
/// <exception cref="NotImplementedException">
/// <c>TryEncodeCodePoint</c> failed to encode a code point into the scratch buffer.
/// </exception>
public static int ComputeEncodedBytes(ReadOnlySpan<char> utf16source)
{
    const int ScratchSize = 32;

    Span<byte> scratch;
    unsafe {
        byte* scratchMemory = stackalloc byte[ScratchSize];
        scratch = new Span<byte>(scratchMemory, ScratchSize);
    }

    // The decoder works on bytes, so view the characters as a byte sequence.
    var sourceBytes = utf16source.Cast<char, byte>();

    int total = 0;
    int offset = 0;
    while (offset < sourceBytes.Length) {
        UnicodeCodePoint decoded;
        int bytesRead;
        if (!Utf16LittleEndianEncoder.TryDecodeCodePoint(sourceBytes.Slice(offset), out decoded, out bytesRead)) {
            throw new ArgumentOutOfRangeException(nameof(utf16source));
        }

        // Advance by however many source bytes the decoder reported consuming.
        offset += bytesRead;

        // The scratch buffer is reused from its start each time; only the length matters here.
        int bytesWritten;
        if (!TryEncodeCodePoint(decoded, scratch, out bytesWritten)) {
            throw new NotImplementedException("this should resize the buffer");
        }

        total += bytesWritten;
    }

    return total;
}
#endregion
}
}
63 changes: 9 additions & 54 deletions src/System.Text.Utf8/System/Text/Utf8/Utf8String.cs
Original file line number Diff line number Diff line change
Expand Up @@ -607,64 +607,19 @@ private static int GetUtf8LengthInBytes(IEnumerable<UnicodeCodePoint> codePoints
}

// TODO: This should return Utf16CodeUnits which should wrap byte[]/Span<byte>, same for other encoders
private static byte[] GetUtf8BytesFromString(string s)
private static byte[] GetUtf8BytesFromString(string str)
{
int len = 0;
for (int i = 0; i < s.Length; /* intentionally no increment */)
{
UnicodeCodePoint codePoint;
int encodedChars;
if (!Utf16LittleEndianEncoder.TryDecodeCodePointFromString(s, i, out codePoint, out encodedChars))
{
throw new ArgumentException("s", "Invalid surrogate pair in the string.");
}
ReadOnlySpan<char> characters = str.Slice();
int utf8Length = Utf8Encoder.ComputeEncodedBytes(characters);

if (encodedChars <= 0)
{
// TODO: Fix exception type
throw new Exception("internal error");
}

int encodedBytes = Utf8Encoder.GetNumberOfEncodedBytes(codePoint);
if (encodedBytes == 0)
{
// TODO: Fix exception type
throw new Exception("Internal error: Utf16Decoder somehow got CodePoint out of range");
}
len += encodedBytes;

i += encodedChars;
}
byte[] utf8Buffer = new byte[utf8Length];

byte[] bytes = new byte[len];

var p = new Span<byte>(bytes);
for (int i = 0; i < s.Length; /* intentionally no increment */)
{
UnicodeCodePoint codePoint;
int encodedChars;
if (Utf16LittleEndianEncoder.TryDecodeCodePointFromString(s, i, out codePoint, out encodedChars))
{
i += encodedChars;
int encodedBytes;
if (Utf8Encoder.TryEncodeCodePoint(codePoint, p, out encodedBytes))
{
p = p.Slice(encodedBytes);
}
else
{
// TODO: Fix exception type
throw new Exception("Internal error: Utf16Decoder somehow got CodePoint out of range or the buffer is too small");
}
}
else
{
// TODO: Fix exception type
throw new Exception("Internal error: we did pre-validation of the string, nothing should go wrong");
}
int encodedBytes;
if(!Utf8Encoder.TryEncode(characters, utf8Buffer, out encodedBytes)) {
throw new Exception(); // this should not happen
}

return bytes;
return utf8Buffer;
}

public Utf8String TrimStart()
Expand Down

0 comments on commit af1ed5d

Please sign in to comment.