From af1ed5d4b150473f9e50f3abdcd7eaa473c0acc3 Mon Sep 17 00:00:00 2001 From: Krzysztof Cwalina Date: Mon, 14 Nov 2016 17:32:28 -0800 Subject: [PATCH] Started to refactor Utf8 encoder --- .../System/Text/Utf8/Utf8Encoder.cs | 56 +++++++++++++++++ .../System/Text/Utf8/Utf8String.cs | 63 +++---------------- 2 files changed, 65 insertions(+), 54 deletions(-) diff --git a/src/System.Text.Utf8/System/Text/Utf8/Utf8Encoder.cs b/src/System.Text.Utf8/System/Text/Utf8/Utf8Encoder.cs index 3cbc164817e..6a0e015e48c 100644 --- a/src/System.Text.Utf8/System/Text/Utf8/Utf8Encoder.cs +++ b/src/System.Text.Utf8/System/Text/Utf8/Utf8Encoder.cs @@ -2,6 +2,7 @@ // Licensed under the MIT license. See LICENSE file in the project root for full license information. using System.Runtime.CompilerServices; +using System.Text.Utf16; namespace System.Text.Utf8 { @@ -243,6 +244,61 @@ public static bool TryEncodeCodePoint(UnicodeCodePoint codePoint, Span buf return false; } } + + // TODO: this routine needs to be optimized. 
+ /* Transcodes utf16source into utf8Destination one code point at a time: the source is reinterpreted through Cast(), each code point is decoded with Utf16LittleEndianEncoder.TryDecodeCodePoint and written with TryEncodeCodePoint. encodedBytes receives the running total of bytes written. Returns true once the whole source has been consumed and false as soon as TryEncodeCodePoint fails (encodedBytes then holds only the bytes written before the failure; presumably the destination was too small - confirm against TryEncodeCodePoint). Throws ArgumentOutOfRangeException when a code point cannot be decoded. */ public static bool TryEncode(ReadOnlySpan utf16source, Span utf8Destination, out int encodedBytes) + { + var utf16Bytes = utf16source.Cast(); + encodedBytes = 0; + for (int i = 0; i < utf16Bytes.Length; /* intentionally no increment */) { /* i indexes utf16Bytes and is advanced by consumedBytes after each successful decode */ + UnicodeCodePoint codePoint; + int consumedBytes; + if (Utf16LittleEndianEncoder.TryDecodeCodePoint(utf16Bytes.Slice(i), out codePoint, out consumedBytes)) { + i += consumedBytes; + int justEncodedBytes; + if (TryEncodeCodePoint(codePoint, utf8Destination, out justEncodedBytes)) { + /* drop the bytes just written so the next code point lands after them */ utf8Destination = utf8Destination.Slice(justEncodedBytes); + encodedBytes += justEncodedBytes; + } + else { + return false; + } + } + else { + throw new ArgumentOutOfRangeException(nameof(utf16source)); + } + } + return true; + } + + /* Returns the sum of the byte counts reported by TryEncodeCodePoint for every code point decoded from utf16source; the encoded output itself is discarded. Throws ArgumentOutOfRangeException when a code point cannot be decoded and NotImplementedException when TryEncodeCodePoint returns false. */ public static int ComputeEncodedBytes(ReadOnlySpan utf16source) + { + Span utf8Destination; + unsafe { /* 32-byte scratch buffer; utf8Destination is never sliced in this method, so every code point is encoded at the start of the same buffer and only the counts are kept */ + byte* buffer = stackalloc byte[32]; + utf8Destination = new Span(buffer, 32); + } + var utf16Bytes = utf16source.Cast(); + int encodedBytes = 0; + for (int i = 0; i < utf16Bytes.Length; /* intentionally no increment */) { + UnicodeCodePoint codePoint; + int consumedBytes; + if (Utf16LittleEndianEncoder.TryDecodeCodePoint(utf16Bytes.Slice(i), out codePoint, out consumedBytes)) { + i += consumedBytes; + int justEncodedBytes; + if (TryEncodeCodePoint(codePoint, utf8Destination, out justEncodedBytes)) { + encodedBytes += justEncodedBytes; + } + else { + throw new NotImplementedException("this should resize the buffer"); + } + } + else { + throw new ArgumentOutOfRangeException(nameof(utf16source)); + } + } + return encodedBytes; + } #endregion } } diff --git a/src/System.Text.Utf8/System/Text/Utf8/Utf8String.cs b/src/System.Text.Utf8/System/Text/Utf8/Utf8String.cs index 6882e8f5f35..b17d3f583c2 100644 --- a/src/System.Text.Utf8/System/Text/Utf8/Utf8String.cs +++ b/src/System.Text.Utf8/System/Text/Utf8/Utf8String.cs @@ -607,64 +607,19 @@ private static int GetUtf8LengthInBytes(IEnumerable codePoints } // TODO: This should 
return Utf16CodeUnits which should wrap byte[]/Span, same for other encoders - private static byte[] GetUtf8BytesFromString(string s) + /* Converts str into a newly allocated byte array: Utf8Encoder.ComputeEncodedBytes determines the array length, then Utf8Encoder.TryEncode fills it. Throws Exception if TryEncode returns false, which the code below treats as something that should not happen once the array has been sized by ComputeEncodedBytes. */ private static byte[] GetUtf8BytesFromString(string str) { - int len = 0; - for (int i = 0; i < s.Length; /* intentionally no increment */) - { - UnicodeCodePoint codePoint; - int encodedChars; - if (!Utf16LittleEndianEncoder.TryDecodeCodePointFromString(s, i, out codePoint, out encodedChars)) - { - throw new ArgumentException("s", "Invalid surrogate pair in the string."); - } + ReadOnlySpan characters = str.Slice(); + int utf8Length = Utf8Encoder.ComputeEncodedBytes(characters); - if (encodedChars <= 0) - { - // TODO: Fix exception type - throw new Exception("internal error"); - } - - int encodedBytes = Utf8Encoder.GetNumberOfEncodedBytes(codePoint); - if (encodedBytes == 0) - { - // TODO: Fix exception type - throw new Exception("Internal error: Utf16Decoder somehow got CodePoint out of range"); - } - len += encodedBytes; - - i += encodedChars; - } + byte[] utf8Buffer = new byte[utf8Length]; - byte[] bytes = new byte[len]; - - var p = new Span(bytes); - for (int i = 0; i < s.Length; /* intentionally no increment */) - { - UnicodeCodePoint codePoint; - int encodedChars; - if (Utf16LittleEndianEncoder.TryDecodeCodePointFromString(s, i, out codePoint, out encodedChars)) - { - i += encodedChars; - int encodedBytes; - if (Utf8Encoder.TryEncodeCodePoint(codePoint, p, out encodedBytes)) - { - p = p.Slice(encodedBytes); - } - else - { - // TODO: Fix exception type - throw new Exception("Internal error: Utf16Decoder somehow got CodePoint out of range or the buffer is too small"); - } - } - else - { - // TODO: Fix exception type - throw new Exception("Internal error: we did pre-validation of the string, nothing should go wrong"); - } + int encodedBytes; + if(!Utf8Encoder.TryEncode(characters, utf8Buffer, out encodedBytes)) { + throw new Exception(); // this should not happen } - - return bytes; + + return utf8Buffer; } public Utf8String 
TrimStart()