Started to refactor Utf8 encoder

dotnet · Nov 15, 2016 · af1ed5d · af1ed5d
1 parent d0b33e6
commit af1ed5d
Show file tree

Hide file tree

Showing 2 changed files with 65 additions and 54 deletions.
diff --git a/src/System.Text.Utf8/System/Text/Utf8/Utf8Encoder.cs b/src/System.Text.Utf8/System/Text/Utf8/Utf8Encoder.cs
@@ -2,6 +2,7 @@
 // Licensed under the MIT license. See LICENSE file in the project root for full license information.
 
 using System.Runtime.CompilerServices;
+using System.Text.Utf16;
 
 namespace System.Text.Utf8
 {
@@ -243,6 +244,61 @@ public static bool TryEncodeCodePoint(UnicodeCodePoint codePoint, Span<byte> buf
                     return false;
             }
         }
+
+        // TODO: this routine needs to be optimized.
+        /// <summary>Decodes <paramref name="utf16source"/> one code point at a time and writes each one to <paramref name="utf8Destination"/> via TryEncodeCodePoint.</summary>
+        /// <returns>false as soon as TryEncodeCodePoint fails (<paramref name="encodedBytes"/> then holds the bytes written so far); true once all input is consumed.</returns>
+        /// <exception cref="ArgumentOutOfRangeException">Utf16LittleEndianEncoder.TryDecodeCodePoint could not decode a code point from the source.</exception>
+        public static bool TryEncode(ReadOnlySpan<char> utf16source, Span<byte> utf8Destination, out int encodedBytes)
+        {
+            var sourceBytes = utf16source.Cast<char, byte>();
+            encodedBytes = 0;
+            int offset = 0; // byte offset into sourceBytes, advanced by whatever the decoder consumed
+            while (offset < sourceBytes.Length) {
+                UnicodeCodePoint scalar;
+                int consumed;
+                if (!Utf16LittleEndianEncoder.TryDecodeCodePoint(sourceBytes.Slice(offset), out scalar, out consumed)) {
+                    throw new ArgumentOutOfRangeException(nameof(utf16source));
+                }
+                offset += consumed;
+                int written;
+                if (!TryEncodeCodePoint(scalar, utf8Destination, out written)) {
+                    return false;
+                }
+                utf8Destination = utf8Destination.Slice(written);
+                encodedBytes += written;
+            }
+            return true;
+        }
+
+        /// <summary>
+        /// Computes how many bytes are needed to store <paramref name="utf16source"/> as UTF-8, without encoding it.
+        /// </summary>
+        /// <param name="utf16source">UTF-16 text to measure.</param>
+        /// <returns>Sum of GetNumberOfEncodedBytes over every code point decoded from the source (0 for empty input).</returns>
+        /// <exception cref="ArgumentOutOfRangeException">A code point could not be decoded, or its encoded length was reported as 0.</exception>
+        /// <exception cref="OverflowException">The total does not fit in an <see cref="int"/>.</exception>
+        public static int ComputeEncodedBytes(ReadOnlySpan<char> utf16source)
+        {
+            var utf16Bytes = utf16source.Cast<char, byte>();
+            int encodedBytes = 0;
+            for (int i = 0; i < utf16Bytes.Length; /* intentionally no increment */) {
+                UnicodeCodePoint codePoint;
+                int consumedBytes;
+                if (!Utf16LittleEndianEncoder.TryDecodeCodePoint(utf16Bytes.Slice(i), out codePoint, out consumedBytes)) {
+                    throw new ArgumentOutOfRangeException(nameof(utf16source));
+                }
+                i += consumedBytes;
+                // Only the length is needed, so ask for it directly instead of encoding into a scratch buffer.
+                int codePointBytes = GetNumberOfEncodedBytes(codePoint);
+                if (codePointBytes == 0) {
+                    throw new ArgumentOutOfRangeException(nameof(utf16source));
+                }
+                // checked: a wrapped total would size the caller's buffer incorrectly.
+                encodedBytes = checked(encodedBytes + codePointBytes);
+            }
+            return encodedBytes;
+        }
         #endregion
     }
 }
diff --git a/src/System.Text.Utf8/System/Text/Utf8/Utf8String.cs b/src/System.Text.Utf8/System/Text/Utf8/Utf8String.cs
@@ -607,64 +607,19 @@ private static int GetUtf8LengthInBytes(IEnumerable<UnicodeCodePoint> codePoints
         }
 
         // TODO: This should return Utf16CodeUnits which should wrap byte[]/Span<byte>, same for other encoders
-        private static byte[] GetUtf8BytesFromString(string s)
+        private static byte[] GetUtf8BytesFromString(string str)
         {
-            int len = 0;
-            for (int i = 0; i < s.Length; /* intentionally no increment */)
-            {
-                UnicodeCodePoint codePoint;
-                int encodedChars;
-                if (!Utf16LittleEndianEncoder.TryDecodeCodePointFromString(s, i, out codePoint, out encodedChars))
-                {
-                    throw new ArgumentException("s", "Invalid surrogate pair in the string.");
-                }
+            ReadOnlySpan<char> characters = str.Slice();
+            int utf8Length = Utf8Encoder.ComputeEncodedBytes(characters);

-                if (encodedChars <= 0)
-                {
-                    // TODO: Fix exception type
-                    throw new Exception("internal error");
-                }
-
-                int encodedBytes = Utf8Encoder.GetNumberOfEncodedBytes(codePoint);
-                if (encodedBytes == 0)
-                {
-                    // TODO: Fix exception type
-                    throw new Exception("Internal error: Utf16Decoder somehow got CodePoint out of range");
-                }
-                len += encodedBytes;
-
-                i += encodedChars;
-            }
+            byte[] utf8Buffer = new byte[utf8Length];

-            byte[] bytes = new byte[len];
-
-            var p = new Span<byte>(bytes);
-            for (int i = 0; i < s.Length; /* intentionally no increment */)
-            {
-                UnicodeCodePoint codePoint;
-                int encodedChars;
-                if (Utf16LittleEndianEncoder.TryDecodeCodePointFromString(s, i, out codePoint, out encodedChars))
-                {
-                    i += encodedChars;
-                    int encodedBytes;
-                    if (Utf8Encoder.TryEncodeCodePoint(codePoint, p, out encodedBytes))
-                    {
-                        p = p.Slice(encodedBytes);
-                    }
-                    else
-                    {
-                        // TODO: Fix exception type
-                        throw new Exception("Internal error: Utf16Decoder somehow got CodePoint out of range or the buffer is too small");
-                    }
-                }
-                else
-                {
-                    // TODO: Fix exception type
-                    throw new Exception("Internal error: we did pre-validation of the string, nothing should go wrong");
-                }
+            // The buffer was sized by ComputeEncodedBytes, so TryEncode is expected to succeed and fill it exactly;
+            int encodedBytes; // anything else means the sizing pass and the encoding pass disagree (internal bug).
+            if (!Utf8Encoder.TryEncode(characters, utf8Buffer, out encodedBytes) || encodedBytes != utf8Length) {
+                throw new InvalidOperationException("Internal error: UTF-8 encoding did not match the size computed by ComputeEncodedBytes.");
             }
-
-            return bytes;
+            return utf8Buffer;
         }
 
         public Utf8String TrimStart()