dotnet · tarekgh · Feb 23, 2024 · Feb 20, 2024 · Feb 22, 2024 · Feb 22, 2024
diff --git a/src/Microsoft.ML.Tokenizers/PreTokenizer/Roberta.cs b/src/Microsoft.ML.Tokenizers/PreTokenizer/Roberta.cs
@@ -4,7 +4,6 @@
 
 using System;
 using System.Collections.Generic;
-using System.Text.RegularExpressions;
 
 namespace Microsoft.ML.Tokenizers
 {
@@ -18,15 +17,6 @@ public sealed partial class RobertaPreTokenizer : PreTokenizer
         /// </summary>
         public static RobertaPreTokenizer Instance { get; } = new RobertaPreTokenizer();
 
-        private const string PretokenizePattern = @"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+";
-#if NET7_0_OR_GREATER
-        [GeneratedRegex(PretokenizePattern)]
-        private static partial Regex PretokenizeRegex();
-#else
-        private static readonly Regex _regex = new Regex(PretokenizePattern, RegexOptions.Compiled);
-        private static Regex PretokenizeRegex() => _regex;
-#endif
-
         /// <summary>
         /// Splits the given string in multiple substrings at the word boundary, keeping track of the offsets of said substrings from the original string.
         /// </summary>
@@ -40,7 +30,7 @@ public override IEnumerable<Split> PreTokenize(string sentence, bool skipSpecial
                 return Array.Empty<Split>();
             }
 
-            return SplitSentence(sentence, PretokenizeRegex());
+            return SplitSentence(sentence, Tokenizer.P50kBaseRegex());
         }
     }
 }
diff --git a/src/Microsoft.ML.Tokenizers/PreTokenizer/Whitespace.cs b/src/Microsoft.ML.Tokenizers/PreTokenizer/Whitespace.cs
@@ -19,7 +19,7 @@ public sealed partial class WhiteSpace : PreTokenizer
         /// </summary>
         public static WhiteSpace Instance { get; } = new WhiteSpace();
 
-        private const string PretokenizePattern = @"\w+|[^\w\s]+";
+        private const string PretokenizePattern = /*lang=regex*/ @"\w+|[^\w\s]+";
 #if NET7_0_OR_GREATER
         [GeneratedRegex(PretokenizePattern)]
         private static partial Regex PretokenizeRegex();

diff --git a/src/Microsoft.ML.Tokenizers/Tokenizer.cs b/src/Microsoft.ML.Tokenizers/Tokenizer.cs
@@ -384,8 +384,10 @@ public static Task<Tokenizer> CreateByModelNameAsync(
             }
         }
 
-        private const string Cl100kBaseRegexPattern = @"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+";
-        private const string P50kBaseRegexPattern = @"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+";
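+        // Regex patterns are based on https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py; they are rewritten here (shared leading apostrophe, atomic groups) and are intended to match the same text as the upstream patterns.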
+
+        private const string Cl100kBaseRegexPattern = /*lang=regex*/ @"'(?i:[sdmt]|re|ve|ll)|(?>[^\r\n\p{L}\p{N}]?)\p{L}+|\p{N}{1,3}| ?(?>[^\s\p{L}\p{N}]+)[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+";
+        private const string P50kBaseRegexPattern = /*lang=regex*/ @"'(?:[sdmt]|re|ve|ll)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+";
 
         private const string Cl100kBaseVocabUrl = @"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken";
         private const string P50RanksUrl = @"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken";
@@ -397,13 +399,13 @@ public static Task<Tokenizer> CreateByModelNameAsync(
         private static partial Regex Cl100kBaseRegex();
 
         [GeneratedRegex(P50kBaseRegexPattern)]
-        private static partial Regex P50kBaseRegex();
+        internal static partial Regex P50kBaseRegex();
 #else
         private static Regex? _cl100kBaseRegex;
         private static Regex Cl100kBaseRegex() => _cl100kBaseRegex ??= new Regex(Cl100kBaseRegexPattern, RegexOptions.Compiled);
 
         private static Regex? _p50kBaseRegex;
-        private static Regex P50kBaseRegex() => _p50kBaseRegex ??= new Regex(P50kBaseRegexPattern, RegexOptions.Compiled);
+        internal static Regex P50kBaseRegex() => _p50kBaseRegex ??= new Regex(P50kBaseRegexPattern, RegexOptions.Compiled);
 #endif
 
         /// <summary>