diff --git a/src/Microsoft.ML.Tokenizers/PreTokenizer/Roberta.cs b/src/Microsoft.ML.Tokenizers/PreTokenizer/Roberta.cs index 8fd748d838..a73845e440 100644 --- a/src/Microsoft.ML.Tokenizers/PreTokenizer/Roberta.cs +++ b/src/Microsoft.ML.Tokenizers/PreTokenizer/Roberta.cs @@ -4,7 +4,6 @@ using System; using System.Collections.Generic; -using System.Text.RegularExpressions; namespace Microsoft.ML.Tokenizers { @@ -18,15 +17,6 @@ public sealed partial class RobertaPreTokenizer : PreTokenizer /// public static RobertaPreTokenizer Instance { get; } = new RobertaPreTokenizer(); - private const string PretokenizePattern = @"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"; -#if NET7_0_OR_GREATER - [GeneratedRegex(PretokenizePattern)] - private static partial Regex PretokenizeRegex(); -#else - private static readonly Regex _regex = new Regex(PretokenizePattern, RegexOptions.Compiled); - private static Regex PretokenizeRegex() => _regex; -#endif - /// /// Splits the given string in multiple substrings at the word boundary, keeping track of the offsets of said substrings from the original string. /// @@ -40,7 +30,7 @@ public override IEnumerable PreTokenize(string sentence, bool skipSpecial return Array.Empty(); } - return SplitSentence(sentence, PretokenizeRegex()); + return SplitSentence(sentence, Tokenizer.P50kBaseRegex()); } } } diff --git a/src/Microsoft.ML.Tokenizers/PreTokenizer/Whitespace.cs b/src/Microsoft.ML.Tokenizers/PreTokenizer/Whitespace.cs index 2a53bec814..2d65aa1c57 100644 --- a/src/Microsoft.ML.Tokenizers/PreTokenizer/Whitespace.cs +++ b/src/Microsoft.ML.Tokenizers/PreTokenizer/Whitespace.cs @@ -19,7 +19,7 @@ public sealed partial class WhiteSpace : PreTokenizer /// public static WhiteSpace Instance { get; } = new WhiteSpace(); - private const string PretokenizePattern = @"\w+|[^\w\s]+"; + private const string PretokenizePattern = /*lang=regex*/ @"\w+|[^\w\s]+"; #if NET7_0_OR_GREATER [GeneratedRegex(PretokenizePattern)] private static partial Regex PretokenizeRegex(); diff --git a/src/Microsoft.ML.Tokenizers/Tokenizer.cs b/src/Microsoft.ML.Tokenizers/Tokenizer.cs index fb4a5857be..46a419083f 100644 --- a/src/Microsoft.ML.Tokenizers/Tokenizer.cs +++ b/src/Microsoft.ML.Tokenizers/Tokenizer.cs @@ -384,8 +384,10 @@ public static Task CreateByModelNameAsync( } } - private const string Cl100kBaseRegexPattern = @"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"; - private const string P50kBaseRegexPattern = @"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"; + // Regex patterns based on https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py + + private const string Cl100kBaseRegexPattern = /*lang=regex*/ @"'(?i:[sdmt]|re|ve|ll)|(?>[^\r\n\p{L}\p{N}]?)\p{L}+|\p{N}{1,3}| ?(?>[^\s\p{L}\p{N}]+)[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"; + private const string P50kBaseRegexPattern = /*lang=regex*/ @"'(?:[sdmt]|re|ve|ll)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"; private const string Cl100kBaseVocabUrl = @"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"; private const string P50RanksUrl = @"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken"; @@ -397,13 +399,13 @@ public static Task CreateByModelNameAsync( private static partial Regex Cl100kBaseRegex(); [GeneratedRegex(P50kBaseRegexPattern)] - private static partial Regex P50kBaseRegex(); + internal static partial Regex P50kBaseRegex(); #else private static Regex? _cl100kBaseRegex; private static Regex Cl100kBaseRegex() => _cl100kBaseRegex ??= new Regex(Cl100kBaseRegexPattern, RegexOptions.Compiled); private static Regex? _p50kBaseRegex; - private static Regex P50kBaseRegex() => _p50kBaseRegex ??= new Regex(P50kBaseRegexPattern, RegexOptions.Compiled); + internal static Regex P50kBaseRegex() => _p50kBaseRegex ??= new Regex(P50kBaseRegexPattern, RegexOptions.Compiled); #endif ///