diff --git a/src/Microsoft.ML.Tokenizers/PreTokenizer/Roberta.cs b/src/Microsoft.ML.Tokenizers/PreTokenizer/Roberta.cs
index 8fd748d838..a73845e440 100644
--- a/src/Microsoft.ML.Tokenizers/PreTokenizer/Roberta.cs
+++ b/src/Microsoft.ML.Tokenizers/PreTokenizer/Roberta.cs
@@ -4,7 +4,6 @@
using System;
using System.Collections.Generic;
-using System.Text.RegularExpressions;
namespace Microsoft.ML.Tokenizers
{
@@ -18,15 +17,6 @@ public sealed partial class RobertaPreTokenizer : PreTokenizer
///
public static RobertaPreTokenizer Instance { get; } = new RobertaPreTokenizer();
- private const string PretokenizePattern = @"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+";
-#if NET7_0_OR_GREATER
- [GeneratedRegex(PretokenizePattern)]
- private static partial Regex PretokenizeRegex();
-#else
- private static readonly Regex _regex = new Regex(PretokenizePattern, RegexOptions.Compiled);
- private static Regex PretokenizeRegex() => _regex;
-#endif
-
/// <summary>
/// Splits the given string in multiple substrings at the word boundary, keeping track of the offsets of said substrings from the original string.
/// </summary>
@@ -40,7 +30,7 @@ public override IEnumerable PreTokenize(string sentence, bool skipSpecial
return Array.Empty();
}
- return SplitSentence(sentence, PretokenizeRegex());
+ return SplitSentence(sentence, Tokenizer.P50kBaseRegex());
}
}
}
diff --git a/src/Microsoft.ML.Tokenizers/PreTokenizer/Whitespace.cs b/src/Microsoft.ML.Tokenizers/PreTokenizer/Whitespace.cs
index 2a53bec814..2d65aa1c57 100644
--- a/src/Microsoft.ML.Tokenizers/PreTokenizer/Whitespace.cs
+++ b/src/Microsoft.ML.Tokenizers/PreTokenizer/Whitespace.cs
@@ -19,7 +19,7 @@ public sealed partial class WhiteSpace : PreTokenizer
///
public static WhiteSpace Instance { get; } = new WhiteSpace();
- private const string PretokenizePattern = @"\w+|[^\w\s]+";
+ private const string PretokenizePattern = /*lang=regex*/ @"\w+|[^\w\s]+";
#if NET7_0_OR_GREATER
[GeneratedRegex(PretokenizePattern)]
private static partial Regex PretokenizeRegex();
diff --git a/src/Microsoft.ML.Tokenizers/Tokenizer.cs b/src/Microsoft.ML.Tokenizers/Tokenizer.cs
index fb4a5857be..46a419083f 100644
--- a/src/Microsoft.ML.Tokenizers/Tokenizer.cs
+++ b/src/Microsoft.ML.Tokenizers/Tokenizer.cs
@@ -384,8 +384,10 @@ public static Task CreateByModelNameAsync(
}
}
- private const string Cl100kBaseRegexPattern = @"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+";
- private const string P50kBaseRegexPattern = @"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+";
+ // Regex patterns based on https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py
+
+ private const string Cl100kBaseRegexPattern = /*lang=regex*/ @"'(?i:[sdmt]|re|ve|ll)|(?>[^\r\n\p{L}\p{N}]?)\p{L}+|\p{N}{1,3}| ?(?>[^\s\p{L}\p{N}]+)[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+";
+ private const string P50kBaseRegexPattern = /*lang=regex*/ @"'(?:[sdmt]|re|ve|ll)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+";
private const string Cl100kBaseVocabUrl = @"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken";
private const string P50RanksUrl = @"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken";
@@ -397,13 +399,13 @@ public static Task CreateByModelNameAsync(
private static partial Regex Cl100kBaseRegex();
[GeneratedRegex(P50kBaseRegexPattern)]
- private static partial Regex P50kBaseRegex();
+ internal static partial Regex P50kBaseRegex();
#else
private static Regex? _cl100kBaseRegex;
private static Regex Cl100kBaseRegex() => _cl100kBaseRegex ??= new Regex(Cl100kBaseRegexPattern, RegexOptions.Compiled);
private static Regex? _p50kBaseRegex;
- private static Regex P50kBaseRegex() => _p50kBaseRegex ??= new Regex(P50kBaseRegexPattern, RegexOptions.Compiled);
+ internal static Regex P50kBaseRegex() => _p50kBaseRegex ??= new Regex(P50kBaseRegexPattern, RegexOptions.Compiled);
#endif
///