Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 1 addition & 11 deletions src/Microsoft.ML.Tokenizers/PreTokenizer/Roberta.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

using System;
using System.Collections.Generic;
using System.Text.RegularExpressions;

namespace Microsoft.ML.Tokenizers
{
Expand All @@ -18,15 +17,6 @@ public sealed partial class RobertaPreTokenizer : PreTokenizer
/// </summary>
public static RobertaPreTokenizer Instance { get; } = new RobertaPreTokenizer();

private const string PretokenizePattern = @"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+";
#if NET7_0_OR_GREATER
[GeneratedRegex(PretokenizePattern)]
private static partial Regex PretokenizeRegex();
#else
private static readonly Regex _regex = new Regex(PretokenizePattern, RegexOptions.Compiled);
private static Regex PretokenizeRegex() => _regex;
#endif

/// <summary>
/// Splits the given string into multiple substrings at word boundaries, keeping track of the offsets of those substrings within the original string.
/// </summary>
Expand All @@ -40,7 +30,7 @@ public override IEnumerable<Split> PreTokenize(string sentence, bool skipSpecial
return Array.Empty<Split>();
}

return SplitSentence(sentence, PretokenizeRegex());
return SplitSentence(sentence, Tokenizer.P50kBaseRegex());
}
}
}
2 changes: 1 addition & 1 deletion src/Microsoft.ML.Tokenizers/PreTokenizer/Whitespace.cs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ public sealed partial class WhiteSpace : PreTokenizer
/// </summary>
public static WhiteSpace Instance { get; } = new WhiteSpace();

private const string PretokenizePattern = @"\w+|[^\w\s]+";
private const string PretokenizePattern = /*lang=regex*/ @"\w+|[^\w\s]+";
#if NET7_0_OR_GREATER
[GeneratedRegex(PretokenizePattern)]
private static partial Regex PretokenizeRegex();
Expand Down
10 changes: 6 additions & 4 deletions src/Microsoft.ML.Tokenizers/Tokenizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -384,8 +384,10 @@ public static Task<Tokenizer> CreateByModelNameAsync(
}
}

private const string Cl100kBaseRegexPattern = @"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+";
private const string P50kBaseRegexPattern = @"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+";
// Regex patterns based on https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py

private const string Cl100kBaseRegexPattern = /*lang=regex*/ @"'(?i:[sdmt]|re|ve|ll)|(?>[^\r\n\p{L}\p{N}]?)\p{L}+|\p{N}{1,3}| ?(?>[^\s\p{L}\p{N}]+)[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+";
private const string P50kBaseRegexPattern = /*lang=regex*/ @"'(?:[sdmt]|re|ve|ll)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+";

private const string Cl100kBaseVocabUrl = @"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken";
private const string P50RanksUrl = @"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken";
Expand All @@ -397,13 +399,13 @@ public static Task<Tokenizer> CreateByModelNameAsync(
private static partial Regex Cl100kBaseRegex();

[GeneratedRegex(P50kBaseRegexPattern)]
private static partial Regex P50kBaseRegex();
internal static partial Regex P50kBaseRegex();
#else
private static Regex? _cl100kBaseRegex;
private static Regex Cl100kBaseRegex() => _cl100kBaseRegex ??= new Regex(Cl100kBaseRegexPattern, RegexOptions.Compiled);

private static Regex? _p50kBaseRegex;
private static Regex P50kBaseRegex() => _p50kBaseRegex ??= new Regex(P50kBaseRegexPattern, RegexOptions.Compiled);
internal static Regex P50kBaseRegex() => _p50kBaseRegex ??= new Regex(P50kBaseRegexPattern, RegexOptions.Compiled);
#endif

/// <summary>
Expand Down