Skip to content
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
f6e32f5
Fix cache when calling EncodeToIds
tarekgh Feb 17, 2024
0553922
Make EnglishRoberta _mergeRanks thread safe
tarekgh Feb 17, 2024
a4cb1f5
Delete Trainer
tarekgh Feb 19, 2024
6a13025
Remove the setters on the Bpe properties
tarekgh Feb 19, 2024
3278aff
Remove Roberta and Tiktoken special casing in the Tokenizer and suppo…
tarekgh Feb 19, 2024
b5f7fa2
Support text-embedding-3-small/large embedding
tarekgh Feb 19, 2024
a11f4e0
Remove redundant TokenToId abstraction and keep the one with the extr…
tarekgh Feb 19, 2024
865068a
Enable creating Tiktoken asynchronously or directly using the tokeniz…
tarekgh Feb 20, 2024
4077de0
Add cancellationToken support in CreateAsync APIs
tarekgh Feb 21, 2024
5aaf849
Rename sequence to text and Tokenize to Encode
tarekgh Feb 21, 2024
b5e0927
Rename skipSpecialTokens to considerSpecialTokens
tarekgh Feb 21, 2024
5e26b3e
Rename TokenizerResult to EncodingResult
tarekgh Feb 21, 2024
985de8a
Make Token publicly immutable
tarekgh Feb 21, 2024
b551e7d
Change offset tuples from (Index, End) to (Index, Length)
tarekgh Feb 21, 2024
7ea7f70
Rename NormalizedString method's parameters
tarekgh Feb 21, 2024
b0c8244
Rename Model's methods to start with verb
tarekgh Feb 21, 2024
450418a
Convert Model.GetVocab() method to a Vocab property
tarekgh Feb 21, 2024
6f53de8
Some method's parameters and variable renaming
tarekgh Feb 22, 2024
62334c6
Remove Vocab and VocabSize from the abstraction
tarekgh Feb 22, 2024
d48b32d
Cleanup normalization support
tarekgh Feb 22, 2024
191ab03
Minor Bpe cleanup
tarekgh Feb 22, 2024
b9b0f58
Resolve rebase change
tarekgh Feb 23, 2024
1ad157f
Address the feedback
tarekgh Feb 25, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
241 changes: 118 additions & 123 deletions src/Microsoft.ML.Tokenizers/Model/BPE.cs

Large diffs are not rendered by default.

534 changes: 0 additions & 534 deletions src/Microsoft.ML.Tokenizers/Model/BpeTrainer.cs

This file was deleted.

36 changes: 36 additions & 0 deletions src/Microsoft.ML.Tokenizers/Model/Cache.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading;

Expand Down Expand Up @@ -95,5 +96,40 @@ internal void Set(TKey k, TValue v)
}
finally { _cacheLock.ExitWriteLock(); }
}

/// <summary>
/// Produce a point-in-time snapshot of every cached entry.
/// </summary>
/// <returns>An array copy of the cache contents; safe to enumerate without holding the lock.</returns>
internal KeyValuePair<TKey, TValue>[] ToArray()
{
    _cacheLock.EnterReadLock();
    try
    {
        // Copy under the read lock so the snapshot is consistent even while writers are active.
        KeyValuePair<TKey, TValue>[] snapshot = Map.ToArray();
        return snapshot;
    }
    finally
    {
        _cacheLock.ExitReadLock();
    }
}

/// <summary>
/// Return the cached value for <paramref name="key"/> if present; otherwise attempt to
/// insert <paramref name="value"/> and return it.
/// </summary>
/// <param name="key">The lookup key.</param>
/// <param name="value">The value to insert when the key is absent.</param>
/// <returns>
/// The existing cached value when the key is found; otherwise <paramref name="value"/>.
/// Note the insert is skipped (but <paramref name="value"/> is still returned) once the
/// cache has reached <c>Capacity</c>.
/// </returns>
internal TValue GetOrAdd(TKey key, TValue value)
{
    // Upgradeable read lock: lets concurrent plain readers proceed while reserving
    // the right to upgrade to a write lock if we need to insert.
    _cacheLock.EnterUpgradeableReadLock();
    try
    {
        if (Map.TryGetValue(key, out TValue? existing))
        {
            return existing;
        }

        _cacheLock.EnterWriteLock();
        try
        {
            // Only insert while there is room; a full cache silently drops the new entry.
            if (Map.Count < Capacity)
            {
                Map[key] = value;
            }
        }
        finally
        {
            _cacheLock.ExitWriteLock();
        }

        return value;
    }
    finally
    {
        _cacheLock.ExitUpgradeableReadLock();
    }
}
}
}
312 changes: 67 additions & 245 deletions src/Microsoft.ML.Tokenizers/Model/EnglishRoberta.cs

Large diffs are not rendered by default.

81 changes: 41 additions & 40 deletions src/Microsoft.ML.Tokenizers/Model/Model.cs
Original file line number Diff line number Diff line change
Expand Up @@ -14,98 +14,99 @@ namespace Microsoft.ML.Tokenizers
/// <summary>
/// Represents a tokenization model: maps text to tokens/ids and back.
/// </summary>
/// <remarks>
/// NOTE(review): this span was diff residue with old ("Tokenize*") and new ("Encode*")
/// declarations interleaved; this is the reconstructed post-rename version of the class.
/// </remarks>
public abstract class Model
{
    /// <summary>
    /// Encode a split text string to a list of tokens.
    /// </summary>
    /// <param name="text">The text to encode.</param>
    /// <param name="isSpecialToken">Indicate if the token is a special token.</param>
    /// <returns>The list of tokens generated from the text tokenization.</returns>
    public abstract IReadOnlyList<Token> Encode(string text, bool isSpecialToken = false);

    /// <summary>
    /// Encode a split text string to a list of Ids and add them to the accumulatedIds list.
    /// </summary>
    /// <param name="text">The text to split.</param>
    /// <param name="isSpecialToken">Indicate if the token is a special token.</param>
    /// <param name="accumulatedIds">The list of accumulated encoded Ids.</param>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="accumulatedIds"/> is null.</exception>
    /// <remarks>
    /// This method does the default implementation that uses the Encode method to get the token's Ids.
    /// Tokenizer's models which care about performance may choose to override this method to provide a more efficient implementation.
    /// </remarks>
    public virtual void EncodeToIds(string text, bool isSpecialToken, IList<int> accumulatedIds)
    {
        if (accumulatedIds is null)
        {
            throw new ArgumentNullException(nameof(accumulatedIds));
        }

        var tokens = Encode(text);
        foreach (var token in tokens)
        {
            accumulatedIds.Add(token.Id);
        }
    }

    /// <summary>
    /// Get the number of tokens that the input text will be encoded to.
    /// </summary>
    /// <param name="text">The text to encode.</param>
    /// <param name="isSpecialToken">Indicate if the token is special token.</param>
    /// <returns>The number of tokens that the input text will be encoded to.</returns>
    /// <remarks>
    /// This method does the default implementation that uses the EncodeToIds method to get the number of token's Ids.
    /// Tokenizer's models which care about performance may choose to override this method to provide a more efficient implementation.
    /// </remarks>
    public virtual int CountTokens(string text, bool isSpecialToken)
    {
        var ids = new List<int>();
        EncodeToIds(text, isSpecialToken, ids);
        return ids.Count;
    }

    /// <summary>
    /// Map the token to encoded id with the option to skip the special tokens.
    /// </summary>
    /// <param name="token">The token to map to Id</param>
    /// <param name="skipSpecialTokens">Indicate if want to skip the special tokens during the encoding.</param>
    /// <returns>The mapped Id of the token.</returns>
    public abstract int? TokenToId(string token, bool skipSpecialTokens = false);

    /// <summary>
    /// Map the encoded Id to the token.
    /// </summary>
    /// <param name="id">The Id to map to the token.</param>
    /// <param name="skipSpecialTokens">Indicate if want to skip the special tokens during the decoding.</param>
    /// <param name="filterUnsupportedChars">Indicate if want to filter the unsupported characters during the decoding.</param>
    /// <returns>The mapped token of the Id.</returns>
    public abstract string? IdToToken(int id, bool skipSpecialTokens = false, bool filterUnsupportedChars = true);

    /// <summary>
    /// Decode the given ids, back to a String.
    /// </summary>
    /// <param name="ids">The list of ids that we want to decode.</param>
    /// <param name="decoder">The optional Decoder to merge the given list of tokens in a string.</param>
    /// <param name="skipSpecialTokens">Whether the special tokens should be removed from the decoded string.</param>
    /// <param name="filterUnsupportedChars">Indicate if want to filter the unsupported characters during the decoding.</param>
    /// <returns>The decoded string.</returns>
    public virtual string? Decode(IEnumerable<int> ids, TokenizerDecoder? decoder = null, bool skipSpecialTokens = false, bool filterUnsupportedChars = true)
    {
        List<string> tokens = new List<string>();

        foreach (int id in ids)
        {
            // Ids with no token mapping contribute an empty string rather than failing the whole decode.
            tokens.Add(IdToToken(id, skipSpecialTokens, filterUnsupportedChars) ?? "");
        }

        return decoder?.Decode(tokens) ?? string.Join("", tokens);
    }

    /// <summary>
    /// Gets the dictionary mapping tokens to Ids.
    /// </summary>
    public abstract IReadOnlyDictionary<string, int> GetVocab();

    /// <summary>
    /// Gets the dictionary size that map tokens to Ids.
    /// </summary>
    public abstract int GetVocabSize();
}
64 changes: 0 additions & 64 deletions src/Microsoft.ML.Tokenizers/Model/Progress.cs

This file was deleted.

Loading