Skip to content
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
f6e32f5
Fix cache when calling EncodeToIds
tarekgh Feb 17, 2024
0553922
Make EnglishRoberta _mergeRanks thread safe
tarekgh Feb 17, 2024
a4cb1f5
Delete Trainer
tarekgh Feb 19, 2024
6a13025
Remove the setters on the Bpe properties
tarekgh Feb 19, 2024
3278aff
Remove Roberta and Tiktoken special casing in the Tokenizer and suppo…
tarekgh Feb 19, 2024
b5f7fa2
Support text-embedding-3-small/large embedding
tarekgh Feb 19, 2024
a11f4e0
Remove redundant TokenToId abstraction and keep the one with the extr…
tarekgh Feb 19, 2024
865068a
Enable creating Tiktoken asynchronously or directly using the tokeniz…
tarekgh Feb 20, 2024
4077de0
Add cancellationToken support in CreateAsync APIs
tarekgh Feb 21, 2024
5aaf849
Rename sequence to text and Tokenize to Encode
tarekgh Feb 21, 2024
b5e0927
Rename skipSpecialTokens to considerSpecialTokens
tarekgh Feb 21, 2024
5e26b3e
Rename TokenizerResult to EncodingResult
tarekgh Feb 21, 2024
985de8a
Make Token publicly immutable
tarekgh Feb 21, 2024
b551e7d
Change offset tuples from (Index, End) to (Index, Length)
tarekgh Feb 21, 2024
7ea7f70
Rename NormalizedString method's parameters
tarekgh Feb 21, 2024
b0c8244
Rename Model's methods to start with verb
tarekgh Feb 21, 2024
450418a
Convert Model.GetVocab() method to a Vocab property
tarekgh Feb 21, 2024
6f53de8
Some method's parameters and variable renaming
tarekgh Feb 22, 2024
62334c6
Remove Vocab and VocabSize from the abstraction
tarekgh Feb 22, 2024
d48b32d
Cleanup normalization support
tarekgh Feb 22, 2024
191ab03
Minor Bpe cleanup
tarekgh Feb 22, 2024
b9b0f58
Resolve rebase change
tarekgh Feb 23, 2024
1ad157f
Address the feedback
tarekgh Feb 25, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
241 changes: 118 additions & 123 deletions src/Microsoft.ML.Tokenizers/Model/BPE.cs

Large diffs are not rendered by default.

534 changes: 0 additions & 534 deletions src/Microsoft.ML.Tokenizers/Model/BpeTrainer.cs

This file was deleted.

36 changes: 36 additions & 0 deletions src/Microsoft.ML.Tokenizers/Model/Cache.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading;

Expand Down Expand Up @@ -95,5 +96,40 @@ internal void Set(TKey k, TValue v)
}
finally { _cacheLock.ExitWriteLock(); }
}

/// <summary>
/// Produce a point-in-time snapshot of every cached entry.
/// </summary>
/// <returns>An array copy of the cache contents; safe to enumerate without holding the lock.</returns>
internal KeyValuePair<TKey, TValue>[] ToArray()
{
    _cacheLock.EnterReadLock();
    try
    {
        // Copy under the read lock so the snapshot is consistent even while writers are active.
        KeyValuePair<TKey, TValue>[] snapshot = Map.ToArray();
        return snapshot;
    }
    finally
    {
        _cacheLock.ExitReadLock();
    }
}

/// <summary>
/// Return the cached value for <paramref name="key"/> if present; otherwise attempt to
/// insert <paramref name="value"/> and return it.
/// </summary>
/// <param name="key">The lookup key.</param>
/// <param name="value">The value to insert when the key is absent.</param>
/// <returns>
/// The existing cached value when the key is found; otherwise <paramref name="value"/>.
/// Note the insert is skipped (but <paramref name="value"/> is still returned) once the
/// cache has reached <c>Capacity</c>.
/// </returns>
internal TValue GetOrAdd(TKey key, TValue value)
{
    // Upgradeable read lock: lets concurrent plain readers proceed while reserving
    // the right to upgrade to a write lock if we need to insert.
    _cacheLock.EnterUpgradeableReadLock();
    try
    {
        if (Map.TryGetValue(key, out TValue? existing))
        {
            return existing;
        }

        _cacheLock.EnterWriteLock();
        try
        {
            // Only insert while there is room; a full cache silently drops the new entry.
            if (Map.Count < Capacity)
            {
                Map[key] = value;
            }
        }
        finally
        {
            _cacheLock.ExitWriteLock();
        }

        return value;
    }
    finally
    {
        _cacheLock.ExitUpgradeableReadLock();
    }
}
}
}
312 changes: 67 additions & 245 deletions src/Microsoft.ML.Tokenizers/Model/EnglishRoberta.cs

Large diffs are not rendered by default.

81 changes: 41 additions & 40 deletions src/Microsoft.ML.Tokenizers/Model/Model.cs
Original file line number Diff line number Diff line change
Expand Up @@ -14,98 +14,99 @@ namespace Microsoft.ML.Tokenizers
/// <summary>
/// Represents a tokenization model: maps text to tokens/ids and back.
/// </summary>
/// <remarks>
/// NOTE(review): this span was diff residue with old ("Tokenize*") and new ("Encode*")
/// declarations interleaved; this is the reconstructed post-rename version of the class.
/// </remarks>
public abstract class Model
{
    /// <summary>
    /// Encode a split text string to a list of tokens.
    /// </summary>
    /// <param name="text">The text to encode.</param>
    /// <param name="isSpecialToken">Indicate if the token is a special token.</param>
    /// <returns>The list of tokens generated from the text tokenization.</returns>
    public abstract IReadOnlyList<Token> Encode(string text, bool isSpecialToken = false);

    /// <summary>
    /// Encode a split text string to a list of Ids and add them to the accumulatedIds list.
    /// </summary>
    /// <param name="text">The text to split.</param>
    /// <param name="isSpecialToken">Indicate if the token is a special token.</param>
    /// <param name="accumulatedIds">The list of accumulated encoded Ids.</param>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="accumulatedIds"/> is null.</exception>
    /// <remarks>
    /// This method does the default implementation that uses the Encode method to get the token's Ids.
    /// Tokenizer's models which care about performance may choose to override this method to provide a more efficient implementation.
    /// </remarks>
    public virtual void EncodeToIds(string text, bool isSpecialToken, IList<int> accumulatedIds)
    {
        if (accumulatedIds is null)
        {
            throw new ArgumentNullException(nameof(accumulatedIds));
        }

        var tokens = Encode(text);
        foreach (var token in tokens)
        {
            accumulatedIds.Add(token.Id);
        }
    }

    /// <summary>
    /// Get the number of tokens that the input text will be encoded to.
    /// </summary>
    /// <param name="text">The text to encode.</param>
    /// <param name="isSpecialToken">Indicate if the token is special token.</param>
    /// <returns>The number of tokens that the input text will be encoded to.</returns>
    /// <remarks>
    /// This method does the default implementation that uses the EncodeToIds method to get the number of token's Ids.
    /// Tokenizer's models which care about performance may choose to override this method to provide a more efficient implementation.
    /// </remarks>
    public virtual int CountTokens(string text, bool isSpecialToken)
    {
        var ids = new List<int>();
        EncodeToIds(text, isSpecialToken, ids);
        return ids.Count;
    }

    /// <summary>
    /// Map the token to encoded id with the option to skip the special tokens.
    /// </summary>
    /// <param name="token">The token to map to Id</param>
    /// <param name="skipSpecialTokens">Indicate if want to skip the special tokens during the encoding.</param>
    /// <returns>The mapped Id of the token.</returns>
    public abstract int? TokenToId(string token, bool skipSpecialTokens = false);

    /// <summary>
    /// Map the encoded Id to the token.
    /// </summary>
    /// <param name="id">The Id to map to the token.</param>
    /// <param name="skipSpecialTokens">Indicate if want to skip the special tokens during the decoding.</param>
    /// <param name="filterUnsupportedChars">Indicate if want to filter the unsupported characters during the decoding.</param>
    /// <returns>The mapped token of the Id.</returns>
    public abstract string? IdToToken(int id, bool skipSpecialTokens = false, bool filterUnsupportedChars = true);

    /// <summary>
    /// Decode the given ids, back to a String.
    /// </summary>
    /// <param name="ids">The list of ids that we want to decode.</param>
    /// <param name="decoder">The optional Decoder to merge the given list of tokens in a string.</param>
    /// <param name="skipSpecialTokens">Whether the special tokens should be removed from the decoded string.</param>
    /// <param name="filterUnsupportedChars">Indicate if want to filter the unsupported characters during the decoding.</param>
    /// <returns>The decoded string.</returns>
    public virtual string? Decode(IEnumerable<int> ids, TokenizerDecoder? decoder = null, bool skipSpecialTokens = false, bool filterUnsupportedChars = true)
    {
        List<string> tokens = new List<string>();

        foreach (int id in ids)
        {
            // Ids with no token mapping contribute an empty string rather than failing the whole decode.
            tokens.Add(IdToToken(id, skipSpecialTokens, filterUnsupportedChars) ?? "");
        }

        return decoder?.Decode(tokens) ?? string.Join("", tokens);
    }

    /// <summary>
    /// Gets the dictionary mapping tokens to Ids.
    /// </summary>
    public abstract IReadOnlyDictionary<string, int> GetVocab();

    /// <summary>
    /// Gets the dictionary size that map tokens to Ids.
    /// </summary>
    public abstract int GetVocabSize();
}
64 changes: 0 additions & 64 deletions src/Microsoft.ML.Tokenizers/Model/Progress.cs

This file was deleted.

Loading