11## About
22
3- Microsoft.ML.Tokenizers supports various the implementation of the tokenization used in the NLP transforms .
3+ Microsoft.ML.Tokenizers provides an abstraction for tokenizers as well as implementations of common tokenization algorithms.
44
55## Key Features
66
@@ -15,28 +15,28 @@ Microsoft.ML.Tokenizers supports various the implementation of the tokenization
1515
1616``` c#
1717using Microsoft .ML .Tokenizers ;
18- using System .Net .Http ;
1918using System .IO ;
19+ using System .Net .Http ;
2020
2121//
2222// Using Tiktoken Tokenizer
2323//
2424
25- // initialize the tokenizer for `gpt-4 ` model
26- Tokenizer tokenizer = TiktokenTokenizer .CreateForModel (" gpt-4 " );
25+ // Initialize the tokenizer for the `gpt-4o` model. This instance should be cached for all subsequent use.
26+ Tokenizer tokenizer = TiktokenTokenizer .CreateForModel (" gpt-4o " );
2727
2828string source = " Text tokenization is the process of splitting a string into a list of tokens." ;
2929
3030Console .WriteLine ($" Tokens: {tokenizer .CountTokens (source )}" );
31- // print : Tokens: 16
31+ // prints: Tokens: 16
3232
3333var trimIndex = tokenizer .GetIndexByTokenCountFromEnd (source , 5 , out string processedText , out _ );
3434Console .WriteLine ($" 5 tokens from end: {processedText .Substring (trimIndex )}" );
35- // 5 tokens from end: a list of tokens.
35+ // prints: 5 tokens from end: a list of tokens.
3636
3737trimIndex = tokenizer .GetIndexByTokenCount (source , 5 , out processedText , out _ );
3838Console .WriteLine ($" 5 tokens from start: {processedText .Substring (0 , trimIndex )}" );
39- // 5 tokens from start: Text tokenization is the
39+ // prints: 5 tokens from start: Text tokenization is the
4040
4141IReadOnlyList < int > ids = tokenizer .EncodeToIds (source );
4242Console .WriteLine (string .Join (" , " , ids ));
@@ -46,20 +46,21 @@ Console.WriteLine(string.Join(", ", ids));
4646// Using Llama Tokenizer
4747//
4848
49- // Open stream of remote Llama tokenizer model data file
49+ // Open a stream to the remote Llama tokenizer model data file.
5050using HttpClient httpClient = new ();
5151const string modelUrl = @" https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model" ;
5252using Stream remoteStream = await httpClient .GetStreamAsync (modelUrl );
5353
54- // Create the Llama tokenizer using the remote stream
54+ // Create the Llama tokenizer using the remote stream. This instance should be cached for all subsequent use.
5555Tokenizer llamaTokenizer = LlamaTokenizer .Create (remoteStream );
56+
5657string input = " Hello, world!" ;
5758ids = llamaTokenizer .EncodeToIds (input );
5859Console .WriteLine (string .Join (" , " , ids ));
5960// prints: 1, 15043, 29892, 3186, 29991
6061
6162Console .WriteLine ($" Tokens: {llamaTokenizer .CountTokens (input )}" );
62- // print : Tokens: 5
63+ // prints: Tokens: 5
6364```
6465
6566## Main Types
0 commit comments