From c9c9180ef66d9dca092be899b3f33c4cab708739 Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed Date: Thu, 31 Oct 2024 15:41:50 -0700 Subject: [PATCH 1/3] Add the governance file cgmanifest.json for tokenizer's vocab files --- THIRD-PARTY-NOTICES.TXT | 4 +- cgmanifest.json | 49 +++++++++++++++++++ .../Microsoft.ML.Tokenizers.Data.Gpt2.csproj | 3 +- .../Model/TiktokenTokenizer.cs | 2 +- .../TokenizerDataTests.cs | 2 +- .../TiktokenTests.cs | 2 +- 6 files changed, 56 insertions(+), 6 deletions(-) create mode 100644 cgmanifest.json diff --git a/THIRD-PARTY-NOTICES.TXT b/THIRD-PARTY-NOTICES.TXT index 52364713d4..3bc1463084 100644 --- a/THIRD-PARTY-NOTICES.TXT +++ b/THIRD-PARTY-NOTICES.TXT @@ -86,8 +86,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -License notice for OpenAI Tiktoken Tokenizer --------------------------------------------- +License notice for OpenAI Tiktoken Tokenizer & Tokenizer's vocab files +---------------------------------------------------------------------- https://github.com/openai/tiktoken/blob/main/LICENSE diff --git a/cgmanifest.json b/cgmanifest.json new file mode 100644 index 0000000000..4d2c818bd6 --- /dev/null +++ b/cgmanifest.json @@ -0,0 +1,49 @@ +{ + "$schema": "https://json.schemastore.org/component-detection-manifest.json", + "Registrations": [ + { + "Component": { + "Type": "other", + "other": { + "name": "cl100k_base.tiktoken", + "downloadUrl": "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken", + "hash": "sha1:6494e42d5aad2bbb441ea9793af9e7db335c8d9c" + } + }, + "DevelopmentDependency": false + }, + { + "Component": { + "Type": "other", + "other": { + "name": "o200k_base.tiktoken", + "downloadUrl": "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken", + "hash": "sha1:1d4fdeb17c52829ead47ac65e61197fd530b1c31" + } + }, + "DevelopmentDependency": false + }, + { + "Component": { + "Type": "other", + "other": { + "name": "p50k_base.tiktoken", + "downloadUrl": "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken", + "hash": "sha1:0ecf4ae6d454e7719bcf35f284eac0b73f37e3c9" + } + }, + "DevelopmentDependency": false + }, + { + "Component": { + "Type": "other", + "other": { + "name": "r50k_base.tiktoken", + "downloadUrl": "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken", + "hash": "sha1:5674ba48e48e76284eb747c896a291dc5583c808" + } + }, + "DevelopmentDependency": false + } + ] +} \ No newline at end of file diff --git a/src/Microsoft.ML.Tokenizers.Data.Gpt2/Microsoft.ML.Tokenizers.Data.Gpt2.csproj b/src/Microsoft.ML.Tokenizers.Data.Gpt2/Microsoft.ML.Tokenizers.Data.Gpt2.csproj index 15799111ee..66c89a06c1 100644 --- a/src/Microsoft.ML.Tokenizers.Data.Gpt2/Microsoft.ML.Tokenizers.Data.Gpt2.csproj +++ b/src/Microsoft.ML.Tokenizers.Data.Gpt2/Microsoft.ML.Tokenizers.Data.Gpt2.csproj @@ -11,10 +11,11 @@