diff --git a/THIRD-PARTY-NOTICES.TXT b/THIRD-PARTY-NOTICES.TXT index 52364713d4..3bc1463084 100644 --- a/THIRD-PARTY-NOTICES.TXT +++ b/THIRD-PARTY-NOTICES.TXT @@ -86,8 +86,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -License notice for OpenAI Tiktoken Tokenizer --------------------------------------------- +License notice for OpenAI Tiktoken Tokenizer & Tokenizer's vocab files +---------------------------------------------------------------------- https://github.com/openai/tiktoken/blob/main/LICENSE diff --git a/cgmanifest.json b/cgmanifest.json new file mode 100644 index 0000000000..3dc90f550f --- /dev/null +++ b/cgmanifest.json @@ -0,0 +1,54 @@ +{ + "$schema": "https://json.schemastore.org/component-detection-manifest.json", + "version": 1, + "registrations": [ + { + "component": { + "type": "other", + "other": { + "name": "cl100k_base.tiktoken", + "version": "1", + "downloadUrl": "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken", + "hash": "sha1:6494e42d5aad2bbb441ea9793af9e7db335c8d9c" + } + }, + "developmentDependency": false + }, + { + "component": { + "type": "other", + "other": { + "name": "o200k_base.tiktoken", + "version": "1", + "downloadUrl": "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken", + "hash": "sha1:1d4fdeb17c52829ead47ac65e61197fd530b1c31" + } + }, + "developmentDependency": false + }, + { + "component": { + "type": "other", + "other": { + "name": "p50k_base.tiktoken", + "version": "1", + "downloadUrl": "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken", + "hash": "sha1:0ecf4ae6d454e7719bcf35f284eac0b73f37e3c9" + } + }, + "developmentDependency": false + }, + { + "component": { + "type": "other", + "other": { + "name": "r50k_base.tiktoken", + "version": "1", + "downloadUrl": "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken", + "hash": "sha1:5674ba48e48e76284eb747c896a291dc5583c808" + } + }, + "developmentDependency": false + } + ] +} \ No newline at end of file diff --git a/src/Microsoft.ML.Tokenizers.Data.Gpt2/Microsoft.ML.Tokenizers.Data.Gpt2.csproj b/src/Microsoft.ML.Tokenizers.Data.Gpt2/Microsoft.ML.Tokenizers.Data.Gpt2.csproj index 15799111ee..66c89a06c1 100644 --- a/src/Microsoft.ML.Tokenizers.Data.Gpt2/Microsoft.ML.Tokenizers.Data.Gpt2.csproj +++ b/src/Microsoft.ML.Tokenizers.Data.Gpt2/Microsoft.ML.Tokenizers.Data.Gpt2.csproj @@ -11,10 +11,11 @@