@@ -1776,6 +1776,38 @@ def set_vocab(self):
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
 
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
+                for token_id, foken_data in added_tokens_decoder.items():
+                    token_id = int(token_id)
+                    token = foken_data["content"].encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                        assert tokens[token_id] == token
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if foken_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
+        tokenizer_file = self.dir_model / 'tokenizer.json'
+        if tokenizer_file.is_file():
+            with open(tokenizer_file, "r", encoding="utf-8") as f:
+                tokenizer_json = json.load(f)
+                added_tokens = tokenizer_json.get("added_tokens", [])
+                for foken_data in added_tokens:
+                    token_id = int(foken_data["id"])
+                    token = foken_data["content"].encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                        assert tokens[token_id] == token
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if foken_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
         self.gguf_writer.add_tokenizer_model("llama")
         self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
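For context, the added code overrides vocab entries from two Hugging Face tokenizer files: `added_tokens_decoder` in `tokenizer_config.json` (a dict keyed by token id) and `added_tokens` in `tokenizer.json` (a list of entries carrying an `id`). The sketch below is a minimal, self-contained illustration of that override step on toy data; the example token values and the simplified `SentencePieceTokenTypes` enum are assumptions for demonstration, not the converter's actual vocab or gguf's enum.

```python
# Minimal sketch of the added-token override step (illustrative data only).
from enum import IntEnum


class SentencePieceTokenTypes(IntEnum):  # simplified stand-in for gguf's enum
    NORMAL = 1
    UNKNOWN = 2
    CONTROL = 3
    USER_DEFINED = 4


# Pretend base vocab produced by the SentencePiece model (hypothetical).
tokens   = [b"<unk>", b"<s>", b"</s>", b"<unused_3>"]
scores   = [0.0, -1.0, -2.0, 0.0]
toktypes = [
    SentencePieceTokenTypes.UNKNOWN,
    SentencePieceTokenTypes.CONTROL,
    SentencePieceTokenTypes.CONTROL,
    SentencePieceTokenTypes.UNKNOWN,   # unused slot an added token will fill
]

# Shape of "added_tokens_decoder" from tokenizer_config.json (hypothetical values).
added_tokens_decoder = {
    "2": {"content": "</s>", "special": True},       # matches the existing entry
    "3": {"content": "<|user|>", "special": False},  # fills an UNKNOWN slot
}

for token_id, token_data in added_tokens_decoder.items():
    token_id = int(token_id)
    token = token_data["content"].encode("utf-8")
    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
        # A slot that is already a real token must agree with the override.
        assert tokens[token_id] == token
    tokens[token_id] = token
    scores[token_id] = -1000.0  # strongly down-weight added tokens
    toktypes[token_id] = (SentencePieceTokenTypes.CONTROL
                          if token_data.get("special")
                          else SentencePieceTokenTypes.USER_DEFINED)

assert tokens[3] == b"<|user|>"
assert toktypes[2] == SentencePieceTokenTypes.CONTROL
```

The `added_tokens` list from `tokenizer.json` is handled the same way, except each entry's id comes from its `"id"` field rather than the dict key.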