1 file changed: +24 -2 lines changed
Original file line number | Diff line number | Diff line change
@@ -122,8 +122,30 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool:
122122                tokenizer  =  json .load (f )
123123            if  self .load_merges :
124124                merges  =  tokenizer .get ('model' , {}).get ('merges' )
125-                 if  isinstance (merges , list ) and  merges  and  isinstance (merges [0 ], str ):
126-                     self .merges  =  merges 
125+                 if  isinstance (merges , list ) and  merges :
126+                     if  isinstance (merges [0 ], str ):
127+                         self .merges  =  merges 
128+                     elif  isinstance (merges [0 ], list ) and  len (merges [0 ]) ==  2  and  isinstance (merges [0 ][0 ], str ):
129+                         # New format since transformers 4.45 to support spaces in merges 
130+                         # ref: https://github.com/ggerganov/llama.cpp/issues/9692 
131+                         # TODO: internally store as the new format instead of converting to old 
132+                         if  any (' '  in  s  for  pair  in  merges  for  s  in  pair ):
133+                             logger .warning (f'Spaces in merges detected, encoding as { chr (ord (" " ) +  256 )!r}' )
134+                         self .merges  =  [
135+                             ' ' .join (
136+                                 [
137+                                     # ensure the spaces are properly encoded 
138+                                     '' .join (
139+                                         chr (ord (c ) +  256 ) if  c  ==  ' '  else  c 
140+                                         for  c  in  part 
141+                                     )
142+                                     for  part  in  pair 
143+                                 ]
144+                             )
145+                             for  pair  in  merges 
146+                         ]
147+                     else :
148+                         raise  ValueError ("Unknown tokenizer merges format" )
127149            added_tokens  =  tokenizer .get ('added_tokens' , {})
128150        else :
129151            added_tokens  =  {}
 
 
   
 
     
   
   
          
    
    
     
    
      
     
     
    You can’t perform that action at this time.
  
 
    
  
    
      
        
     
       
      
     
   
 
    
    
  
 
  
 
     
    
0 commit comments