huggingface · Cyrilvallez · Dec 10, 2024 · Dec 10, 2024 · Dec 10, 2024 · Dec 10, 2024
diff --git a/server/text_generation_server/layers/gptq/quantize.py b/server/text_generation_server/layers/gptq/quantize.py
@@ -956,15 +956,24 @@ def _unload():
 
     pack(model, quantizers, bits, groupsize)
     from safetensors.torch import save_file
-    from transformers.modeling_utils import shard_checkpoint
+    from huggingface_hub import split_torch_state_dict_into_shards
 
     state_dict = model.state_dict()
     state_dict = {k: v.cpu().contiguous() for k, v in state_dict.items()}
 
     max_shard_size = "10GB"
-    shards, index = shard_checkpoint(
-        state_dict, max_shard_size=max_shard_size, weights_name="model.safetensors"
+    state_dict_split = split_torch_state_dict_into_shards(
+        state_dict,
+        filename_pattern="model.safetensors",
+        max_shard_size=max_shard_size,
     )
+    index = None
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+    shards = state_dict_split.filename_to_tensors
     os.makedirs(output_dir, exist_ok=True)
     for shard_file, shard in shards.items():
         save_file(

diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py
@@ -16,10 +16,14 @@
 from huggingface_hub import hf_hub_download, HfApi
 from typing import Optional, List, Dict
 from pathlib import Path
+import transformers
 
 from text_generation_server.utils.speculate import get_speculate, set_speculate
 from text_generation_server.models.model import Model
 from text_generation_server.models.causal_lm import CausalLM, CausalLMBatchKeysLast
+from text_generation_server.models.transformers_flash_causal_lm import (
+    TransformersFlashCausalLM,
+)
 from text_generation_server.models.custom_modeling.opt_modeling import OPTForCausalLM
 from text_generation_server.models.custom_modeling.mpt_modeling import (
     MPTForCausalLM,
@@ -372,6 +376,23 @@ def get_model(
     )
     model_type = config_dict.get("model_type", None)
 
+    transformers_causal_lm_class = CausalLM
+
+    # Fast transformers path
+    transformers_model_class = getattr(
+        transformers, modeling_auto.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[model_type]
+    )
+    if transformers_model_class._supports_flex_attn:
+        transformers_causal_lm_class = TransformersFlashCausalLM
+        if (
+            not FLASH_ATTENTION
+            and lora_adapter_ids is not None
+            and len(lora_adapter_ids) > 0
+        ):
+            raise ValueError(
+                "Transformers backend AutoModel do not support `lora_adapter_ids`."
+            )
+
     quantization_config = config_dict.get("quantization_config", None)
     if quantization_config is None:
         quantization_config = config_dict.get("compression_config", None)
@@ -615,7 +636,7 @@ def get_model(
                 FLASH_ATT_ERROR_MESSAGE.format("Sharded Deepseek V2")
             )
         else:
-            return CausalLM.fallback(
+            return transformers_causal_lm_class.fallback(
                 model_id,
                 revision,
                 quantize=quantize,
@@ -674,7 +695,7 @@ def get_model(
                 FLASH_ATT_ERROR_MESSAGE.format("Sharded Santacoder")
             )
         else:
-            return CausalLM.fallback(
+            return transformers_causal_lm_class.fallback(
                 model_id=model_id,
                 revision=revision,
                 quantize=quantize,
@@ -722,7 +743,7 @@ def get_model(
             except RuntimeError as e:
                 # Lots of legacy models with various weight names.
                 log_master(logger.warning, f"Couldn't load flash gpt2 variant: {e}")
-                return CausalLM.fallback(
+                return transformers_causal_lm_class.fallback(
                     model_id,
                     revision,
                     quantize=quantize,
@@ -733,7 +754,7 @@ def get_model(
         elif sharded:
             raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded GPT-2"))
         else:
-            return CausalLM.fallback(
+            return transformers_causal_lm_class.fallback(
                 model_id,
                 revision,
                 quantize=quantize,
@@ -758,7 +779,7 @@ def get_model(
             except RuntimeError as e:
                 # Lots of legacy models with various weight names.
                 log_master(logger.warning, f"Couldn't load flash gptj variant: {e}")
-                return CausalLM.fallback(
+                return transformers_causal_lm_class.fallback(
                     model_id,
                     revision,
                     quantize=quantize,
@@ -769,7 +790,7 @@ def get_model(
         elif sharded:
             raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded GPT-J"))
         else:
-            return CausalLM.fallback(
+            return transformers_causal_lm_class.fallback(
                 model_id,
                 revision,
                 quantize=quantize,
@@ -806,7 +827,7 @@ def get_model(
                 trust_remote_code=trust_remote_code,
             )
         else:
-            return CausalLM.fallback(
+            return transformers_causal_lm_class.fallback(
                 model_id,
                 revision,
                 quantize=quantize,
@@ -829,7 +850,7 @@ def get_model(
                 lora_adapter_ids=lora_adapter_ids,
             )
         else:
-            return CausalLM.fallback(
+            return transformers_causal_lm_class.fallback(
                 model_id,
                 revision,
                 quantize=quantize,
@@ -853,7 +874,7 @@ def get_model(
                 lora_adapter_ids=lora_adapter_ids,
             )
         else:
-            return CausalLM.fallback(
+            return transformers_causal_lm_class.fallback(
                 model_id,
                 revision,
                 quantize=quantize,
@@ -902,7 +923,7 @@ def get_model(
                 FLASH_ATT_ERROR_MESSAGE.format(f"Sharded {model_type}")
             )
         else:
-            return CausalLM.fallback(
+            return transformers_causal_lm_class.fallback(
                 model_id,
                 revision,
                 quantize=quantize,
@@ -928,7 +949,7 @@ def get_model(
         elif sharded:
             raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Gemma"))
         else:
-            return CausalLM.fallback(
+            return transformers_causal_lm_class.fallback(
                 model_id,
                 revision,
                 quantize=quantize,
@@ -954,7 +975,7 @@ def get_model(
         elif sharded:
             raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Gemma2"))
         else:
-            return CausalLM.fallback(
+            return transformers_causal_lm_class.fallback(
                 model_id,
                 revision,
                 quantize=quantize,
@@ -979,7 +1000,7 @@ def get_model(
         elif sharded:
             raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Cohere"))
         else:
-            return CausalLM.fallback(
+            return transformers_causal_lm_class.fallback(
                 model_id,
                 revision,
                 quantize=quantize,
@@ -1007,7 +1028,7 @@ def get_model(
         elif sharded:
             raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded DBRX"))
         else:
-            return CausalLM.fallback(
+            return transformers_causal_lm_class.fallback(
                 model_id,
                 revision,
                 quantize=quantize,
@@ -1057,7 +1078,7 @@ def get_model(
                     config_class=RWConfig,
                 )
             else:
-                return CausalLM.fallback(
+                return transformers_causal_lm_class.fallback(
                     model_id,
                     revision,
                     quantize=quantize,
@@ -1082,7 +1103,7 @@ def get_model(
         elif sharded:
             raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Mistral"))
         else:
-            return CausalLM.fallback(
+            return transformers_causal_lm_class.fallback(
                 model_id,
                 revision,
                 quantize=quantize,
@@ -1107,7 +1128,7 @@ def get_model(
         elif sharded:
             raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Mixtral"))
         else:
-            return CausalLM.fallback(
+            return transformers_causal_lm_class.fallback(
                 model_id,
                 revision,
                 quantize=quantize,
@@ -1134,7 +1155,7 @@ def get_model(
                 FLASH_ATT_ERROR_MESSAGE.format("Sharded Starcoder2")
             )
         else:
-            return CausalLM.fallback(
+            return transformers_causal_lm_class.fallback(
                 model_id,
                 revision,
                 quantize=quantize,
@@ -1159,7 +1180,7 @@ def get_model(
         elif sharded:
             raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Qwen2"))
         else:
-            return CausalLM.fallback(
+            return transformers_causal_lm_class.fallback(
                 model_id,
                 revision,
                 quantize=quantize,
@@ -1302,7 +1323,7 @@ def get_model(
     elif quantize == "exl2":
         raise NotImplementedError("exl2 quantization is not supported for AutoModel")
     if model_type in modeling_auto.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
-        return CausalLM.fallback(
+        return transformers_causal_lm_class.fallback(
             model_id,
             revision,
             quantize=quantize,
@@ -1323,7 +1344,7 @@ def get_model(
     auto_map = config_dict.get("auto_map", None)
     if trust_remote_code and auto_map is not None:
         if "AutoModelForCausalLM" in auto_map.keys():
-            return CausalLM.fallback(
+            return transformers_causal_lm_class.fallback(
                 model_id,
                 revision,
                 quantize=quantize,