[Bugfix] Fix incorrect vocal embedding shards for GGUF model in tenso…

…r parallelism (vllm-project#7954)
aibrix · Sep 19, 2024 · 5e2ccd2 · 5e2ccd2
1 parent 823660c
commit 5e2ccd2
Showing 1 changed file with 4 additions and 1 deletion.
diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -351,7 +351,10 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
             param.weight_type = loaded_weight.item()
             return
         elif isinstance(param, UninitializedParameter):
-            param.materialize(loaded_weight.shape, dtype=loaded_weight.dtype)
+            shape = list(loaded_weight.shape)
+            if output_dim is not None:
+                shape[output_dim] = shape[output_dim] // self.tp_size
+            param.materialize(tuple(shape), dtype=loaded_weight.dtype)
 
         # If parameter does not have output dim, then it should
         # be copied onto all gpus (e.g. g_idx for act_order gptq).