Skip to content
This repository was archived by the owner on May 11, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions awq/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,4 @@
from .internlm2 import InternLM2AWQForCausalLM
from .minicpm3 import MiniCPM3AWQForCausalLM
from .qwen2vl import Qwen2VLAWQForCausalLM
from .qwen2_5_vl import Qwen2_5_VLAWQForCausalLM
1 change: 1 addition & 0 deletions awq/models/auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
"internlm2": InternLM2AWQForCausalLM,
"minicpm3": MiniCPM3AWQForCausalLM,
"qwen2_vl": Qwen2VLAWQForCausalLM,
"qwen2_5_vl": Qwen2_5_VLAWQForCausalLM,
}


Expand Down
3 changes: 2 additions & 1 deletion awq/models/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,10 @@
"deepseek_v2": "AutoModelForCausalLM",
"deepseek_v3": "AutoModelForCausalLM",
"minicpm": "AutoModelForCausalLM",
"minicpm3":"AutoModelForCausalLM",
"minicpm3": "AutoModelForCausalLM",
"internlm2": "AutoModelForCausalLM",
"qwen2_vl": "AutoModelForVision2Seq",
"qwen2_5_vl": "AutoModelForVision2Seq",
}


Expand Down
81 changes: 81 additions & 0 deletions awq/models/qwen2_5_vl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
from .base import BaseAWQForCausalLM
from typing_extensions import TYPE_CHECKING

if TYPE_CHECKING:
from transformers import Qwen2_5_VLForConditionalGeneration
from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
Qwen2_5_VLDecoderLayer,
)


class Qwen2_5_VLAWQForCausalLM(BaseAWQForCausalLM):
    """AWQ adapter for Qwen2.5-VL: quantizes the text decoder stack while
    leaving the vision tower untouched."""

    # Class name of one transformer decoder layer in the text backbone.
    layer_type = "Qwen2_5_VLDecoderLayer"
    # Config key that holds the model's maximum sequence length.
    max_seq_len_key = "max_position_embeddings"
    # Submodules excluded from quantization (the vision encoder).
    modules_to_not_convert = ["visual"]

    @staticmethod
    def get_model_layers(model: "Qwen2_5_VLForConditionalGeneration"):
        """Return the decoder layers that AWQ will quantize."""
        return model.model.layers

    @staticmethod
    def get_act_for_scaling(module: "Qwen2_5_VLForConditionalGeneration"):
        """No standalone activation module is scaled for this architecture."""
        return dict(is_scalable=False)

    @staticmethod
    def move_embed(model: "Qwen2_5_VLForConditionalGeneration", device: str):
        """Move token embeddings, the vision tower, and the rotary-embedding
        module to *device* ahead of calibration."""
        model.model.embed_tokens = model.model.embed_tokens.to(device)
        model.visual = model.visual.to(device)
        model.model.rotary_emb = model.model.rotary_emb.to(device)

    @staticmethod
    def get_layers_for_scaling(
        module: "Qwen2_5_VLDecoderLayer", input_feat, module_kwargs
    ):
        """Describe the prev_op -> linear-layer groups that AWQ scales jointly
        for one decoder layer, in forward-pass order."""
        attn = module.self_attn
        mlp = module.mlp

        # Q/K/V projections all consume the pre-attention layernorm output.
        groups = [
            dict(
                prev_op=module.input_layernorm,
                layers=[attn.q_proj, attn.k_proj, attn.v_proj],
                inp=input_feat["self_attn.q_proj"],
                module2inspect=attn,
                kwargs=module_kwargs,
            )
        ]

        # The output projection can only be scaled against v_proj when their
        # weight shapes match (they differ under grouped-query attention);
        # see https://github.com/mit-han-lab/llm-awq/pull/67#issue-1850622696
        if attn.v_proj.weight.shape == attn.o_proj.weight.shape:
            groups.append(
                dict(
                    prev_op=attn.v_proj,
                    layers=[attn.o_proj],
                    inp=input_feat["self_attn.o_proj"],
                )
            )

        # Gate and up projections share the post-attention layernorm output.
        groups.append(
            dict(
                prev_op=module.post_attention_layernorm,
                layers=[mlp.gate_proj, mlp.up_proj],
                inp=input_feat["mlp.gate_proj"],
                module2inspect=mlp,
            )
        )

        # The down projection is scaled against up_proj's output.
        groups.append(
            dict(
                prev_op=mlp.up_proj,
                layers=[mlp.down_proj],
                inp=input_feat["mlp.down_proj"],
            )
        )

        return groups
10 changes: 10 additions & 0 deletions awq/quantize/quantizer.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import transformers
import torch
import inspect
import logging
Expand Down Expand Up @@ -153,6 +154,15 @@ def quantize(self):
# https://github.com/huggingface/transformers/pull/32617
self.awq_model.move_embed(self.model, common_device)

# Transformers >= 4.48.0 requires positional embeddings should be computed before forward pass
if (
transformers.__version__ >= "4.48.0"
and self.module_kwargs.get("position_embeddings") is None
):
self.module_kwargs["position_embeddings"] = self.model.model.rotary_emb(
self.inps, self.module_kwargs["position_ids"]
)

for k, v in self.module_kwargs.items():
# position embeddings found in tuple
if isinstance(v, tuple):
Expand Down
Loading