
Commit 891544c

Executable code
1 parent 92c71d4 commit 891544c


4 files changed: +79 additions, -33 deletions

fastdeploy/config.py

Lines changed: 17 additions & 6 deletions
@@ -219,6 +219,23 @@ def __init__(
 
         if not hasattr(self, "head_dim"):
             self.head_dim = self.hidden_size // self.num_attention_heads
+
+        rotary_dim = getattr(self, "rotary_dim", None)
+        head_dim = getattr(self, "head_dim", None)
+
+        if (rotary_dim is not None and
+                head_dim is not None and
+                rotary_dim < head_dim):
+
+            # The calculation and overriding are only performed when partial_rotary_factor is still the default value of 1.0.
+            if getattr(self, "partial_rotary_factor", 1.0) == 1.0:
+                self.partial_rotary_factor = rotary_dim / head_dim
+                logger.info(f"Partial rotation detected via 'rotary_dim'. "
+                            f"Calculated and set 'partial_rotary_factor' to: {self.partial_rotary_factor:.4f}")
+
+        current_partial_factor = getattr(self, "partial_rotary_factor", 1.0)
+        if current_partial_factor < 1.0 and head_dim is not None:
+            self.rotary_dim = int(head_dim * current_partial_factor)
 
         if hasattr(self, "vision_config"):
             self.vision_config = PretrainedConfig.from_dict(self.vision_config)
@@ -227,12 +244,6 @@ def __init__(
             self.think_end_id = args.get("think_end_id", -1)
             self.im_patch_id = args.get("image_patch_id", -1)
             self.line_break_id = args.get("line_break_id", -1)
-
-        if (hasattr(self, "rotary_dim") and
-                hasattr(self, "head_dim") and
-                self.rotary_dim < self.head_dim):
-            self.partial_rotary_factor = self.rotary_dim / self.head_dim
-            logger.info(f"Partial rotation detected. Calculated partial_rotary_factor: {self.partial_rotary_factor}")
 
         self._post_init()
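For reference, the derivation this change adds can be reproduced standalone. The values below are illustrative (64/128 matches the MiniMax-M1 expectation noted in the debug comments further down), not read from any real config:

    # Standalone sketch of the new ModelConfig.__init__ logic, with assumed example values.
    head_dim = 128   # assumed example
    rotary_dim = 64  # assumed example; rotary_dim < head_dim indicates partial rotation

    partial_rotary_factor = 1.0
    if rotary_dim is not None and head_dim is not None and rotary_dim < head_dim:
        if partial_rotary_factor == 1.0:
            partial_rotary_factor = rotary_dim / head_dim   # 64 / 128 = 0.5
    if partial_rotary_factor < 1.0 and head_dim is not None:
        rotary_dim = int(head_dim * partial_rotary_factor)  # stays 64, now consistent with the factor
    print(partial_rotary_factor, rotary_dim)                # 0.5 64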

fastdeploy/demo/offline_demo.py

Lines changed: 2 additions & 2 deletions
@@ -17,11 +17,11 @@
 from fastdeploy.engine.sampling_params import SamplingParams
 from fastdeploy.entrypoints.llm import LLM
 
-model_name_or_path = "./models/llama-7b"
+model_name_or_path = "/home/aistudio/config_folder"
 
 # Hyperparameter settings
 sampling_params = SamplingParams(temperature=0.1, max_tokens=30)
-llm = LLM(model=model_name_or_path, tensor_parallel_size=1)
+llm = LLM(model=model_name_or_path, tensor_parallel_size=4, load_choices="default_v1")
 output = llm.generate(prompts="who are you?", use_tqdm=True, sampling_params=sampling_params)
 
 print(output)
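The demo now points at a local config folder and runs with four-way tensor parallelism. The contents of /home/aistudio/config_folder are not part of this commit; purely as an illustration, a partial-rotary model config in that folder might carry fields like these (hypothetical values):

    # Hypothetical config fields -- not taken from this commit.
    example_config = {
        "num_attention_heads": 64,  # assumed
        "head_dim": 128,            # assumed
        "rotary_dim": 64,           # rotary_dim < head_dim triggers the new branch in config.py
    }
    # ModelConfig.__init__ would then set partial_rotary_factor = 64 / 128 = 0.5
    # and keep rotary_dim = int(128 * 0.5) = 64.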

fastdeploy/model_executor/models/minimax_m1.py

Lines changed: 31 additions & 0 deletions
@@ -477,15 +477,46 @@ def forward(self, forward_meta: ForwardMeta, hidden_states: paddle.Tensor, resid
         # GQA
         if self.attn_type == 1:
             qkv_out = self.qkv_proj(layernorm_output)
+            # print_tensor_stats(qkv_out, f"FD_L{layer_id}:1b_After_QKV_Proj_Combined")
+            logger.info(f"--- [FD DEBUG] PRE-ATTENTION DUMP FOR LAYER {layer_id} ---")
+            print_tensor_stats(hidden_states, f"FD_L{layer_id}:0_HiddenStates_Input")
+            print_tensor_stats(layernorm_output, f"FD_L{layer_id}:1a_After_InputLayernorm")
             print_tensor_stats(qkv_out, f"FD_L{layer_id}:1b_After_QKV_Proj_Combined")
 
+
             q_size_tp = self.self_attn.num_heads * self.self_attn.head_dim
             k_size_tp = self.self_attn.kv_num_heads * self.self_attn.head_dim
 
             q_before_rope, k_before_rope, v_tensor = qkv_out.split([q_size_tp, k_size_tp, k_size_tp], axis=-1)
             print_tensor_stats(q_before_rope, f"FD_L{layer_id}:1c_Q_BeforeRoPE")
             print_tensor_stats(k_before_rope, f"FD_L{layer_id}:1d_K_BeforeRoPE")
             print_tensor_stats(v_tensor, f"FD_L{layer_id}:1e_V_Tensor")
+            logger.info(f"--- [FD DEBUG] ForwardMeta DUMP FOR LAYER {layer_id} ---")
+            # 1. RoPE cache (the most critical part)
+            # We need its shape to confirm it was generated correctly
+            if forward_meta.rotary_embs is not None:
+                logger.info("--- [FD DEBUG] forward_meta.rotary_embs ---")
+                print_tensor_stats(forward_meta.rotary_embs, f"FD_L{layer_id}:meta_rotary_embs")
+                # Expected shape: [2, bsz, max_seq_len, 1, rotary_dim] or [2, bsz, max_seq_len, 1, rotary_dim/2]
+                # For MiniMax-M1 (NEOX style), it should be [2, 1, max_len, 1, 64]
+            else:
+                logger.info("--- [FD DEBUG] forward_meta.rotary_embs is None ---")
+
+            # 2. Sequence length information
+            print_tensor_stats(forward_meta.seq_lens_encoder, f"FD_L{layer_id}:meta_seq_lens_encoder")
+            print_tensor_stats(forward_meta.seq_lens_decoder, f"FD_L{layer_id}:meta_seq_lens_decoder")
+            print_tensor_stats(forward_meta.seq_lens_this_time, f"FD_L{layer_id}:meta_seq_lens_this_time")
+
+            # 3. Padding and index information
+            print_tensor_stats(forward_meta.ids_remove_padding, f"FD_L{layer_id}:meta_ids_remove_padding")
+            print_tensor_stats(forward_meta.batch_id_per_token, f"FD_L{layer_id}:meta_batch_id_per_token")
+            print_tensor_stats(forward_meta.cu_seqlens_q, f"FD_L{layer_id}:meta_cu_seqlens_q")
+            print_tensor_stats(forward_meta.cu_seqlens_k, f"FD_L{layer_id}:meta_cu_seqlens_k")
+
+            # 4. KV cache related information
+            print_tensor_stats(forward_meta.block_tables, f"FD_L{layer_id}:meta_block_tables")
+            logger.info(f"--- [FD DEBUG] END OF DUMP FOR LAYER {layer_id} ---\n")
+            # --- End of debug logging ---
 
 
             attn_output = self.self_attn(qkv=qkv_out, forward_meta=forward_meta)
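print_tensor_stats is called throughout this dump, but its definition is not shown in the commit. A self-contained sketch of what such a helper might look like (an assumption, not FastDeploy code):

    import logging

    import paddle

    logger = logging.getLogger(__name__)

    def print_tensor_stats(t, tag: str) -> None:
        # Hypothetical helper: log shape, dtype and basic statistics of a tensor.
        if t is None:
            logger.info(f"{tag}: None")
            return
        x = paddle.cast(t, "float32")
        logger.info(
            f"{tag}: shape={t.shape}, dtype={t.dtype}, "
            f"mean={float(x.mean()):.6f}, min={float(x.min()):.6f}, max={float(x.max()):.6f}"
        )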

fastdeploy/worker/gpu_model_runner.py

Lines changed: 29 additions & 25 deletions
@@ -83,7 +83,7 @@
 from fastdeploy import envs
 from fastdeploy.engine.pooling_params import PoolingParams
 from fastdeploy.engine.tasks import PoolingTask
-from fastdeploy.input.ernie4_5_vl_processor import DataProcessor
+# from fastdeploy.input.ernie4_5_vl_processor import DataProcessor
 from fastdeploy.inter_communicator import IPCSignal, ZmqIpcClient
 from fastdeploy.model_executor.forward_meta import ForwardMeta
 from fastdeploy.model_executor.layers.pool.metadata import PoolingMetadata
@@ -117,8 +117,8 @@ def __init__(
 
         # VL model config:
         if self.enable_mm:
-            if "ernie" in self.fd_config.model_config.model_type:
-                self._init_image_preprocess()
+            # if "ernie" in self.fd_config.model_config.model_type:
+            #     self._init_image_preprocess()
 
             self.amp_black = [
                 "reduce_sum",
@@ -1119,6 +1119,11 @@ def _init_share_inputs(self, max_num_seqs: int):
 
         # Initialize rotary position embedding
         if not self.enable_mm:
+
+            logger.info(f"Final rotary_dim from config: {self.model_config.rotary_dim}")
+            logger.info(f"Original head_dim from config: {self.model_config.head_dim}")
+            logger.info(f"Calculated partial_rotary_factor from config: {self.model_config.partial_rotary_factor}")
+
             self.share_inputs["rope_emb"] = get_rope(
                 rotary_dim=self.model_config.head_dim,
                 position_ids=paddle.arange(self.model_config.max_model_len).reshape((1, -1)),
@@ -1128,7 +1133,6 @@ def _init_share_inputs(self, max_num_seqs: int):
             )
 
 
-
         # Set block tables
         pre_max_block_num = (
             self.model_config.max_model_len + self.cache_config.block_size - 1
@@ -2423,27 +2427,27 @@ def padding_cudagraph_inputs(self) -> None:
         self.real_token_num = self.forward_meta.ids_remove_padding.shape[0]
         return
 
-    def _init_image_preprocess(self) -> None:
-        processor = DataProcessor(
-            tokenizer_name=self.model_config.model,
-            image_preprocessor_name=str(self.model_config.model),
-        )
-        processor.eval()
-        image_preprocess = processor.image_preprocessor
-        image_preprocess.image_mean_tensor = paddle.to_tensor(image_preprocess.image_mean, dtype="float32").reshape(
-            [1, 3, 1, 1]
-        )
-        image_preprocess.image_std_tensor = paddle.to_tensor(image_preprocess.image_std, dtype="float32").reshape(
-            [1, 3, 1, 1]
-        )
-        image_preprocess.rescale_factor = paddle.to_tensor(image_preprocess.rescale_factor, dtype="float32")
-        image_preprocess.image_mean_tensor = image_preprocess.image_mean_tensor.squeeze([-2, -1]).repeat_interleave(
-            self.model_config.vision_config.patch_size**2 * 1, -1
-        )
-        image_preprocess.image_std_tensor = image_preprocess.image_std_tensor.squeeze([-2, -1]).repeat_interleave(
-            self.model_config.vision_config.patch_size**2 * 1, -1
-        )
-        self.image_preprocess = image_preprocess
+    # def _init_image_preprocess(self) -> None:
+    #     processor = DataProcessor(
+    #         tokenizer_name=self.model_config.model,
+    #         image_preprocessor_name=str(self.model_config.model),
+    #     )
+    #     processor.eval()
+    #     image_preprocess = processor.image_preprocessor
+    #     image_preprocess.image_mean_tensor = paddle.to_tensor(image_preprocess.image_mean, dtype="float32").reshape(
+    #         [1, 3, 1, 1]
+    #     )
+    #     image_preprocess.image_std_tensor = paddle.to_tensor(image_preprocess.image_std, dtype="float32").reshape(
+    #         [1, 3, 1, 1]
+    #     )
+    #     image_preprocess.rescale_factor = paddle.to_tensor(image_preprocess.rescale_factor, dtype="float32")
+    #     image_preprocess.image_mean_tensor = image_preprocess.image_mean_tensor.squeeze([-2, -1]).repeat_interleave(
+    #         self.model_config.vision_config.patch_size**2 * 1, -1
+    #     )
+    #     image_preprocess.image_std_tensor = image_preprocess.image_std_tensor.squeeze([-2, -1]).repeat_interleave(
+    #         self.model_config.vision_config.patch_size**2 * 1, -1
+    #     )
+    #     self.image_preprocess = image_preprocess
 
     def _preprocess_mm_task(self, one: dict) -> None:
         """process batch"""
