Skip to content
Permalink

Comparing changes

This is a direct comparison between two commits made in this repository or its related repositories. View the default comparison for this range or learn more about diff comparisons.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks. Learn more about diff comparisons here.
base repository: aibrix/vllm
Failed to load repositories. Confirm that selected base ref is valid, then try again.
Loading
base: fdd7dc652843f65fa8c3a0b6877367f8a66f54d5
Choose a base ref
..
head repository: aibrix/vllm
Failed to load repositories. Confirm that selected head ref is valid, then try again.
Loading
compare: 622042f6b5bc6779153588dd88458fe6a9913461
Choose a head ref
Showing with 12 additions and 5 deletions.
  1. +8 −2 vllm/attention/backends/xformers.py
  2. +4 −3 vllm/engine/llm_engine.py
10 changes: 8 additions & 2 deletions vllm/attention/backends/xformers.py
Original file line number Diff line number Diff line change
@@ -612,7 +612,11 @@ def forward(
PagedAttention.write_to_paged_cache(key, value, key_cache,
value_cache,
attn_metadata.slot_mapping,
self.kv_cache_dtype, kv_scale)
self.kv_cache_dtype,
#kv_scale
k_scale,
v_scale
)
# torch.set_printoptions(precision=2, sci_mode=False)
# print(f"key.shape:{key.shape}, key_cache.shape:{key_cache.shape}")
# print(f"key.shape:{key[:2,:1,].shape}\n{key[:2,:1,]}")
@@ -748,7 +752,9 @@ def forward(
self.num_kv_heads,
self.scale,
self.alibi_slopes,
kv_scale,
#kv_scale,
k_scale,
v_scale
)

# print(f"original, output.shape:{output[:,:1,]}", file=sys.stderr)
7 changes: 4 additions & 3 deletions vllm/engine/llm_engine.py
Original file line number Diff line number Diff line change
@@ -461,9 +461,10 @@ def _initialize_kv_caches(self) -> None:
The workers will determine the number of blocks in both the GPU cache
and the swap CPU cache.
"""
num_gpu_blocks, num_cpu_blocks = (
self.model_executor.determine_num_available_blocks())

#num_gpu_blocks, num_cpu_blocks = (
# self.model_executor.determine_num_available_blocks())
num_gpu_blocks = 1024
num_cpu_blocks=0
if self.cache_config.num_gpu_blocks_override is not None:
num_gpu_blocks_override = self.cache_config.num_gpu_blocks_override
logger.info(