Skip to content
Permalink

Comparing changes

This is a direct comparison between two commits made in this repository or its related repositories. View the default comparison for this range or learn more about diff comparisons.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks. Learn more about diff comparisons here.
base repository: aibrix/vllm
Failed to load repositories. Confirm that selected base ref is valid, then try again.
Loading
base: fdd7dc652843f65fa8c3a0b6877367f8a66f54d5
Choose a base ref
..
head repository: aibrix/vllm
Failed to load repositories. Confirm that selected head ref is valid, then try again.
Loading
compare: 622042f6b5bc6779153588dd88458fe6a9913461
Choose a head ref
Showing with 12 additions and 5 deletions.
  1. +8 −2 vllm/attention/backends/xformers.py
  2. +4 −3 vllm/engine/llm_engine.py
10 changes: 8 additions & 2 deletions vllm/attention/backends/xformers.py
Original file line number Diff line number Diff line change
@@ -612,7 +612,11 @@ def forward(
PagedAttention.write_to_paged_cache(key, value, key_cache,
value_cache,
attn_metadata.slot_mapping,
self.kv_cache_dtype, kv_scale)
self.kv_cache_dtype,
#kv_scale
k_scale,
v_scale
)
# torch.set_printoptions(precision=2, sci_mode=False)
# print(f"key.shape:{key.shape}, key_cache.shape:{key_cache.shape}")
# print(f"key.shape:{key[:2,:1,].shape}\n{key[:2,:1,]}")
@@ -748,7 +752,9 @@ def forward(
self.num_kv_heads,
self.scale,
self.alibi_slopes,
kv_scale,
#kv_scale,
k_scale,
v_scale
)

# print(f"original, output.shape:{output[:,:1,]}", file=sys.stderr)
7 changes: 4 additions & 3 deletions vllm/engine/llm_engine.py
Original file line number Diff line number Diff line change
@@ -461,9 +461,10 @@ def _initialize_kv_caches(self) -> None:
The workers will determine the number of blocks in both the GPU cache
and the swap CPU cache.
"""
num_gpu_blocks, num_cpu_blocks = (
self.model_executor.determine_num_available_blocks())

#num_gpu_blocks, num_cpu_blocks = (
# self.model_executor.determine_num_available_blocks())
num_gpu_blocks = 1024
num_cpu_blocks=0
if self.cache_config.num_gpu_blocks_override is not None:
num_gpu_blocks_override = self.cache_config.num_gpu_blocks_override
logger.info(