Commit 6787c00

🚧 CONFLICTS: Cherry-pick from features-based-on-v0.9.2 to features-based-on-v0.10.0
⚠️ This commit contains unresolved merge conflicts in files:
- requirements/common.txt
- tests/test_utils.py
- vllm/attention/backends/torch_sdpa.py

Commits being cherry-picked:
- aefc22d Merge pull request #3 from moirai-internal/dev_container
- 50dadb7 pick [Bugfix] Improve JSON extraction in LlamaToolParser
- 4373a86 pick opc-request-id middleware
- 881ad1a cherry pick the fix for non english token in logprobs (#7)

📝 Edit files to remove conflict markers
2 parents 826a7d7 + 881ad1a commit 6787c00
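
A minimal cleanup sketch for a follow-up commit (assuming a standard git workflow; these commands are not part of this commit, and the target paths are taken from the message above):

# locate the leftover markers in the three conflicted files
grep -n '<<<<<<<' requirements/common.txt tests/test_utils.py vllm/attention/backends/torch_sdpa.py
# edit each file to keep the intended side of each conflict, then record the fix
git add requirements/common.txt tests/test_utils.py vllm/attention/backends/torch_sdpa.py
git commit -m "Remove leftover conflict markers from cherry-pick"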

File tree: 22 files changed (+5584, −43 lines)


.buildkite/scripts/hardware_ci/run-cpu-test.sh

Lines changed: 4 additions & 4 deletions
@@ -73,10 +73,10 @@ function cpu_tests() {
 
   # Note: disable it until supports V1
   # Run AWQ test
-  # docker exec cpu-test-"$NUMA_NODE" bash -c "
-  #   set -e
-  #   VLLM_USE_V1=0 pytest -s -v \
-  #   tests/quantization/test_ipex_quant.py"
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    VLLM_USE_V1=0 pytest -s -v \
+    tests/quantization/test_ipex_quant.py"
 
   # online serving
   docker exec cpu-test-"$NUMA_NODE" bash -c '

.buildkite/scripts/hardware_ci/run-xpu-test.sh

Lines changed: 2 additions & 0 deletions
@@ -26,6 +26,8 @@ docker run \
   --name "${container_name}" \
   "${image_name}" \
   sh -c '
+  VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
+  VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
   VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
   VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
   VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp

examples/online_serving/chart-helm/values.yaml

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@ image:
   # -- Image tag
   tag: "latest"
   # -- Container launch command
-  command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--enforce-eager", "--dtype", "bfloat16", "--block-size", "16", "--host", "0.0.0.0", "--port", "8000"]
+  command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--dtype", "float32", "--block-size", "16", "--host", "0.0.0.0", "--port", "8000"]
 
 # -- Container port
 containerPort: 8000

requirements/common.txt

Lines changed: 4 additions & 0 deletions
@@ -7,7 +7,11 @@ requests >= 2.26.0
 tqdm
 blake3
 py-cpuinfo
+<<<<<<< HEAD
 transformers >= 4.53.2
+=======
+transformers >= 4.51.1, <4.54.0
+>>>>>>> origin/features-based-on-v0.9.2
 huggingface-hub[hf_xet] >= 0.33.0 # Required for Xet downloads.
 tokenizers >= 0.21.1 # Required for fast incremental detokenization.
 protobuf # Required by LlamaTokenizer.
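
One plausible resolution of this hunk (an assumption, not part of this commit) is to keep the intersection of the two ranges, which satisfies both branches, and drop the marker lines:

py-cpuinfo
transformers >= 4.53.2, <4.54.0  # assumed resolution: intersection of the two conflicting constraints
huggingface-hub[hf_xet] >= 0.33.0 # Required for Xet downloads.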

tests/kernels/attention/test_attention_selector.py

Lines changed: 7 additions & 10 deletions
@@ -36,8 +36,7 @@ def clear_cache():
 DEVICE_MLA_BLOCK_SIZES = {
     "cuda": [16, 64], # CUDA supports both standard and extended block sizes
     "hip": [16, 1], # HIP requires special handling for block_size=1
-    # "cpu": [16] # CPU uses fixed block size from test cases
-    "cpu": [] # FIXME(woosuk): Temporarily disable CPU tests
+    "cpu": [16] # CPU uses fixed block size from test cases
 }
 
 
@@ -82,14 +81,14 @@ def test_env(
         m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0")
 
         if device == "cpu":
-            if not use_v1:
-                pytest.skip("CPU backend only supports V1")
-
             with patch("vllm.attention.selector.current_platform",
                        CpuPlatform()):
                 backend = get_attn_backend(16, torch.float16, torch.float16,
                                            block_size, False)
-            assert backend.get_name() == "TORCH_SDPA_VLLM_V1"
+            if use_v1:
+                assert backend.get_name() == "TORCH_SDPA_VLLM_V1"
+            else:
+                assert backend.get_name() == "TORCH_SDPA"
 
         elif device == "hip":
             with patch("vllm.attention.selector.current_platform",
@@ -205,14 +204,12 @@ def test_fp32_fallback(
         m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
 
         if device == "cpu":
-            if not use_v1:
-                pytest.skip("CPU backend only supports V1")
-
             with patch("vllm.attention.selector.current_platform",
                        CpuPlatform()):
                 backend = get_attn_backend(16, torch.float32, torch.float32,
                                            16, False)
-            assert backend.get_name() == "TORCH_SDPA_VLLM_V1"
+            assert (backend.get_name() == "TORCH_SDPA_VLLM_V1"
+                    if use_v1 else "TORCH_SDPA")
 
         elif device == "cuda":
             with patch("vllm.attention.selector.current_platform",

tests/test_utils.py

Lines changed: 3 additions & 0 deletions
@@ -957,6 +957,7 @@ def test_convert_ids_list_to_tokens():
     ]
     tokens = convert_ids_list_to_tokens(tokenizer, token_ids)
     assert tokens == ['Hello', ',', ' world', '!']
+<<<<<<< HEAD
 
 
 def test_current_stream_multithread():
@@ -995,3 +996,5 @@ def child_thread_func():
     child_thread.join(timeout=5)
     if child_thread.is_alive():
         pytest.fail("Child thread failed to exit properly")
+=======
+>>>>>>> origin/features-based-on-v0.9.2
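
The origin/features-based-on-v0.9.2 side of this conflict adds nothing between ======= and >>>>>>>, so a likely resolution (again an assumption; this commit leaves the markers in place) is simply to delete the three marker lines and keep the HEAD content, leaving the end of the file as:

    child_thread.join(timeout=5)
    if child_thread.is_alive():
        pytest.fail("Child thread failed to exit properly")  # unchanged; only the conflict markers are deleted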
