Merge branch 'main' into test-image-hf-transfer
mgoin authored Nov 7, 2024
2 parents 2f0ed5e + ae62fd1 · commit 67bfc08
Showing 141 changed files with 1,659 additions and 707 deletions.
2 changes: 1 addition & 1 deletion .buildkite/run-cpu-test.sh

@@ -46,7 +46,7 @@
 docker exec cpu-test bash -c "
   export VLLM_CPU_KVCACHE_SPACE=10
   export VLLM_CPU_OMP_THREADS_BIND=48-92
-  python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
+  python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
   timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
   python3 benchmarks/benchmark_serving.py \
     --backend vllm \

2 changes: 1 addition & 1 deletion .buildkite/test-pipeline.yaml

@@ -120,6 +120,7 @@ steps:
   - tests/spec_decode/e2e/test_integration_dist_tp4
   - tests/compile
   commands:
+  - pytest -v -s distributed/test_utils.py
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
@@ -431,7 +432,6 @@ steps:
   - pip install -e ./plugins/vllm_add_dummy_model
   - pytest -v -s distributed/test_distributed_oot.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py

 - label: Multi-step Tests (4 GPUs) # 36min
   working_dir: "/vllm-workspace/tests"

4 changes: 3 additions & 1 deletion .github/mergify.yml

@@ -46,7 +46,9 @@ pull_request_rules:
       comment:
         message: |
           This pull request has merge conflicts that must be resolved before it can be
-          merged. @{{author}} please rebase it. https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork
+          merged. Please rebase the PR, @{{author}}.
+
+          https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork
   - name: remove 'needs-rebase' label when conflict is resolved
     conditions:

45 changes: 45 additions & 0 deletions .github/workflows/codespell.yml

@@ -0,0 +1,45 @@
+name: codespell
+
+on:
+  # Trigger the workflow on push or pull request,
+  # but only for the main branch
+  push:
+    branches:
+      - main
+    paths:
+      - "**/*.py"
+      - "**/*.md"
+      - "**/*.rst"
+      - pyproject.toml
+      - requirements-lint.txt
+      - .github/workflows/codespell.yml
+  pull_request:
+    branches:
+      - main
+    paths:
+      - "**/*.py"
+      - "**/*.md"
+      - "**/*.rst"
+      - pyproject.toml
+      - requirements-lint.txt
+      - .github/workflows/codespell.yml
+
+jobs:
+  codespell:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.12"]
+    steps:
+      - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements-lint.txt
+      - name: Spelling check with codespell
+        run: |
+          codespell --toml pyproject.toml

2 changes: 1 addition & 1 deletion .github/workflows/mypy.yaml

@@ -43,4 +43,4 @@ jobs:
       - name: Mypy
         run: |
           echo "::add-matcher::.github/workflows/matchers/mypy.json"
-          tools/mypy.sh 1
+          tools/mypy.sh 1 ${{ matrix.python-version }}

17 changes: 11 additions & 6 deletions .github/workflows/ruff.yml

@@ -15,12 +15,17 @@ on:
   pull_request:
     branches:
       - main
-    paths:
-      - "**/*.py"
-      - pyproject.toml
-      - requirements-lint.txt
-      - .github/workflows/matchers/ruff.json
-      - .github/workflows/ruff.yml
+    # This workflow is only relevant when one of the following files changes.
+    # However, we have github configured to expect and require this workflow
+    # to run and pass before github will auto-merge a pull request. Until github
+    # allows more flexible auto-merge policy, we can just run this on every PR.
+    # It doesn't take that long to run, anyway.
+    #paths:
+    #  - "**/*.py"
+    #  - pyproject.toml
+    #  - requirements-lint.txt
+    #  - .github/workflows/matchers/ruff.json
+    #  - .github/workflows/ruff.yml

 jobs:
   ruff:

3 changes: 3 additions & 0 deletions Dockerfile

@@ -196,6 +196,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install hf_transfer
 ENV HF_HUB_ENABLE_HF_TRANSFER 1

+# Copy in the v1 package for testing (it isn't distributed yet)
+COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1
+
 # doc requires source code
 # we hide them inside `test_docs/` , so that this source code
 # will not be imported by other tests

2 changes: 1 addition & 1 deletion Dockerfile.cpu

@@ -22,7 +22,7 @@ ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/li

 RUN echo 'ulimit -c 0' >> ~/.bashrc

-RUN pip install intel_extension_for_pytorch==2.4.0
+RUN pip install intel_extension_for_pytorch==2.5.0

 WORKDIR /workspace

12 changes: 11 additions & 1 deletion Dockerfile.xpu

@@ -30,9 +30,19 @@ COPY requirements-common.txt /workspace/vllm/requirements-common.txt

 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install --no-cache-dir \
-    --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \
     -r requirements-xpu.txt

+RUN git clone https://github.com/intel/pti-gpu && \
+    cd pti-gpu/sdk && \
+    git checkout 6c491f07a777ed872c2654ca9942f1d0dde0a082 && \
+    mkdir build && \
+    cd build && \
+    cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/icpx_toolchain.cmake -DBUILD_TESTING=OFF .. && \
+    make -j && \
+    cmake --install . --config Release --prefix "/usr/local"
+
+ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"
+
 COPY . .
 ARG GIT_REPO_CHECK
 RUN --mount=type=bind,source=.git,target=.git \

57 changes: 52 additions & 5 deletions benchmarks/benchmark_serving.py

@@ -297,17 +297,43 @@ def sample_random_requests(
 async def get_request(
     input_requests: List[Tuple[str, int, int]],
     request_rate: float,
+    burstiness: float = 1.0,
 ) -> AsyncGenerator[Tuple[str, int, int], None]:
+    """
+    Asynchronously generates requests at a specified rate
+    with OPTIONAL burstiness.
+
+    Args:
+        input_requests:
+            A list of input requests, each represented as a tuple.
+        request_rate:
+            The rate at which requests are generated (requests/s).
+        burstiness (optional):
+            The burstiness factor of the request generation.
+            Only takes effect when request_rate is not inf.
+            Default value is 1, which follows a Poisson process.
+            Otherwise, the request intervals follow a gamma distribution.
+            A lower burstiness value (0 < burstiness < 1) results
+            in more bursty requests, while a higher burstiness value
+            (burstiness > 1) results in a more uniform arrival of requests.
+    """
     input_requests = iter(input_requests)

+    # Calculate scale parameter theta to maintain the desired request_rate.
+    assert burstiness > 0, (
+        f"A positive burstiness factor is expected, but given {burstiness}.")
+    theta = 1.0 / (request_rate * burstiness)
+
     for request in input_requests:
         yield request

         if request_rate == float("inf"):
             # If the request rate is infinity, then we don't need to wait.
             continue

-        # Sample the request interval from the exponential distribution.
-        interval = np.random.exponential(1.0 / request_rate)
+        # Sample the request interval from the gamma distribution.
+        # If burstiness is 1, it follows exponential distribution.
+        interval = np.random.gamma(shape=burstiness, scale=theta)
         # The next request will be sent after the interval.
         await asyncio.sleep(interval)
@@ -426,6 +452,7 @@ async def benchmark(
     logprobs: Optional[int],
     best_of: int,
     request_rate: float,
+    burstiness: float,
     disable_tqdm: bool,
     profile: bool,
     selected_percentile_metrics: List[str],
@@ -480,7 +507,13 @@
     if profile_output.success:
         print("Profiler started")

+    if burstiness == 1.0:
+        distribution = "Poisson process"
+    else:
+        distribution = "Gamma distribution"
+
     print(f"Traffic request rate: {request_rate}")
+    print(f"Burstiness factor: {burstiness} ({distribution})")
     print(f"Maximum request concurrency: {max_concurrency}")

     pbar = None if disable_tqdm else tqdm(total=len(input_requests))
@@ -502,7 +535,7 @@ async def limited_request_func(request_func_input, pbar):

     benchmark_start_time = time.perf_counter()
     tasks: List[asyncio.Task] = []
-    async for request in get_request(input_requests, request_rate):
+    async for request in get_request(input_requests, request_rate, burstiness):
         prompt, prompt_len, output_len, mm_content = request
         request_func_input = RequestFuncInput(model=model_id,
                                               prompt=prompt,
@@ -769,6 +802,7 @@ def main(args: argparse.Namespace):
         logprobs=args.logprobs,
         best_of=args.best_of,
         request_rate=args.request_rate,
+        burstiness=args.burstiness,
         disable_tqdm=args.disable_tqdm,
         profile=args.profile,
         selected_percentile_metrics=args.percentile_metrics.split(","),
@@ -807,6 +841,7 @@ def main(args: argparse.Namespace):
         # Traffic
         result_json["request_rate"] = (
             args.request_rate if args.request_rate < float("inf") else "inf")
+        result_json["burstiness"] = args.burstiness
         result_json["max_concurrency"] = args.max_concurrency

         # Merge with benchmark result
@@ -922,8 +957,20 @@ def main(args: argparse.Namespace):
         default=float("inf"),
         help="Number of requests per second. If this is inf, "
         "then all the requests are sent at time 0. "
-        "Otherwise, we use Poisson process to synthesize "
-        "the request arrival times.",
+        "Otherwise, we use Poisson process or gamma distribution "
+        "to synthesize the request arrival times.",
     )
+    parser.add_argument(
+        "--burstiness",
+        type=float,
+        default=1.0,
+        help="Burstiness factor of the request generation. "
+        "Only takes effect when request_rate is not inf. "
+        "Default value is 1, which follows Poisson process. "
+        "Otherwise, the request intervals follow a gamma distribution. "
+        "A lower burstiness value (0 < burstiness < 1) results in more "
+        "bursty requests. A higher burstiness value (burstiness > 1) "
+        "results in a more uniform arrival of requests.",
+    )
     parser.add_argument("--seed", type=int, default=0)
     parser.add_argument(
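
As a usage illustration (a hypothetical invocation; every flag except the new --burstiness is assumed from the benchmark's existing options): python3 benchmarks/benchmark_serving.py --backend vllm --model facebook/opt-125m --request-rate 10 --burstiness 0.5 would send burstier-than-Poisson traffic averaging 10 requests/s.
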
3 changes: 2 additions & 1 deletion cmake/cpu_extension.cmake

@@ -18,6 +18,7 @@ include_directories("${CMAKE_SOURCE_DIR}/csrc")
 #
 list(APPEND CXX_COMPILE_FLAGS
     "-fopenmp"
+    "-mf16c"
     "-DVLLM_CPU_EXTENSION")

 execute_process(COMMAND cat /proc/cpuinfo
@@ -92,7 +93,7 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
   FetchContent_Declare(
     oneDNN
     GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
-    GIT_TAG v3.5.3
+    GIT_TAG v3.6
     GIT_PROGRESS TRUE
     GIT_SHALLOW TRUE
   )

10 changes: 10 additions & 0 deletions csrc/cpu/attention.cpp

@@ -22,6 +22,16 @@ struct KernelVecType<float> {
   using v_load_vec_type = vec_op::FP32Vec16;
 };

+template <>
+struct KernelVecType<c10::Half> {
+  using q_load_vec_type = vec_op::FP16Vec8;
+  using q_vec_type = vec_op::FP32Vec16;
+  using k_load_vec_type = vec_op::FP16Vec16;
+  using k_vec_type = vec_op::FP32Vec16;
+  using qk_acc_vec_type = vec_op::FP32Vec16;
+  using v_load_vec_type = vec_op::FP16Vec16;
+};
+
 #ifdef __AVX512BF16__
 template <>
 struct KernelVecType<c10::BFloat16> {