Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docker/Dockerfile.cpu
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt,sharing=locked \
apt-get update -y \
&& apt-get install -y --no-install-recommends sudo ccache git curl wget ca-certificates \
gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof xz-utils \
gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof make xz-utils \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
&& curl -LsSf https://astral.sh/uv/install.sh | sh

Expand Down Expand Up @@ -154,7 +154,7 @@ WORKDIR /vllm-workspace

RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt,sharing=locked \
apt-get install -y --no-install-recommends vim numactl make clangd-14
apt-get install -y --no-install-recommends vim numactl clangd-14

RUN ln -s /usr/bin/clangd-14 /usr/bin/clangd

Expand Down
31 changes: 31 additions & 0 deletions docs/models/extensions/instanttensor.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Loading Model Weights with InstantTensor

InstantTensor accelerates loading Safetensors weights on CUDA devices through distributed loading, pipelined prefetching, and direct I/O. InstantTensor also supports GDS (GPUDirect Storage) when available.
For more details, see the [InstantTensor GitHub repository](https://github.com/scitix/InstantTensor).

## Installation

```bash
pip install instanttensor
```

## Use InstantTensor in vLLM

Add `--load-format instanttensor` as a command-line argument.

For example:

```bash
vllm serve Qwen/Qwen2.5-0.5B --load-format instanttensor
```

## Benchmarks

| Model | GPU | Backend | Load Time (s) | Throughput (GB/s) | Speedup |
| --- | ---: | --- | ---: | ---: | --- |
| Qwen3-30B-A3B | 1*H200 | Safetensors | 57.4 | 1.1 | 1x |
| Qwen3-30B-A3B | 1*H200 | InstantTensor | 1.77 | 35 | <span style="color: green">**32.4x**</span> |
| DeepSeek-R1 | 8*H200 | Safetensors | 160 | 4.3 | 1x |
| DeepSeek-R1 | 8*H200 | InstantTensor | 15.3 | 45 | <span style="color: green">**10.5x**</span> |

For the full benchmark results, see <https://github.com/scitix/InstantTensor/blob/main/docs/benchmark.md>.
1 change: 1 addition & 0 deletions requirements/nightly_torch_test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,5 @@ numba == 0.61.2 # Required for N-gram speculative decoding
numpy
runai-model-streamer[s3,gcs]==0.15.3
fastsafetensors>=0.2.2
instanttensor>=0.1.5
pydantic>=2.12 # 2.11 leads to error on python 3.13
1 change: 1 addition & 0 deletions requirements/test.in
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ numba == 0.61.2 # Required for N-gram speculative decoding
numpy
runai-model-streamer[s3,gcs]==0.15.3
fastsafetensors>=0.2.2 # 0.2.2 contains important fixes for multi-GPU mem usage
instanttensor>=0.1.5
pydantic>=2.12 # 2.11 leads to error on python 3.13
decord==0.6.0
terratorch >= 1.2.2 # Required for Prithvi tests
Expand Down
3 changes: 3 additions & 0 deletions requirements/test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,8 @@ inflect==5.6.2
# via datamodel-code-generator
iniconfig==2.0.0
# via pytest
instanttensor==0.1.5
# via -r requirements/test.in
isoduration==20.11.0
# via jsonschema
isort==5.13.2
Expand Down Expand Up @@ -1169,6 +1171,7 @@ torch==2.10.0+cu129
# accelerate
# bitsandbytes
# encodec
# instanttensor
# kornia
# lightly
# lightning
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -969,6 +969,7 @@ def _read_requirements(filename: str) -> list[str]:
"bench": ["pandas", "matplotlib", "seaborn", "datasets", "scipy", "plotly"],
"tensorizer": ["tensorizer==2.10.1"],
"fastsafetensors": ["fastsafetensors >= 0.2.2"],
"instanttensor": ["instanttensor >= 0.1.5"],
"runai": ["runai-model-streamer[s3,gcs] >= 0.15.3"],
"audio": [
"librosa",
Expand Down
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest

from vllm import SamplingParams
from vllm.platforms import current_platform

# Small HF model used for the loader smoke test.
test_model = "openai-community/gpt2"

# Prompts are only used to confirm that generation produces output.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object (seeded for deterministic sampling).
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, seed=0)


@pytest.mark.skipif(
    not current_platform.is_cuda(),
    reason="InstantTensor requires NVIDIA GPUs",
)
def test_model_loader_download_files(vllm_runner):
    """Smoke-test end-to-end generation with the 'instanttensor' load format."""
    with vllm_runner(test_model, load_format="instanttensor") as llm:
        generated = llm.generate(prompts, sampling_params)
        assert generated
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import glob
import tempfile

import huggingface_hub.constants
import pytest
import torch

from vllm.model_executor.model_loader.weight_utils import (
download_weights_from_hf,
instanttensor_weights_iterator,
safetensors_weights_iterator,
)
from vllm.platforms import current_platform


@pytest.mark.skipif(
    not current_platform.is_cuda(),
    reason="InstantTensor requires NVIDIA GPUs",
)
def test_instanttensor_model_loader():
    """Check that the InstantTensor iterator yields exactly the same tensors
    as the reference safetensors iterator for a small HF model."""
    with tempfile.TemporaryDirectory() as tmpdir:
        huggingface_hub.constants.HF_HUB_OFFLINE = False
        download_weights_from_hf(
            "openai-community/gpt2",
            allow_patterns=["*.safetensors"],
            cache_dir=tmpdir,
        )
        weight_files = glob.glob(f"{tmpdir}/**/*.safetensors", recursive=True)
        assert len(weight_files) > 0

        # Copy each tensor to CPU immediately: the yielded tensor is a view
        # into InstantTensor's internal buffer and may be reused.
        loaded_via_instanttensor = {
            name: tensor.to("cpu")
            for name, tensor in instanttensor_weights_iterator(weight_files, True)
        }
        loaded_via_safetensors = dict(
            safetensors_weights_iterator(weight_files, True)
        )

        assert len(loaded_via_instanttensor) == len(loaded_via_safetensors)

        for name, candidate in loaded_via_instanttensor.items():
            reference = loaded_via_safetensors[name]
            assert candidate.dtype == reference.dtype
            assert candidate.shape == reference.shape
            assert torch.all(candidate.eq(reference))


# Allow running this file directly (outside pytest) for a quick manual check.
if __name__ == "__main__":
    test_instanttensor_model_loader()
5 changes: 4 additions & 1 deletion vllm/config/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ class LoadConfig:
back to the pytorch bin format if safetensors format is not available.\n
- "pt" will load the weights in the pytorch bin format.\n
- "safetensors" will load the weights in the safetensors format.\n
- "instanttensor" will load the Safetensors weights on CUDA devices using
InstantTensor, which enables distributed loading with pipelined prefetching
and fast direct I/O.\n
- "npcache" will load the weights in pytorch format and store a numpy cache
to speed up the loading.\n
- "dummy" will initialize the weights with random values, which is mainly
Expand All @@ -46,7 +49,7 @@ class LoadConfig:
- "gguf" will load weights from GGUF format files (details specified in
https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).\n
- "mistral" will load weights from consolidated safetensors files used by
Mistral models.
Mistral models.\n
- Other custom values can be supported via plugins."""
download_dir: str | None = None
"""Directory to download and load the weights, default to the default
Expand Down
2 changes: 2 additions & 0 deletions vllm/model_executor/model_loader/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
"dummy",
"fastsafetensors",
"gguf",
"instanttensor",
"mistral",
"npcache",
"pt",
Expand All @@ -51,6 +52,7 @@
"dummy": DummyModelLoader,
"fastsafetensors": DefaultModelLoader,
"gguf": GGUFModelLoader,
"instanttensor": DefaultModelLoader,
"mistral": DefaultModelLoader,
"npcache": DefaultModelLoader,
"pt": DefaultModelLoader,
Expand Down
12 changes: 11 additions & 1 deletion vllm/model_executor/model_loader/default_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
filter_duplicate_safetensors_files,
filter_files_not_needed_for_inference,
get_quant_config,
instanttensor_weights_iterator,
maybe_download_from_modelscope,
multi_thread_pt_weights_iterator,
multi_thread_safetensors_weights_iterator,
Expand Down Expand Up @@ -117,7 +118,11 @@ def _prepare_weights(
# Some quantized models use .pt files for storing the weights.
if load_format == "hf":
allow_patterns = ["*.safetensors", "*.bin"]
elif load_format == "safetensors" or load_format == "fastsafetensors":
elif (
load_format == "safetensors"
or load_format == "fastsafetensors"
or load_format == "instanttensor"
):
use_safetensors = True
allow_patterns = ["*.safetensors"]
elif load_format == "mistral":
Expand Down Expand Up @@ -209,6 +214,11 @@ def _get_weights_iterator(
hf_weights_files,
self.load_config.use_tqdm_on_load,
)
elif self.load_config.load_format == "instanttensor":
weights_iterator = instanttensor_weights_iterator(
hf_weights_files,
self.load_config.use_tqdm_on_load,
)
else:
if extra_config.get("enable_multithread_load"):
weights_iterator = multi_thread_safetensors_weights_iterator(
Expand Down
42 changes: 41 additions & 1 deletion vllm/model_executor/model_loader/weight_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from vllm import envs
from vllm.config import ModelConfig
from vllm.config.load import LoadConfig
from vllm.distributed import get_tensor_model_parallel_rank
from vllm.distributed import get_tensor_model_parallel_rank, get_world_group
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization import (
QuantizationConfig,
Expand Down Expand Up @@ -897,6 +897,46 @@ def fastsafetensors_weights_iterator(
loader.close()


def instanttensor_weights_iterator(
    hf_weights_files: list[str],
    use_tqdm_on_load: bool,
) -> Generator[tuple[str, torch.Tensor], None, None]:
    """Yield (name, tensor) pairs from safetensors files via InstantTensor.

    Requires a CUDA device; tensors are loaded directly onto the current
    device. When a multi-rank world group is initialized, loading is
    distributed across its device group.
    """
    try:
        import instanttensor
    except ImportError as e:
        raise ImportError(
            "Please install instanttensor via `pip install instanttensor`"
        ) from e

    if not current_platform.is_cuda():
        raise ValueError("InstantTensor requires NVIDIA GPUs")

    process_group = None
    try:
        group = get_world_group()
    except AssertionError:
        # The world group is uninitialized only in unit tests; fall back to
        # single-process loading.
        pass
    else:
        if group.world_size > 1:
            process_group = group.device_group

    target_device = current_platform.current_device()

    with instanttensor.safe_open(
        hf_weights_files,
        framework="pt",
        device=target_device,
        process_group=process_group,
    ) as reader:
        yield from tqdm(
            reader.tensors(),
            desc="Loading safetensors using InstantTensor loader",
            disable=not enable_tqdm(use_tqdm_on_load),
            bar_format=_BAR_FORMAT,
            position=tqdm._get_free_pos(),
            total=len(reader.keys()),
            mininterval=1.0,
        )


def pt_weights_iterator(
hf_weights_files: list[str],
use_tqdm_on_load: bool,
Expand Down
Loading