From fc60757cd0be5aa39ef31692793b582da0926ea7 Mon Sep 17 00:00:00 2001 From: glenliu21 Date: Tue, 20 Jan 2026 23:22:20 -0500 Subject: [PATCH 1/3] docs and cleanup --- docs/advanced_features/lora.ipynb | 150 ++++++++++++++++++++++-- python/sglang/srt/managers/tp_worker.py | 4 - 2 files changed, 141 insertions(+), 13 deletions(-) diff --git a/docs/advanced_features/lora.ipynb b/docs/advanced_features/lora.ipynb index 082a5c279240..ba73ce8672aa 100644 --- a/docs/advanced_features/lora.ipynb +++ b/docs/advanced_features/lora.ipynb @@ -394,21 +394,21 @@ ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "### OpenAI-compatible API usage\n", - "\n", - "You can use LoRA adapters via the OpenAI-compatible APIs by specifying the adapter in the `model` field using the `base-model:adapter-name` syntax (for example, `qwen/qwen2.5-0.5b-instruct:adapter_a`). For more details and examples, see the “Using LoRA Adapters” section in the OpenAI API documentation: [openai_api_completions.ipynb](../basic_usage/openai_api_completions.ipynb).\n" + "terminate_process(server_process)" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "terminate_process(server_process)" + "### OpenAI-compatible API usage\n", + "\n", + "You can use LoRA adapters via the OpenAI-compatible APIs by specifying the adapter in the `model` field using the `base-model:adapter-name` syntax (for example, `qwen/qwen2.5-0.5b-instruct:adapter_a`). 
For more details and examples, see the “Using LoRA Adapters” section in the OpenAI API documentation: [openai_api_completions.ipynb](../basic_usage/openai_api_completions.ipynb).\n" ] }, { @@ -569,6 +569,132 @@ "terminate_process(server_process)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## LoRA Overlap Loading" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By using the `--enable-lora-overlap-loading` server argument, the SGLang engine is able to overlap the loading of LoRA weights with prefill and decode compute, essentially hiding the data movement for LoRA weights behind GPU computation. Our benchmarks show that under adversarial conditions, enabling this feature can result in a ~35% reduction in median TTFT - (see the [LoRA overlap loading PR](https://github.com/sgl-project/sglang/pull/15512) for detailed benchmarks)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lora0 = \"Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16\"\n", + "lora1 = \"algoprog/fact-generation-llama-3.1-8b-instruct-lora\"\n", + "lora2 = \"philschmid/code-llama-3-1-8b-text-to-sql-lora\"\n", + "\n", + "\n", + "server_process, port = launch_server_cmd(\n", + " \"\"\"\n", + " python3 -m sglang.launch_server \\\n", + " --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n", + " --enable-lora \\\n", + " --enable-lora-overlap-loading \\\n", + " --lora-paths lora0=Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16 \\\n", + " lora1=algoprog/fact-generation-llama-3.1-8b-instruct-lora \\\n", + " lora2=philschmid/code-llama-3-1-8b-text-to-sql-lora \\\n", + " --max-lora-rank 256 \\\n", + " --max-loras-per-batch 2\n", + " \"\"\"\n", + ")\n", + "\n", + "url = f\"http://127.0.0.1:{port}\"\n", + "wait_for_server(url)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "json_data = {\n", + " \"text\": [\n", + " 
\"Write a very long fairy-tale.\",\n", + " \"List 3 countries and their capitals.\",\n", + " \"List 3 countries and their capitals.\",\n", + " ],\n", + " \"sampling_params\": [\n", + " {\"max_new_tokens\": 1024, \"temperature\": 0},\n", + " {\"max_new_tokens\": 64, \"temperature\": 0},\n", + " {\"max_new_tokens\": 64, \"temperature\": 0},\n", + " ],\n", + " \"lora_path\": [\"lora0\", \"lora1\", \"lora2\"],\n", + "}\n", + "\n", + "# lora0 and lora1 will be loaded into the memory pool first, and because max_loras_per_batch = 2, lora2's request will remain in the queue.\n", + "# lora1's request will likely finish first, and once it does, lora2 will be loaded. With --enable-lora-overlap-loading, this loading will\n", + "# occur asynchronously and thus decoding for lora0's request won't be blocked.\n", + "response = requests.post(\n", + " url + \"/generate\",\n", + " json=json_data,\n", + ")\n", + "\n", + "for i in range(3):\n", + " print(f\"Output from lora{i}: \\n{response.json()[i]['text']}\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "terminate_process(server_process)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Limitations of LoRA Overlap Loading" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "However, LoRA overlap loading is not free and comes with two important caveats:\n", + "\n", + "1. **Pinned CPU memory requirement**:\n", + " Asynchronous H2D memory copies require LoRA weights to be pinned in CPU memory, which is a finite system resource. To mitigate excessive pinned-memory usage, SGLang currently restricts `max_loaded_loras` to be at most 2× `max_loras_per_batch` when LoRA overlap loading is enabled.\n", + "\n", + "2. **Reduced multi-adapter prefill batching**:\n", + " With overlap loading, adapters become available on the GPU at different times because each adapter is loaded asynchronously. 
This can reduce the scheduler’s ability to form multi-adapter prefill batches, since only requests whose adapters are currently loaded can be grouped together. As a result, requests for different adapters may be scheduled in separate (or smaller) prefill batches, which can increase TTFT when adapter load time is small compared to prefill compute time. This is why LoRA overlap loading is disabled by default: it should only be enabled when users have determined that LoRA weight loading is a bottleneck (e.g., high adapter churn, heavy adapter weights, or PCIe-bottlenecked workloads).\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Example When Overlap Loading Results in Higher Latency" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For instance, suppose we have four LoRA adapters: `lora0`, `lora1`, `lora2`, and `lora3`. Loading any adapter takes 2ms, while the prefill step for requests for that adapter takes 20ms.\n", + "\n", + "1. **Baseline**:\n", + " The engine loads all four adapters synchronously, then runs one combined prefill batch, giving us a total time of ≈ `2 * 4 + 20 = 28ms`\n", + "\n", + "2. **With LoRA overlap loading enabled**:\n", + " The engine begins loading `lora0` and, once it is ready, schedules a prefill batch containing only `lora0` while `lora1` loads in the background. Then it schedules `lora1`’s prefill while `lora2` loads, and so on. In the worst case where prefill cannot be batched across adapters, total time is ≈ `2 + 4 * 20 = 82ms`\n", + "\n", + "In this scenario, overlap loading reduces adapter-load overhead, but the loss of multi-adapter prefill batching dominates and leads to higher TTFT." 
+ ] + }, { "cell_type": "markdown", "metadata": {}, @@ -580,6 +706,11 @@ } ], "metadata": { + "kernelspec": { + "display_name": "sglang", + "language": "python", + "name": "python3" + }, "language_info": { "codemirror_mode": { "name": "ipython", @@ -589,7 +720,8 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + "pygments_lexer": "ipython3", + "version": "3.10.13" } }, "nbformat": 4, diff --git a/python/sglang/srt/managers/tp_worker.py b/python/sglang/srt/managers/tp_worker.py index af4000729e76..288c20e0e243 100644 --- a/python/sglang/srt/managers/tp_worker.py +++ b/python/sglang/srt/managers/tp_worker.py @@ -195,10 +195,6 @@ def load_lora_adapter_from_tensors( ) return result - def can_run_lora_batch(self, lora_ids: list[str]) -> bool: - lora_ids_set = set(lora_ids) if isinstance(lora_ids, list) else lora_ids - return self.model_runner.lora_manager.validate_lora_batch(lora_ids_set) - def forward_batch_embedding(self, model_worker_batch: ModelWorkerBatch): forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner) logits_output = self.model_runner.forward(forward_batch).logits_output From 828e45ae6d1167cd505a1c25324c62be1a9ca19b Mon Sep 17 00:00:00 2001 From: glenliu21 Date: Wed, 21 Jan 2026 00:07:57 -0500 Subject: [PATCH 2/3] fix docs --- docs/advanced_features/lora.ipynb | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/docs/advanced_features/lora.ipynb b/docs/advanced_features/lora.ipynb index ba73ce8672aa..635063f58f1b 100644 --- a/docs/advanced_features/lora.ipynb +++ b/docs/advanced_features/lora.ipynb @@ -706,11 +706,6 @@ } ], "metadata": { - "kernelspec": { - "display_name": "sglang", - "language": "python", - "name": "python3" - }, "language_info": { "codemirror_mode": { "name": "ipython", @@ -720,8 +715,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" + 
"pygments_lexer": "ipython3" } }, "nbformat": 4, From 2844409f6d15b4ce5f63b541485fe234621fa474 Mon Sep 17 00:00:00 2001 From: glenliu21 Date: Wed, 21 Jan 2026 09:04:09 -0500 Subject: [PATCH 3/3] fix and small cleanups --- docs/advanced_features/lora.ipynb | 3 ++- python/sglang/srt/lora/backend/lora_registry.py | 3 ++- python/sglang/srt/lora/lora_manager.py | 2 +- python/sglang/srt/model_executor/model_runner.py | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/advanced_features/lora.ipynb b/docs/advanced_features/lora.ipynb index 635063f58f1b..a8245f1b280c 100644 --- a/docs/advanced_features/lora.ipynb +++ b/docs/advanced_features/lora.ipynb @@ -604,7 +604,8 @@ " lora1=algoprog/fact-generation-llama-3.1-8b-instruct-lora \\\n", " lora2=philschmid/code-llama-3-1-8b-text-to-sql-lora \\\n", " --max-lora-rank 256 \\\n", - " --max-loras-per-batch 2\n", + " --max-loras-per-batch 2 \\\n", + " --max-loaded-loras 4\n", " \"\"\"\n", ")\n", "\n", diff --git a/python/sglang/srt/lora/backend/lora_registry.py b/python/sglang/srt/lora/backend/lora_registry.py index 160ca8e3ee3d..15b531ab32b7 100644 --- a/python/sglang/srt/lora/backend/lora_registry.py +++ b/python/sglang/srt/lora/backend/lora_registry.py @@ -1,4 +1,5 @@ import logging +from typing import Type from sglang.srt.lora.backend.base_backend import BaseLoRABackend @@ -50,7 +51,7 @@ def create_flashinfer_backend(): ) -def get_backend_from_name(name: str) -> BaseLoRABackend: +def get_backend_from_name(name: str) -> Type[BaseLoRABackend]: """ Get corresponding backend class from backend's name """ diff --git a/python/sglang/srt/lora/lora_manager.py b/python/sglang/srt/lora/lora_manager.py index 6d8769e6971b..b5d38dcd08d0 100644 --- a/python/sglang/srt/lora/lora_manager.py +++ b/python/sglang/srt/lora/lora_manager.py @@ -55,13 +55,13 @@ def __init__( max_loras_per_batch: int, load_config: LoadConfig, dtype: torch.dtype, + server_args: ServerArgs, lora_backend: str = "triton", tp_size: int = 1, tp_rank: 
int = 0, max_lora_rank: Optional[int] = None, target_modules: Optional[Iterable[str]] = None, lora_paths: Optional[List[LoRARef]] = None, - server_args: Optional[ServerArgs] = None, ): self.base_model: torch.nn.Module = base_model self.base_hf_config: AutoConfig = base_hf_config diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 7731c1e4b92d..e9ceb0000cd1 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -1415,13 +1415,13 @@ def init_lora_manager(self): max_loras_per_batch=self.server_args.max_loras_per_batch, load_config=self.load_config, dtype=self.dtype, + server_args=self.server_args, lora_backend=self.server_args.lora_backend, tp_size=self.tp_size, tp_rank=self.tp_rank, max_lora_rank=self.server_args.max_lora_rank, target_modules=self.server_args.lora_target_modules, lora_paths=self.server_args.lora_paths, - server_args=self.server_args, ) def load_lora_adapter(self, lora_ref: LoRARef):