From 47f6a0158c9fca517245d65e745eb2d50e128ff2 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Thu, 15 Feb 2024 00:36:24 +0000
Subject: [PATCH 1/3] Fix punica import

---
 vllm/lora/punica.py | 335 ++++++++++++++++++++++----------------------
 1 file changed, 165 insertions(+), 170 deletions(-)

diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py
index bcb73ccc19b0..e6559f7a104e 100644
--- a/vllm/lora/punica.py
+++ b/vllm/lora/punica.py
@@ -4,173 +4,168 @@
 import torch
 
-import_exc = None
-
-try:
-    import vllm._punica_C as punica_kernels
-except ImportError as e:
-    import_exc = e
-
-if import_exc is None:
-
-    def bgmv(
-        y: torch.Tensor,
-        x: torch.Tensor,
-        w_t_all: torch.Tensor,
-        indicies: torch.LongTensor,
-        layer_idx: int,
-        scale: float,
-    ):
-        """
-        Semantics:
-          y[i] += (
-              x[i].unsqueeze(0)
-              @ w_t_all[indices[i], layer_idx, :, :].transpose(-1, -2)
-              * scale
-            ).squeeze(0)
-
-        Args:
-            y: Shape: `[B, H2]`. Output vectors. Will be changed in-place.
-            x: Shape: `[B, H1]`. Input vectors.
-            w_t_all: Shape: `[None, L, H2, H1]`. All of the transposed weight
-                matrices.
-            indicies: Shape: `[B]`. Indices of the weight matrices.
-            layer_idx: Layer index of the weight matrices.
-            scale: Scaling factor.
-        """
-        punica_kernels.dispatch_bgmv(y, x, w_t_all, indicies, layer_idx, scale)
-
-    def add_lora(y: torch.Tensor,
-                 x: torch.Tensor,
-                 wa_t_all: torch.Tensor,
-                 wb_t_all: torch.Tensor,
-                 indicies: torch.LongTensor,
-                 layer_idx: int,
-                 scale: float,
-                 *,
-                 buffer: Optional[torch.Tensor] = None):
-        """
-        Semantics:
-          y[i] += (
-              x[i].unsqueeze(0)
-              @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2)
-              @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2)
-              * scale
-            ).squeeze(0)
-
-        Args:
-            y: Shape: `[B, H2]`. Output vectors. Will be changed in-place.
-            x: Shape: `[B, H1]`. Input vectors.
-            wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed
-                LoRA A matrices.
-            wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed
-                LoRA B matrices.
-            indicies: Shape: `[B]`. Indices of the LoRA weights.
-            layer_idx: Layer index of LoRA weights.
-            scale: Scaling factor.
-            buffer: Optional. Shape: `[B, R]`. Temporary buffer.
-        """
-        r = wb_t_all.size(-1)
-        if buffer is None:
-            # We set the buffer to be float32 by default to avoid
-            # numerical innacuracies that would otherwise happen
-            # due to downcasting.
-            buffer = torch.zeros((x.size(0), r),
-                                 dtype=torch.float32,
-                                 device=x.device)
-        punica_kernels.dispatch_bgmv(buffer, x, wa_t_all, indicies, layer_idx,
-                                     1.0)
-        punica_kernels.dispatch_bgmv(y, buffer, wb_t_all, indicies, layer_idx,
-                                     scale)
-
-    def add_lora_slice(y: torch.Tensor,
-                       x: torch.Tensor,
-                       wa_t_all: torch.Tensor,
-                       wb_t_all: torch.Tensor,
-                       indicies: torch.LongTensor,
-                       layer_idx: int,
-                       scale: float,
-                       y_offset: int,
-                       y_slice_size: int,
-                       *,
-                       buffer: Optional[torch.Tensor] = None):
-        """
-        Same as `add_lora` but you can operate on slices of y.
-        Pass whole y, define y_offset and y_slice_size.
-
-        Semantics:
-          y[i] += (
-              x[i].unsqueeze(0)
-              @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2)
-              @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2)
-              * scale
-            ).squeeze(0)
-
-        Args:
-            y: Shape: `[B, H2]`. Output vectors. Will be changed in-place.
-            x: Shape: `[B, H1]`. Input vectors.
-            wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed
-                LoRA A matrices.
-            wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed
-                LoRA B matrices.
-            indicies: Shape: `[B]`. Indices of the LoRA weights.
-            layer_idx: Layer index of LoRA weights.
-            scale: Scaling factor.
-            y_offset: Offset to apply to the starting column of y.
-            y_slice_size: Size of the y column slice.
-        """
-        r = wb_t_all.size(-1)
-        if buffer is None:
-            # We set the buffer to be float32 by default to avoid
-            # numerical inaccuracies that would otherwise happen
-            # due to downcasting.
-            buffer = torch.zeros((x.size(0), r),
-                                 dtype=torch.float32,
-                                 device=x.device)
-        punica_kernels.dispatch_bgmv_low_level(
-            buffer,
-            x,
-            wa_t_all,
-            indicies,
-            layer_idx,
-            1.0,
-            x.size(1),
-            buffer.size(1),
-            0,
-        )
-        punica_kernels.dispatch_bgmv_low_level(
-            y,
-            buffer,
-            wb_t_all,
-            indicies,
-            layer_idx,
-            scale,
-            buffer.size(1),
-            y_slice_size,
-            y_offset,
-        )
-
-else:
-
-    def _raise_exc(
-            *args,  # pylint: disable=unused-argument
-            **kwargs  # pylint: disable=unused-argument
-    ):
-        if torch.cuda.get_device_capability() < (8, 0):
-            raise ImportError("punica LoRA kernels require compute "
-                              "capability>=8.0") from import_exc
-        else:
-            raise ImportError(
-                "punica LoRA kernels could not be imported. If you built vLLM "
-                "from source, make sure VLLM_INSTALL_PUNICA_KERNELS=1 env var "
-                "was set.") from import_exc
-
-    bgmv = _raise_exc
-    add_lora = _raise_exc
-    add_lora_slice = _raise_exc
-
-__all__ = [
-    "bgmv",
-    "add_lora",
-    "add_lora_slice",
-]
+
+def _raise_import_error(e):
+    if torch.cuda.get_device_capability() < (8, 0):
+        raise ImportError("punica LoRA kernels require compute "
+                          "capability>=8.0") from e
+    else:
+        raise ImportError(
+            "punica LoRA kernels could not be imported. If you built vLLM "
+            "from source, make sure VLLM_INSTALL_PUNICA_KERNELS=1 env var "
+            "was set.") from e
+
+
+def bgmv(
+    y: torch.Tensor,
+    x: torch.Tensor,
+    w_t_all: torch.Tensor,
+    indicies: torch.LongTensor,
+    layer_idx: int,
+    scale: float,
+):
+    """
+    Semantics:
+      y[i] += (
+          x[i].unsqueeze(0)
+          @ w_t_all[indices[i], layer_idx, :, :].transpose(-1, -2)
+          * scale
+        ).squeeze(0)
+
+    Args:
+        y: Shape: `[B, H2]`. Output vectors. Will be changed in-place.
+        x: Shape: `[B, H1]`. Input vectors.
+        w_t_all: Shape: `[None, L, H2, H1]`. All of the transposed weight
+            matrices.
+        indicies: Shape: `[B]`. Indices of the weight matrices.
+        layer_idx: Layer index of the weight matrices.
+        scale: Scaling factor.
+    """
+    try:
+        import vllm._punica_C as punica_kernels
+    except ImportError as e:
+        _raise_import_error(e)
+
+    punica_kernels.dispatch_bgmv(y, x, w_t_all, indicies, layer_idx, scale)
+
+
+def add_lora(y: torch.Tensor,
+                 x: torch.Tensor,
+                 wa_t_all: torch.Tensor,
+                 wb_t_all: torch.Tensor,
+                 indicies: torch.LongTensor,
+                 layer_idx: int,
+                 scale: float,
+                 *,
+                 buffer: Optional[torch.Tensor] = None):
+    """
+    Semantics:
+      y[i] += (
+          x[i].unsqueeze(0)
+          @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2)
+          @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2)
+          * scale
+        ).squeeze(0)
+
+    Args:
+        y: Shape: `[B, H2]`. Output vectors. Will be changed in-place.
+        x: Shape: `[B, H1]`. Input vectors.
+        wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed
+            LoRA A matrices.
+        wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed
+            LoRA B matrices.
+        indicies: Shape: `[B]`. Indices of the LoRA weights.
+        layer_idx: Layer index of LoRA weights.
+        scale: Scaling factor.
+        buffer: Optional. Shape: `[B, R]`. Temporary buffer.
+    """
+    try:
+        import vllm._punica_C as punica_kernels
+    except ImportError as e:
+        _raise_import_error(e)
+
+    r = wb_t_all.size(-1)
+    if buffer is None:
+        # We set the buffer to be float32 by default to avoid
+        # numerical inaccuracies that would otherwise happen
+        # due to downcasting.
+        buffer = torch.zeros((x.size(0), r),
+                                 dtype=torch.float32,
+                                 device=x.device)
+    punica_kernels.dispatch_bgmv(buffer, x, wa_t_all, indicies, layer_idx,
+                                     1.0)
+    punica_kernels.dispatch_bgmv(y, buffer, wb_t_all, indicies, layer_idx,
+                                     scale)
+
+
+def add_lora_slice(y: torch.Tensor,
+                       x: torch.Tensor,
+                       wa_t_all: torch.Tensor,
+                       wb_t_all: torch.Tensor,
+                       indicies: torch.LongTensor,
+                       layer_idx: int,
+                       scale: float,
+                       y_offset: int,
+                       y_slice_size: int,
+                       *,
+                       buffer: Optional[torch.Tensor] = None):
+    """
+    Same as `add_lora` but you can operate on slices of y.
+    Pass whole y, define y_offset and y_slice_size.
+
+    Semantics:
+      y[i] += (
+          x[i].unsqueeze(0)
+          @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2)
+          @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2)
+          * scale
+        ).squeeze(0)
+
+    Args:
+        y: Shape: `[B, H2]`. Output vectors. Will be changed in-place.
+        x: Shape: `[B, H1]`. Input vectors.
+        wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed
+            LoRA A matrices.
+        wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed
+            LoRA B matrices.
+        indicies: Shape: `[B]`. Indices of the LoRA weights.
+        layer_idx: Layer index of LoRA weights.
+        scale: Scaling factor.
+        y_offset: Offset to apply to the starting column of y.
+        y_slice_size: Size of the y column slice.
+    """
+    try:
+        import vllm._punica_C as punica_kernels
+    except ImportError as e:
+        _raise_import_error(e)
+
+    r = wb_t_all.size(-1)
+    if buffer is None:
+        # We set the buffer to be float32 by default to avoid
+        # numerical inaccuracies that would otherwise happen
+        # due to downcasting.
+        buffer = torch.zeros((x.size(0), r),
+                                 dtype=torch.float32,
+                                 device=x.device)
+    punica_kernels.dispatch_bgmv_low_level(
+        buffer,
+        x,
+        wa_t_all,
+        indicies,
+        layer_idx,
+        1.0,
+        x.size(1),
+        buffer.size(1),
+        0,
+    )
+    punica_kernels.dispatch_bgmv_low_level(
+        y,
+        buffer,
+        wb_t_all,
+        indicies,
+        layer_idx,
+        scale,
+        buffer.size(1),
+        y_slice_size,
+        y_offset,
+    )

From c435bbfe02db7115a44792f131b8be5366fa884f Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Thu, 15 Feb 2024 00:42:03 +0000
Subject: [PATCH 2/3] yapf

---
 vllm/lora/punica.py | 55 ++++++++++++++++++++++-----------------------
 1 file changed, 27 insertions(+), 28 deletions(-)

diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py
index e6559f7a104e..307a33dcf282 100644
--- a/vllm/lora/punica.py
+++ b/vllm/lora/punica.py
@@ -7,8 +7,8 @@
 
 def _raise_import_error(e):
     if torch.cuda.get_device_capability() < (8, 0):
-        raise ImportError("punica LoRA kernels require compute "
-                          "capability>=8.0") from e
+        raise ImportError(
+            "punica LoRA kernels require compute capability >= 8.0") from e
     else:
         raise ImportError(
             "punica LoRA kernels could not be imported. If you built vLLM "
             "from source, make sure VLLM_INSTALL_PUNICA_KERNELS=1 env var "
             "was set.") from e
@@ -50,14 +50,14 @@ def bgmv(
 
 
 def add_lora(y: torch.Tensor,
-                 x: torch.Tensor,
-                 wa_t_all: torch.Tensor,
-                 wb_t_all: torch.Tensor,
-                 indicies: torch.LongTensor,
-                 layer_idx: int,
-                 scale: float,
-                 *,
-                 buffer: Optional[torch.Tensor] = None):
+             x: torch.Tensor,
+             wa_t_all: torch.Tensor,
+             wb_t_all: torch.Tensor,
+             indicies: torch.LongTensor,
+             layer_idx: int,
+             scale: float,
+             *,
+             buffer: Optional[torch.Tensor] = None):
     """
     Semantics:
       y[i] += (
@@ -83,32 +83,31 @@ def add_lora(y: torch.Tensor,
         import vllm._punica_C as punica_kernels
     except ImportError as e:
         _raise_import_error(e)
-    
+
     r = wb_t_all.size(-1)
     if buffer is None:
         # We set the buffer to be float32 by default to avoid
        # numerical inaccuracies that would otherwise happen
         # due to downcasting.
buffer = torch.zeros((x.size(0), r), - dtype=torch.float32, - device=x.device) - punica_kernels.dispatch_bgmv(buffer, x, wa_t_all, indicies, layer_idx, - 1.0) + dtype=torch.float32, + device=x.device) + punica_kernels.dispatch_bgmv(buffer, x, wa_t_all, indicies, layer_idx, 1.0) punica_kernels.dispatch_bgmv(y, buffer, wb_t_all, indicies, layer_idx, - scale) + scale) def add_lora_slice(y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - indicies: torch.LongTensor, - layer_idx: int, - scale: float, - y_offset: int, - y_slice_size: int, - *, - buffer: Optional[torch.Tensor] = None): + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + indicies: torch.LongTensor, + layer_idx: int, + scale: float, + y_offset: int, + y_slice_size: int, + *, + buffer: Optional[torch.Tensor] = None): """ Same as `add_lora` but you can operate on slices of y. Pass whole y, define y_offset and y_slice_size. @@ -145,8 +144,8 @@ def add_lora_slice(y: torch.Tensor, # numerical inaccuracies that would otherwise happen # due to downcasting. buffer = torch.zeros((x.size(0), r), - dtype=torch.float32, - device=x.device) + dtype=torch.float32, + device=x.device) punica_kernels.dispatch_bgmv_low_level( buffer, x, From 64e36dafe65bcd20a7ac524cbdea0769b21f2188 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 15 Feb 2024 05:12:45 +0000 Subject: [PATCH 3/3] Add test_gc --- tests/test_regression.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/test_regression.py b/tests/test_regression.py index c48e474bd889..cb68e9ecfc06 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -4,6 +4,10 @@ will never happen again. """ +import gc + +import torch + from vllm import LLM, SamplingParams @@ -35,6 +39,20 @@ def test_max_tokens_none(): assert len(prompts) == len(outputs) +def test_gc(): + llm = LLM("facebook/opt-125m", enforce_eager=True) + del llm + + gc.collect() + torch.cuda.empty_cache() + + # The memory allocated for model and KV cache should be released. + # The memory allocated for PyTorch and others should be less than 50MB. + # Usually, it's around 10MB. + allocated = torch.cuda.memory_allocated() + assert allocated < 50 * 1024 * 1024 + + if __name__ == "__main__": import pytest pytest.main([__file__])
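
For readers following the semantics spelled out in the `add_lora` docstring above, the sketch below is a minimal, unfused pure-PyTorch reference of what the two `dispatch_bgmv` calls compute per request. It is an illustration only: the name `add_lora_reference`, the demo shapes, and the Python-level loop are assumptions of this note, not part of vLLM or of the patches above, which dispatch to fused CUDA kernels instead.

    import torch


    def add_lora_reference(y: torch.Tensor, x: torch.Tensor,
                           wa_t_all: torch.Tensor, wb_t_all: torch.Tensor,
                           indicies: torch.LongTensor, layer_idx: int,
                           scale: float) -> None:
        """Unfused equivalent of `add_lora`; updates y in-place."""
        for i in range(x.size(0)):
            # Select request i's LoRA A/B weights for this layer and undo the
            # stored transposition: [R, H1] -> [H1, R] and [H2, R] -> [R, H2].
            wa = wa_t_all[indicies[i], layer_idx].transpose(-1, -2)
            wb = wb_t_all[indicies[i], layer_idx].transpose(-1, -2)
            # Accumulate the rank-R bottleneck in float32, mirroring the
            # float32 temporary buffer used by the wrappers above.
            buf = x[i].unsqueeze(0).float() @ wa.float()  # shape [1, R]
            y[i] += (buf @ wb.float() * scale).squeeze(0).to(y.dtype)


    if __name__ == "__main__":
        B, L, R, H1, H2 = 2, 4, 8, 16, 32  # hypothetical demo sizes
        y = torch.zeros(B, H2)
        x = torch.randn(B, H1)
        wa_t_all = torch.randn(3, L, R, H1)  # three loaded LoRAs
        wb_t_all = torch.randn(3, L, H2, R)
        indicies = torch.tensor([0, 2])  # which LoRA each request uses
        add_lora_reference(y, x, wa_t_all, wb_t_all, indicies, 1, 1.0)
        print(y.shape)  # torch.Size([2, 32])

Keeping the rank-R intermediate in float32 mirrors the comment in the wrappers: the low-rank bottleneck is where downcasting error would concentrate if the buffer were kept in half precision.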