+import os
 import random
 from collections.abc import Iterable
 from typing import Dict, List, Optional

 from ..model_config import ModelConfig
 from ..speculative import get_spec_decoder
+from .config import PyTorchConfig
 from .config_utils import is_mla, is_nemotron_hybrid
 from .kv_cache_transceiver import AttentionTypeCpp, create_kv_cache_transceiver
 from .llm_request import ExecutorResponse
@@ -718,3 +720,45 @@ def _try_infer_num_experts(model_config: ModelConfig) -> int:
         return 1
 
     return num_experts
+
+
+def _adjust_torch_mem_fraction(pytorch_backend_config: PyTorchConfig):
+    # FIXME: PyTorch only uses the garbage_collection_threshold setting
+    # if a memory fraction is set, cf.
+    # https://github.com/pytorch/pytorch/blob/cd995bfb2aac8891465809be3ce29543bd524287/c10/cuda/CUDACachingAllocator.cpp#L1357
+    logger.debug("Setting PyTorch memory fraction to 1.0")
+    torch.cuda.set_per_process_memory_fraction(1.0)
+
+    # FIXME: As soon as
+    # torch.cuda._set_allocator_settings (added in PyTorch 2.8.0-rc1)
+    # or a similar API is available, the warning below should be removed
+    # and the allocator GC threshold be set via the new API instead.
+    torch_allocator_config = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")
+    torch_mem_threshold_advised = (
+        torch.cuda.get_allocator_backend() == "native"
+        and "expandable_segments:True" not in torch_allocator_config)
+    torch_mem_threshold_set = "garbage_collection_threshold:" in torch_allocator_config
+    if torch_mem_threshold_advised and not torch_mem_threshold_set:
+        logger.warning(
+            "It is recommended to incl. 'garbage_collection_threshold:0.???' or 'backend:cudaMallocAsync'"
+            " or 'expandable_segments:True' in PYTORCH_CUDA_ALLOC_CONF.")
+
+    # NOTE: Even if a memory threshold was not set (cf. warning above), setting a memory
+    # fraction < 1.0 is beneficial, because
+    # https://github.com/pytorch/pytorch/blob/5228986c395dc79f90d2a2b991deea1eef188260/c10/cuda/CUDACachingAllocator.cpp#L2719
+    # and
+    # https://github.com/pytorch/pytorch/blob/5228986c395dc79f90d2a2b991deea1eef188260/c10/cuda/CUDACachingAllocator.cpp#L1240
+    # lead PyTorch to release all unused memory before hitting the set fraction. This
+    # still mitigates OOM, although at a higher performance impact, because it
+    # effectively resets the allocator cache.
+    if not pytorch_backend_config._limit_torch_cuda_mem_fraction:
+        return
+    mem_reserved = torch.cuda.memory_reserved()
+    mem_free, mem_total = torch.cuda.mem_get_info()
+    safety_margin = 32 * 1024**2
+    mem_torch_max = mem_free + mem_reserved - safety_margin
+    mem_torch_fraction = mem_torch_max / mem_total
+    logger.info(
+        f"Setting PyTorch memory fraction to {mem_torch_fraction} ({mem_torch_max / 1024**3} GiB)"
+    )
+    torch.cuda.set_per_process_memory_fraction(mem_torch_fraction)
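
For illustration only (this is not part of the diff): one way a launcher script could follow the warning about PYTORCH_CUDA_ALLOC_CONF is to export a garbage-collection threshold before torch is imported. The 0.8 value is an arbitrary placeholder, and the variable must be set before the CUDA caching allocator initializes, i.e. before the first CUDA allocation in the process.

import os

# Arbitrary example threshold: fraction of the memory limit at which the
# allocator starts reclaiming cached, unused blocks.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "garbage_collection_threshold:0.8")

import torch  # imported after setting the variable so the allocator picks it up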
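
Likewise for illustration, a small sketch of the fraction computed at the end of _adjust_torch_mem_fraction, using hypothetical numbers: an 80 GiB device with 10 GiB already held in the PyTorch cache and 60 GiB reported free by the driver.

# Hypothetical byte counts, chosen only to make the arithmetic concrete.
mem_reserved = 10 * 1024**3                        # held by the PyTorch cache
mem_free, mem_total = 60 * 1024**3, 80 * 1024**3   # as reported by the driver
safety_margin = 32 * 1024**2
mem_torch_fraction = (mem_free + mem_reserved - safety_margin) / mem_total
print(round(mem_torch_fraction, 3))  # ~0.875: what torch already holds plus what is free, minus the margin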