@@ -1,3 +1,4 @@
+import os
 import random
 from collections.abc import Iterable
 from typing import Dict, List, Optional
@@ -18,6 +19,7 @@
 
 from ..model_config import ModelConfig
 from ..speculative import get_spec_decoder
+from .config import PyTorchConfig
 from .config_utils import is_mla, is_nemotron_hybrid
 from .kv_cache_transceiver import AttentionTypeCpp, create_kv_cache_transceiver
 from .llm_request import ExecutorResponse
@@ -676,3 +678,45 @@ def _try_infer_num_experts(model_config: ModelConfig) -> int:
         return 1
 
     return num_experts
+
+
+def _adjust_torch_mem_fraction(pytorch_backend_config: PyTorchConfig):
+    # FIXME: PyTorch only uses the garbage_collection_threshold setting
+    # if a memory fraction is set, cf.
+    # https://github.com/pytorch/pytorch/blob/cd995bfb2aac8891465809be3ce29543bd524287/c10/cuda/CUDACachingAllocator.cpp#L1357
+    logger.debug("Setting PyTorch memory fraction to 1.0")
+    torch.cuda.set_per_process_memory_fraction(1.0)
+
+    # FIXME: As soon as
+    # torch.cuda._set_allocator_settings (added in PyTorch 2.8.0-rc1)
+    # or a similar API is available, the warning below should be removed
+    # and the allocator GC threshold be set via the new API instead.
+    torch_allocator_config = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")
+    torch_mem_threshold_advised = (
+        torch.cuda.get_allocator_backend() == "native"
+        and "expandable_segments:True" not in torch_allocator_config)
+    torch_mem_threshold_set = "garbage_collection_threshold:" in torch_allocator_config
+    if torch_mem_threshold_advised and not torch_mem_threshold_set:
+        logger.warning(
+            "It is recommended to incl. 'garbage_collection_threshold:0.???' or 'backend:cudaMallocAsync'"
+            " or 'expandable_segments:True' in PYTORCH_CUDA_ALLOC_CONF.")
+
+    # NOTE: Even if a memory threshold was not set (cf. warning above), setting a memory
+    # fraction < 1.0 is beneficial, because
+    # https://github.com/pytorch/pytorch/blob/5228986c395dc79f90d2a2b991deea1eef188260/c10/cuda/CUDACachingAllocator.cpp#L2719
+    # and
+    # https://github.com/pytorch/pytorch/blob/5228986c395dc79f90d2a2b991deea1eef188260/c10/cuda/CUDACachingAllocator.cpp#L1240
+    # lead PyTorch to release all unused memory before hitting the set fraction. This
+    # still mitigates OOM, although at a higher performance impact, because it
+    # effectively resets the allocator cache.
+    if not pytorch_backend_config._limit_torch_cuda_mem_fraction:
+        return
+    mem_reserved = torch.cuda.memory_reserved()
+    mem_free, mem_total = torch.cuda.mem_get_info()
+    safety_margin = 32 * 1024**2
+    mem_torch_max = mem_free + mem_reserved - safety_margin
+    mem_torch_fraction = mem_torch_max / mem_total
+    logger.info(
+        f"Setting PyTorch memory fraction to {mem_torch_fraction} ({mem_torch_max / 1024**3} GiB)"
+    )
+    torch.cuda.set_per_process_memory_fraction(mem_torch_fraction)
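
For context, here is a minimal standalone sketch of the same fraction calculation the new helper performs, assuming a single CUDA device and PyTorch's default (native) caching allocator. Only the 32 MiB safety margin is taken from the diff; the availability check and the final print are illustrative, not part of the PR.

```python
# Minimal sketch (not part of the diff): cap PyTorch at the memory that is
# currently free plus what its caching allocator already holds, minus headroom.
import torch

assert torch.cuda.is_available()
mem_reserved = torch.cuda.memory_reserved()      # bytes held by PyTorch's caching allocator
mem_free, mem_total = torch.cuda.mem_get_info()  # free / total device bytes from the driver
safety_margin = 32 * 1024**2                     # ~32 MiB headroom, as in the helper above
fraction = (mem_free + mem_reserved - safety_margin) / mem_total
torch.cuda.set_per_process_memory_fraction(fraction)
print(f"PyTorch capped at {fraction:.3f} of {mem_total / 1024**3:.1f} GiB")
```

To satisfy the warning emitted by the helper, `PYTORCH_CUDA_ALLOC_CONF` can be exported before launching the process, e.g. `PYTORCH_CUDA_ALLOC_CONF=garbage_collection_threshold:0.8` (0.8 is only an example value), or one of the alternatives named in the message (`backend:cudaMallocAsync`, `expandable_segments:True`).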