diff --git a/setup.py b/setup.py
index 730a190a876..fafea904998 100644
--- a/setup.py
+++ b/setup.py
@@ -64,6 +64,7 @@
 FORCE_CXX11_ABI = os.getenv("FLASH_ATTENTION_FORCE_CXX11_ABI", "FALSE") == "TRUE"
 USE_TRITON_ROCM = os.getenv("FLASH_ATTENTION_TRITON_AMD_ENABLE", "FALSE") == "TRUE"
 SKIP_CK_BUILD = os.getenv("FLASH_ATTENTION_SKIP_CK_BUILD", "TRUE") == "TRUE" if USE_TRITON_ROCM else False
+NVCC_THREADS = os.getenv("NVCC_THREADS") or "4"
 
 @functools.lru_cache(maxsize=None)
 def cuda_archs() -> str:
@@ -186,8 +187,7 @@ def detect_hipify_v2():
 
 
 def append_nvcc_threads(nvcc_extra_args):
-    nvcc_threads = os.getenv("NVCC_THREADS") or "4"
-    return nvcc_extra_args + ["--threads", nvcc_threads]
+    return nvcc_extra_args + ["--threads", NVCC_THREADS]
 
 
 def rename_cpp_to_cu(cpp_files):
@@ -571,15 +571,30 @@ def __init__(self, *args, **kwargs) -> None:
         if not os.environ.get("MAX_JOBS"):
             import psutil
 
+            # os.cpu_count() returns None when the CPU count cannot be determined.
+            cpu_count = os.cpu_count() or 1
+
+            # NVCC_THREADS is passed to nvcc verbatim via `--threads`, where 0 means
+            # "one thread per CPU" -- budget memory for that instead of for 1 thread.
+            nvcc_threads = int(NVCC_THREADS)
+            if nvcc_threads < 1:
+                nvcc_threads = cpu_count
+
             # calculate the maximum allowed NUM_JOBS based on cores
-            max_num_jobs_cores = max(1, os.cpu_count() // 2)
+            max_num_jobs_cores = max(1, cpu_count // 2)
 
             # calculate the maximum allowed NUM_JOBS based on free memory
             free_memory_gb = psutil.virtual_memory().available / (1024 ** 3)  # free memory in GB
-            max_num_jobs_memory = int(free_memory_gb / 9)  # each JOB peak memory cost is ~8-9GB when threads = 4
+            # Assume worst-case peak observed memory usage of ~5GB per NVCC thread.
+            # Limit: peak_threads = max_jobs * nvcc_threads and peak_threads * 5GB <= free_memory.
+            max_num_jobs_memory = max(1, int(free_memory_gb / (5 * nvcc_threads)))
 
             # pick lower value of jobs based on cores vs memory metric to minimize oom and swap usage during compilation
             max_jobs = max(1, min(max_num_jobs_cores, max_num_jobs_memory))
+            print(
+                f"Auto set MAX_JOBS to `{max_jobs}`, NVCC_THREADS to `{nvcc_threads}`. "
+                "If you see memory pressure, please use a lower `MAX_JOBS=N` or `NVCC_THREADS=N` value."
+            )
             os.environ["MAX_JOBS"] = str(max_jobs)
 
         super().__init__(*args, **kwargs)