From a68b211c81c3f5caeaeb38bb3f7a659251f831d4 Mon Sep 17 00:00:00 2001 From: Alexander Raistrick Date: Mon, 31 Jul 2023 12:01:39 -0400 Subject: [PATCH 1/2] Add pyadl GPU count checking, add set/get for HIP_VISIBLE_DEVICES etc --- docs/ConfiguringInfinigen.md | 6 +- docs/HelloWorld.md | 2 +- requirements.txt | 1 + worldgen/tools/util/submitit_emulator.py | 91 ++++++++++++++++-------- 4 files changed, 68 insertions(+), 32 deletions(-) diff --git a/docs/ConfiguringInfinigen.md b/docs/ConfiguringInfinigen.md index d1fa2124b..7ea329d5d 100644 --- a/docs/ConfiguringInfinigen.md +++ b/docs/ConfiguringInfinigen.md @@ -49,7 +49,7 @@ Now that you understand the two major python programs and how to configure them, # Original hello world command python -m tools.manage_datagen_jobs --output_folder outputs/hello_world --num_scenes 1 --specific_seed 0 \ --configs desert.gin simple.gin --pipeline_configs local_16GB.gin monocular.gin blender_gt.gin \ ---pipeline_overrides LocalScheduleHandler.use_gpu=False +--pipeline_overrides LocalScheduleHandler.gpu_type=None ``` Here is a breakdown of what every commandline argument does, and ideas for how you could change them / swap them out: @@ -64,7 +64,7 @@ Here is a breakdown of what every commandline argument does, and ideas for how y - `local_16GB.gin` specifies to run only a single scene at a time, and to run each task as a local python process. See [here](#configuring-available-computing-resources) for more options - `monocular.gin` specifies that we want a single image per scene, not stereo or video. See [here](#rendering-video-stereo-and-other-data-formats) for more options. - `blender_gt.gin` specifies to extract ground truth labels (depth, surface normals, etc) using Blender's in-built render. If you do not need these, remove this config to save on runtime. - - `--pipeline_overrides LocalScheduleHandler.use_gpu=False` tells the system not to look for available GPUs, and to _not_ make them available to any jobs. 
This is intended only to make the Hello World easier to run, and work on non-NVIDIA systems. Please [click here](#using-gpu-acceleration) for full instructions on using a GPU. + - `--pipeline_overrides LocalScheduleHandler.gpu_type=None` tells the system not to look for available GPUs, and to _not_ make them available to any jobs. This is intended only to make the Hello World easier to run, and work on non-NVIDIA systems. Please [click here](#using-gpu-acceleration) for full instructions on using a GPU. ## Commandline Options in Detail @@ -90,7 +90,7 @@ Infinigen currently only supports NVIDIA GPUs. Infinigen can use a GPU in accele 1. In the `opengl_gt` step (if enabled) our custom ground truth code uses OpenGL and thus requires access to a GPU. To enable these GPU accelerated steps: - - First, if you are using a `local_*.gin` pipeline config, you must first remove `--pipeline_overrides LocalScheduleHandler.use_gpu=False` from our Hello World command, or otherwise ensure this value is set to true via configs/overrides. This will make the GPU _visible_ to each child process, and will cause _rendering_ to automatically detect and use the GPU. `slurm.gin` assumes GPUs will be available by default, set it's GPU request amounts to 0 if this is not the case for your cluster. + - First, if you are using a `local_*.gin` pipeline config, you must remove `--pipeline_overrides LocalScheduleHandler.gpu_type=None` from our Hello World command, or otherwise ensure `gpu_type` is set to `'NVIDIA'` or `'AMD'` via configs/overrides. This will make the GPU _visible_ to each child process, and will cause _rendering_ to automatically detect and use the GPU. `slurm.gin` assumes GPUs will be available by default; set its GPU request amounts to 0 if this is not the case for your cluster. - To enable GPU-acceleration for `fine_terrain`, you must ensure that `install.sh` was run on a machine with CUDA, then add `cuda_terrain.gin` to your `--pipeline_configs`. 
- OpenGL GT can be enabled described in [Extended ground-truth](GroundTruthAnnotations.md) diff --git a/docs/HelloWorld.md b/docs/HelloWorld.md index 1c01b45e5..b4cff549e 100644 --- a/docs/HelloWorld.md +++ b/docs/HelloWorld.md @@ -41,7 +41,7 @@ We provide `tools/manage_datagen_jobs.py`, a utility which runs similar steps au ``` python -m tools.manage_datagen_jobs --output_folder outputs/hello_world --num_scenes 1 --specific_seed 0 ---configs desert.gin simple.gin --pipeline_configs local_16GB.gin monocular.gin blender_gt.gin --pipeline_overrides LocalScheduleHandler.use_gpu=False +--configs desert.gin simple.gin --pipeline_configs local_16GB.gin monocular.gin blender_gt.gin --pipeline_overrides LocalScheduleHandler.gpu_type=None ``` This command will repeatedly print summaries of the status of each stage of the pipeline. Please look in `outputs/hello_world/1/logs` for full output logs of the underlying tasks. diff --git a/requirements.txt b/requirements.txt index 156ca419b..2d5b162a4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,3 +27,4 @@ landlab==2.4.1 scikit-learn psutil pyrender +pyadl \ No newline at end of file diff --git a/worldgen/tools/util/submitit_emulator.py b/worldgen/tools/util/submitit_emulator.py index 5612735c3..13520a0fb 100644 --- a/worldgen/tools/util/submitit_emulator.py +++ b/worldgen/tools/util/submitit_emulator.py @@ -24,11 +24,53 @@ import threading import submitit import gin +from pyadl import ADLManager import numpy as np from shutil import which -CUDA_VARNAME = "CUDA_VISIBLE_DEVICES" +GPU_VISIBIITY_ENVVAR_NAMES = { + 'NVIDIA': 'CUDA_VISIBLE_DEVICES', + 'AMD': 'HIP_VISIBLE_DEVICES' +} + +NVIDIA_SMI_PATH = '/bin/nvidia-smi' + +def get_hardware_gpus(gpu_type): + if gpu_type is None: + return {'0'} + elif gpu_type == 'NVIDIA': + if which(NVIDIA_SMI_PATH) is None: + raise ValueError(f'Attempted to use {gpu_type=} but could not find {NVIDIA_SMI_PATH}') + result = subprocess.check_output(f'{NVIDIA_SMI_PATH} -L'.split()).decode() 
+ return set(i for i in range(len(result.splitlines()))) + elif gpu_type == 'AMD': + return set(i for i, _ in enumerate(ADLManager.getInstance().getDevices())) + else: + raise ValueError(f'Unrecognized {gpu_type=}') + +def get_visibile_gpus(gpu_type): + + gpus_uuids = get_hardware_gpus(gpu_type) + + envvar = GPU_VISIBIITY_ENVVAR_NAMES[gpu_type] + if envvar in os.environ: + visible = [int(s.strip()) for s in os.environ[envvar].split(',')] + gpus_uuids = gpus_uuids.intersection(visible) + logging.warning(f"Restricting to {gpus_uuids=} due to toplevel {envvar} setting") + + return gpus_uuids + +def set_gpu_visibility(gpu_type, devices=None): + + if devices is None: + varstr = '' + else: + varstr = ','.join([str(i) for i in devices]) + + envvar = GPU_VISIBIITY_ENVVAR_NAMES.get(gpu_type) + if envvar is not None: + os.environ[envvar] = varstr @dataclass class LocalJob: @@ -67,19 +109,15 @@ def get_fake_job_id(): # Lahav assures me these will never conflict return np.random.randint(int(1e10), int(1e11)) -def job_wrapper(func, inner_args, inner_kwargs, stdout_file: Path, stderr_file: Path, cuda_devices=None): - +def job_wrapper(func, inner_args, inner_kwargs, stdout_file: Path, stderr_file: Path, gpu_devices=None, gpu_type=None): with stdout_file.open('w') as stdout, stderr_file.open('w') as stderr: sys.stdout = stdout sys.stderr = stderr - if cuda_devices is not None: - os.environ[CUDA_VARNAME] = ','.join([str(i) for i in cuda_devices]) - else: - os.environ[CUDA_VARNAME] = '' + set_gpu_visibility(gpu_type, gpu_devices) return func(*inner_args, **inner_kwargs) -def launch_local(func, args, kwargs, job_id, log_folder, name, cuda_devices=None): +def launch_local(func, args, kwargs, job_id, log_folder, name, gpu_devices=None, gpu_type=None): stderr_file = log_folder / f"{job_id}_0_log.err" stdout_file = log_folder / f"{job_id}_0_log.out" @@ -92,7 +130,8 @@ def launch_local(func, args, kwargs, job_id, log_folder, name, cuda_devices=None inner_kwargs=kwargs, stdout_file=stdout_file, 
stderr_file=stderr_file, - cuda_devices=cuda_devices + gpu_devices=gpu_devices, + gpu_type=gpu_type ) proc = Process(target=job_wrapper, kwargs=kwargs, name=name) proc.start() @@ -126,10 +165,10 @@ def instance(cls): cls._inst = cls() return cls._inst - def __init__(self, jobs_per_gpu=1, use_gpu=True): + def __init__(self, jobs_per_gpu=1, gpu_type='NVIDIA'): self.queue = [] self.jobs_per_gpu = jobs_per_gpu - self.use_gpu = use_gpu + self.gpu_type = gpu_type def enqueue(self, func, args, kwargs, params, log_folder): @@ -149,19 +188,11 @@ def total_resources(self): resources = {} - if self.use_gpu: - if which('/bin/nvidia-smi') is not None: - result = subprocess.check_output('/bin/nvidia-smi -L'.split()).decode() - gpus_uuids = set(i for i in range(len(result.splitlines()))) - - if CUDA_VARNAME in os.environ: - visible = [int(s.strip()) for s in os.environ[CUDA_VARNAME].split(',')] - gpus_uuids = gpus_uuids.intersection(visible) - print(f"Restricting to {gpus_uuids=} due to toplevel {CUDA_VARNAME} setting") - - resources['gpus'] = set(itertools.product(range(len(gpus_uuids)), range(self.jobs_per_gpu))) - else: - resources['gpus'] = {'0'} + if self.gpu_type is not None: + gpu_uuids = get_visibile_gpus(self.gpu_type) + if len(gpu_uuids) == 0: + gpu_uuids = {'0'} + resources['gpus'] = set(itertools.product(range(len(gpu_uuids)), range(self.jobs_per_gpu))) return resources @@ -195,10 +226,14 @@ def dispatch(self, job_rec, resources): gpu_idxs = [g[0] for g in gpu_assignment] job_rec['job'].process = launch_local( - func=job_rec["func"], args=job_rec["args"], kwargs=job_rec["kwargs"], - job_id=job_rec["job"].job_id, log_folder=job_rec["log_folder"], + func=job_rec["func"], + args=job_rec["args"], + kwargs=job_rec["kwargs"], + job_id=job_rec["job"].job_id, + log_folder=job_rec["log_folder"], name=job_rec["params"].get("name", None), - cuda_devices=gpu_idxs + gpu_devices=gpu_idxs, + gpu_type=self.gpu_type ) job_rec['gpu_assignment'] = gpu_assignment @@ -206,7 +241,7 @@ def 
attempt_dispatch_job(self, job_rec, available, total, select_gpus='first'): n_gpus = job_rec['params'].get('gpus', 0) or 0 - if n_gpus == 0 or not self.use_gpu: + if n_gpus == 0 or self.gpu_type is None: return self.dispatch(job_rec, resources={}) if n_gpus <= len(available['gpus']): From 4fa02935e004f8e6da3fb0c3573a1e08d2931119 Mon Sep 17 00:00:00 2001 From: pvl-bot Date: Tue, 1 Aug 2023 13:48:39 -0400 Subject: [PATCH 2/2] Update infinigen_gpl pointer to amd_support branch --- worldgen/infinigen_gpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/worldgen/infinigen_gpl b/worldgen/infinigen_gpl index 0e11ee0e8..64c1d32fe 160000 --- a/worldgen/infinigen_gpl +++ b/worldgen/infinigen_gpl @@ -1 +1 @@ -Subproject commit 0e11ee0e8a19cfa46d9d0d20832f5089986f5427 +Subproject commit 64c1d32fe254afe10c5978184b5c0f18dfa2a642