Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sunspot #300

Merged
merged 16 commits into from
Dec 14, 2022
21 changes: 21 additions & 0 deletions balsam/config/defaults/alcf_sunspot/job-template.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/bash
#PBS -l select={{ num_nodes }}:system=sunspot,place=scatter
#PBS -l walltime={{ wall_time_min//60 | int }}:{{ wall_time_min%60 | int }}:00
#PBS -l filesystems=home
#PBS -A {{ project }}
#PBS -q {{ queue }}

# Sunspot compute nodes reach the internet only through the ALCF proxy;
# without these the launcher cannot contact the Balsam API server.
export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
export http_proxy=http://proxy.alcf.anl.gov:3128
export https_proxy=http://proxy.alcf.anl.gov:3128

export BALSAM_SITE_PATH={{balsam_site_path}}
cd $BALSAM_SITE_PATH

echo "Starting balsam launcher at $(date)"
# Run the launcher for 2 minutes less than the allocation so it can
# shut down cleanly before PBS kills the job.
{{launcher_cmd}} -j {{job_mode}} -t {{wall_time_min - 2}} \
{% for k, v in filter_tags.items() %} --tag {{k}}={{v}} {% endfor %} \
{{partitions}}
echo "Balsam launcher done at $(date)"
21 changes: 21 additions & 0 deletions balsam/config/defaults/alcf_sunspot/settings.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
title: "Sunspot (ALCF)"

compute_node: balsam.platform.compute_node.SunspotNode
mpi_app_launcher: balsam.platform.app_run.SunspotRun
local_app_launcher: balsam.platform.app_run.LocalAppRun
mpirun_allows_node_packing: true

serial_mode_startup_params:
cpu_affinity: none

scheduler_class: balsam.platform.scheduler.PBSScheduler
allowed_queues:
workq:
max_nodes: 128
max_queued_jobs: 100
max_walltime: 60

allowed_projects:
- Aurora_deployment

optional_batch_job_params: {}
2 changes: 2 additions & 0 deletions balsam/platform/app_run/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from .polaris import PolarisRun
from .slurm import SlurmRun
from .summit import SummitJsrun
from .sunspot import SunspotRun
from .theta import ThetaAprun
from .theta_gpu import ThetaGPURun

Expand All @@ -18,5 +19,6 @@
"ThetaGPURun",
"MPICHRun",
"SummitJsrun",
"SunspotRun",
"PerlmutterGPURun",
]
2 changes: 1 addition & 1 deletion balsam/platform/app_run/app_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ def start(self) -> None:
self._set_envs()
cmdline = self._build_preamble() + self._build_cmdline()
logger.info(f"{self.__class__.__name__} Popen: {cmdline}")
log_envs = ["OMP_NUM_THREADS", "OMP_PLACES"]
log_envs = ["OMP_NUM_THREADS", "OMP_PLACES", "OMP_PROC_BIND"]
for k in log_envs:
if k in self._envs.keys():
logger.info(f"{self.__class__.__name__} envs: {k}={self._envs[k]}")
Expand Down
44 changes: 44 additions & 0 deletions balsam/platform/app_run/sunspot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import os

from .app_run import SubprocessAppRun


class SunspotRun(SubprocessAppRun):
    """
    Runs MPI applications on Sunspot via the MPICH/Hydra ``mpiexec``:
    https://www.mpich.org/documentation/guides/
    (the flags used here — ``-ppn``, ``--hosts``, ``--cpu-bind``,
    ``--envall`` — are MPICH-style, not Open MPI).
    """

    def _build_cmdline(self) -> str:
        """Assemble the ``mpiexec`` command line for this app run."""
        nid_str = ",".join(self._node_spec.hostnames)
        # Default to no CPU binding unless launch params request otherwise.
        cpu_bind = self._launch_params.get("cpu_bind", "none")
        args = [
            "mpiexec",
            "-np",
            self.get_num_ranks(),
            "-ppn",
            self._ranks_per_node,
            "--hosts",
            nid_str,
            "--cpu-bind",
            cpu_bind,
            "-d",
            self._threads_per_rank,
            "--envall",
            self._cmdline,
        ]
        return " ".join(str(arg) for arg in args)

    # Override the default because Sunspot's Intel PVC GPUs use Level Zero,
    # not CUDA: device visibility is controlled with ZE_AFFINITY_MASK
    # instead of CUDA_VISIBLE_DEVICES.
    def _set_envs(self) -> None:
        envs = os.environ.copy()
        envs.update(self._envs)
        envs["OMP_NUM_THREADS"] = str(self._threads_per_rank)

        # Check the assigned GPU ID list from the first compute node:
        gpu_ids = self._node_spec.gpu_ids[0]
        if gpu_ids:
            # Enumerate devices in PCI order so mask indices are stable.
            envs["ZE_ENABLE_PCI_ID_DEVICE_ORDER"] = "1"
            envs["ZE_AFFINITY_MASK"] = ",".join(map(str, gpu_ids))

        self._envs = envs
2 changes: 2 additions & 0 deletions balsam/platform/compute_node/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from .alcf_cooley_node import CooleyNode
from .alcf_polaris_node import PolarisNode
from .alcf_sunspot_node import SunspotNode
from .alcf_thetagpu_node import ThetaGPUNode
from .alcf_thetaknl_node import ThetaKNLNode
from .compute_node import ComputeNode
Expand All @@ -19,5 +20,6 @@
"CoriKNLNode",
"PerlmutterGPUNode",
"PolarisNode",
"SunspotNode",
"ComputeNode",
]
55 changes: 55 additions & 0 deletions balsam/platform/compute_node/alcf_sunspot_node.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import logging
import os
from typing import List, Optional, Union

from .compute_node import ComputeNode

logger = logging.getLogger(__name__)
IntStr = Union[int, str]


class SunspotNode(ComputeNode):
    """ALCF Sunspot compute node: 104 CPU cores and 6 PVC GPUs, each GPU
    exposing 2 tiles addressed as "<gpu>.<tile>" strings (e.g. "3.1")."""

    cpu_ids = list(range(104))
    # One logical GPU per PVC tile: "0.0", "0.1", ..., "5.0", "5.1"
    gpu_ids: List[IntStr] = [f"{gid}.{tid}" for gid in range(6) for tid in range(2)]

    @classmethod
    def get_job_nodelist(cls) -> List["SunspotNode"]:
        """
        Get all compute nodes allocated in the current job context.

        Reads PBS_NODEFILE, which lists one hostname per line (or, on some
        systems, comma-separated), e.g.:
            x1922c0s0b0n0
            x1922c0s1b0n0
        """
        nodefile = os.environ["PBS_NODEFILE"]
        with open(nodefile) as fp:
            data = fp.read()
        # Support both newline- and comma-delimited nodefiles.
        splitter = "," if "," in data else None
        hostnames = [h.strip() for h in data.split(splitter) if h.strip()]
        node_list = []
        for hostname in hostnames:
            gpu_ids = cls.discover_gpu_list(hostname)
            # On PBS systems the hostname doubles as the node id.
            node_list.append(cls(hostname, hostname, gpu_ids=gpu_ids))
        return node_list

    @classmethod
    def discover_gpu_list(cls, hostname: str) -> List[IntStr]:
        """Return the GPU tile IDs available on *hostname*."""
        # Return a fresh copy: handing out the shared class attribute would
        # let one node's caller mutate the GPU list of every node.
        gpu_ids: List[IntStr] = list(cls.gpu_ids)
        logger.info(f"{hostname} detected GPU IDs: {gpu_ids}")
        return gpu_ids

    @staticmethod
    def get_scheduler_id() -> Optional[int]:
        """Return the numeric PBS job id, or None when not inside a PBS job."""
        job_id = os.environ.get("PBS_JOBID")
        if job_id is not None:
            # PBS_JOBID looks like "12345.hostname"; keep the numeric part.
            return int(job_id.split(".")[0])
        return None
32 changes: 17 additions & 15 deletions balsam/platform/scheduler/pbs_sched.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,10 @@
import json
import logging
import os
import subprocess
import tempfile
from collections import Counter, defaultdict
from datetime import datetime
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Union

import click
import dateutil.parser

from balsam.util import parse_to_utc
Expand Down Expand Up @@ -328,19 +324,25 @@ def discover_projects(cls) -> List[str]:
Note: Could use sbank; currently uses Cobalt reporting of valid
projects when an invalid project is given
"""
click.echo("Checking with sbank for your current allocations...")
""" click.echo("Checking with sbank for your current allocations...")
with tempfile.NamedTemporaryFile() as fp:
os.chmod(fp.name, 0o777)
proc = subprocess.run(
"sbank projects -r polaris -f project_name --no-header --no-totals --no-sys-msg",
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
encoding="utf-8",
)

sbank_out = proc.stdout
projects = [p.strip() for p in sbank_out.split("\n") if p]
try:
proc = subprocess.run(
"sbank projects -r polaris -f project_name --no-header --no-totals --no-sys-msg",
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
encoding="utf-8",
)
print(f"proc is {proc}")
sbank_out = proc.stdout
projects = [p.strip() for p in sbank_out.split("\n") if p]
except:
projects = None

"""
projects = None
if not projects:
projects = super().discover_projects()
return projects
Expand Down