Skip to content

Commit

Permalink
use environment variable to pass user's command (#742)
Browse files Browse the repository at this point in the history
  • Loading branch information
xudifsd authored Jan 8, 2020
1 parent 22615af commit f7af134
Show file tree
Hide file tree
Showing 4 changed files with 12 additions and 44 deletions.
25 changes: 3 additions & 22 deletions src/ClusterManager/dist_pod_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,24 +20,6 @@ def __init__(self, template, enable_custom_scheduler=False, secret_templates=Non
self.enable_custom_scheduler = enable_custom_scheduler
self.secret_templates = secret_templates

@staticmethod
def generate_launch_script(dist_role, dist_role_idx, user_id, job_path, cmd):
    """Write the user's command to ``job_command.sh`` in the pod work directory.

    The directory is built from ``config["storage-mount-path"]``, ``work/``,
    ``job_path`` and ``"<dist_role>-<dist_role_idx>"``. When it does not exist
    yet it is created through ``mkdirsAsUser`` for ``user_id``.

    Args:
        dist_role: Role name, used as the first part of the directory name.
        dist_role_idx: Role index, used as the second part of the directory name.
        user_id: Passed to ``mkdirsAsUser`` when the directory must be created.
        job_path: Job directory, joined below the ``work/`` folder.
        cmd: Text written verbatim to ``job_command.sh``.

    Returns:
        The container command list that runs ``/pod/scripts/bootstrap.sh``.
    """
    # NOTE(review): the comment that used to sit here said the ssh folder
    # permission is changed in this function because the permission setup in
    # launch_ps_job may race with init_user.sh ("no such user" error).
    # Nothing in this function touches ssh permissions, so that remark looks
    # stale -- confirm before relying on it.
    local_pod_path = os.path.join(
        config["storage-mount-path"],
        "work/",
        job_path,
        "{}-{}".format(dist_role, dist_role_idx),
    )
    if not os.path.exists(local_pod_path):
        mkdirsAsUser(local_pod_path, user_id)

    launch_script_file = os.path.join(local_pod_path, "job_command.sh")
    # The with-block closes the file on exit; an explicit close() is redundant.
    with open(launch_script_file, 'w') as f:
        f.write(cmd)

    return ["bash", "/pod/scripts/bootstrap.sh"]

def generate_pod(self, pod):
assert(isinstance(self.template, Template))

Expand All @@ -62,11 +44,10 @@ def generate_pod(self, pod):
pod["labels"].append({"name": "distRole", "value": pod["distRole"]})
pod["labels"].append({"name": "distRoleIdx", "value": pod["distRoleIdx"]})

cmd = pod["cmd"]
pod["LaunchCMD"] = DistPodTemplate.generate_launch_script(pod["distRole"], pod["distRoleIdx"], pod["userId"], job_path, cmd)

pod_yaml = self.template.render(job=pod)
return yaml.full_load(pod_yaml)
pod_obj = yaml.full_load(pod_yaml)
pod_obj["spec"]["containers"][0]["env"].append({"name": "DLWS_LAUNCH_CMD", "value": pod["cmd"]})
return pod_obj

def generate_pods(self, job):
"""
Expand Down
28 changes: 7 additions & 21 deletions src/ClusterManager/pod_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,27 +19,13 @@ def __init__(self, template, deployment_template=None, enable_custom_scheduler=F
self.enable_custom_scheduler = enable_custom_scheduler
self.secret_templates = secret_templates

@staticmethod
def generate_launch_script(job_id, path_to_save, user_id, gpu_num, user_script):
    """Write the user's script to ``job_command.sh`` and hand it to ``user_id``.

    Args:
        job_id: Not used by this function; kept for interface compatibility.
        path_to_save: Directory that receives ``job_command.sh``. Created
            through ``mkdirsAsUser`` for ``user_id`` when it does not exist.
        user_id: New owner of the written script (applied with ``sudo chown``).
        gpu_num: Not used by this function; kept for interface compatibility.
        user_script: Text written verbatim to ``job_command.sh``.

    Returns:
        The container command list that runs ``/pod/scripts/bootstrap.sh``.
    """
    # Local import so the block does not depend on the file's import header.
    import subprocess

    if not os.path.exists(path_to_save):
        mkdirsAsUser(path_to_save, user_id)

    launch_script_file = os.path.join(path_to_save, "job_command.sh")
    with open(launch_script_file, 'w') as f:
        f.write(user_script)

    # Pass an argument list instead of a formatted shell string: a user id or
    # path containing whitespace or shell metacharacters can then no longer be
    # split or interpreted as additional shell commands. As before, a non-zero
    # exit status of chown is ignored.
    subprocess.call(["sudo", "chown", str(user_id), launch_script_file])

    return ["bash", "/pod/scripts/bootstrap.sh"]


def generate_deployment(self, pod):
    """Render the deployment template for ``pod`` and parse the result as YAML.

    Args:
        pod: Mapping exposed to the template under the name ``job``.

    Returns:
        The object produced by ``yaml.full_load`` from the rendered text.
    """
    # NOTE(review): this checks self.template although the method renders
    # self.deployment_template -- possibly a copy-paste slip; confirm which
    # attribute the check is meant to guard.
    assert isinstance(self.template, Template)
    return yaml.full_load(self.deployment_template.render(job=pod))


def generate_pod(self, pod):
def generate_pod(self, pod, cmd):
assert(isinstance(self.template, Template))
if self.enable_custom_scheduler:
if "useGPUTopology" in pod and pod["useGPUTopology"]:
Expand Down Expand Up @@ -69,7 +55,11 @@ def generate_pod(self, pod):
pod["gpuLimit"] = 0

pod_yaml = self.template.render(job=pod)
return yaml.full_load(pod_yaml)
# The user's cmd may span multiple lines, so it is appended to the pod object after the YAML is loaded instead of being rendered into the template.
pod_obj = yaml.full_load(pod_yaml)
pod_obj["spec"]["containers"][0]["env"].append({"name": "DLWS_LAUNCH_CMD", "value": cmd})

return pod_obj

def generate_pods(self, job):
"""
Expand Down Expand Up @@ -132,9 +122,6 @@ def generate_pods(self, job):

params = enable_cpu_config(params, job.cluster)

local_pod_path = job.get_hostpath(job.job_path, "master")
params["LaunchCMD"] = PodTemplate.generate_launch_script(params["jobId"], local_pod_path, params["userId"], params["resourcegpu"], params["cmd"])

if "envs" not in params:
params["envs"] =[]

Expand Down Expand Up @@ -167,7 +154,6 @@ def generate_pods(self, job):
pod["podName"] = job.job_id
pods.append(pod)


k8s_pods = []
for idx,pod in enumerate(pods):
pod["numps"] = 0
Expand All @@ -185,7 +171,7 @@ def generate_pods(self, job):
pod["mountpoints"].append({"name": "pod", "containerPath": "/pod", "hostPath": pod_path, "enabled": True})
pod["init-container"] = os.environ["INIT_CONTAINER_IMAGE"]

k8s_pod = self.generate_pod(pod)
k8s_pod = self.generate_pod(pod, params["cmd"])
k8s_pods.append(k8s_pod)

if params["jobtrainingtype"] == "InferenceJob":
Expand Down
2 changes: 1 addition & 1 deletion src/Jobs_Templete/pod.yaml.template
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ spec:
- name: {{ job["podName"] }}
image: {{ job["image"] }}
imagePullPolicy: Always
command: {{ job["LaunchCMD"] }}
command: ["bash", "/pod/scripts/bootstrap.sh"]
readinessProbe:
exec:
command: ["ls", "/pod/running/ROLE_READY"]
Expand Down
1 change: 1 addition & 0 deletions src/init-scripts/bootstrap.sh
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ if [ "$DLWS_ROLE_NAME" = "worker" ];
then
runuser -l ${DLWS_USER_NAME} -c "sleep infinity"
else
printenv DLWS_LAUNCH_CMD > /pod/job_command.sh
chmod +x /pod/job_command.sh
runuser -l ${DLWS_USER_NAME} -c /pod/job_command.sh
# Save exit code
Expand Down

0 comments on commit f7af134

Please sign in to comment.