Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

use environment variable to pass user's command #742

Merged
merged 1 commit into from
Jan 8, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 3 additions & 22 deletions src/ClusterManager/dist_pod_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,24 +20,6 @@ def __init__(self, template, enable_custom_scheduler=False, secret_templates=Non
self.enable_custom_scheduler = enable_custom_scheduler
self.secret_templates = secret_templates

@staticmethod
def generate_launch_script(dist_role, dist_role_idx, user_id, job_path, cmd):
    """Write the user's command to ``job_command.sh`` for one distributed pod.

    The script is written under
    ``<config["storage-mount-path"]>/work/<job_path>/<dist_role>-<dist_role_idx>``.
    If that directory does not exist it is created with ``mkdirsAsUser``.

    Args:
        dist_role: Role name used in the directory name.
        dist_role_idx: Role index used in the directory name.
        user_id: Passed to ``mkdirsAsUser`` when the directory is created.
        job_path: Job directory, relative to the ``work/`` folder.
        cmd: Text written verbatim into ``job_command.sh``.

    Returns:
        The fixed container command ``["bash", "/pod/scripts/bootstrap.sh"]``.
    """
    # NOTE(review): the comment originally placed here said the ssh folder
    # permission is changed in this function because the permission setup
    # script in launch_ps_job may race with init_user.sh and cause a
    # "no such user" error. No permission change is visible in this
    # function -- confirm whether that comment is stale.
    local_pod_path = os.path.join(
        config["storage-mount-path"],
        "work/",
        job_path,
        "{}-{}".format(dist_role, dist_role_idx)
    )
    if not os.path.exists(local_pod_path):
        mkdirsAsUser(local_pod_path, user_id)

    launch_script_file = os.path.join(local_pod_path, "job_command.sh")
    # The context manager closes the file; no explicit close() is needed.
    with open(launch_script_file, 'w') as f:
        f.write(cmd)

    return ["bash", "/pod/scripts/bootstrap.sh"]

def generate_pod(self, pod):
assert(isinstance(self.template, Template))

Expand All @@ -62,11 +44,10 @@ def generate_pod(self, pod):
pod["labels"].append({"name": "distRole", "value": pod["distRole"]})
pod["labels"].append({"name": "distRoleIdx", "value": pod["distRoleIdx"]})

cmd = pod["cmd"]
pod["LaunchCMD"] = DistPodTemplate.generate_launch_script(pod["distRole"], pod["distRoleIdx"], pod["userId"], job_path, cmd)

pod_yaml = self.template.render(job=pod)
return yaml.full_load(pod_yaml)
pod_obj = yaml.full_load(pod_yaml)
pod_obj["spec"]["containers"][0]["env"].append({"name": "DLWS_LAUNCH_CMD", "value": pod["cmd"]})
return pod_obj

def generate_pods(self, job):
"""
Expand Down
28 changes: 7 additions & 21 deletions src/ClusterManager/pod_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,27 +19,13 @@ def __init__(self, template, deployment_template=None, enable_custom_scheduler=F
self.enable_custom_scheduler = enable_custom_scheduler
self.secret_templates = secret_templates

@staticmethod
def generate_launch_script(job_id, path_to_save, user_id, gpu_num, user_script):
    """Write the user's script to ``job_command.sh`` and chown it to the user.

    Args:
        job_id: Unused here; kept so existing callers keep working.
        path_to_save: Directory that receives ``job_command.sh``. It is
            created with ``mkdirsAsUser`` when it does not exist.
        user_id: Owner passed to ``mkdirsAsUser`` and to ``chown``.
        gpu_num: Unused here; kept so existing callers keep working.
        user_script: Text written verbatim into ``job_command.sh``.

    Returns:
        The fixed container command ``["bash", "/pod/scripts/bootstrap.sh"]``.
    """
    import logging
    import subprocess

    if not os.path.exists(path_to_save):
        mkdirsAsUser(path_to_save, user_id)

    launch_script_file = os.path.join(path_to_save, "job_command.sh")
    with open(launch_script_file, 'w') as f:
        f.write(user_script)

    # Pass an argument list instead of a formatted shell string so that
    # shell metacharacters or spaces in user_id / the path are never
    # interpreted by a shell.
    try:
        # The exit status is ignored: ownership change stays best-effort.
        subprocess.call(["sudo", "chown", str(user_id), launch_script_file])
    except OSError:
        # e.g. sudo is not installed; log and continue rather than crash.
        logging.exception("failed to run chown on %s", launch_script_file)

    return ["bash", "/pod/scripts/bootstrap.sh"]


def generate_deployment(self, pod):
    """Render the deployment template for ``pod`` and parse the YAML.

    Args:
        pod: Mapping exposed to the template as ``job``.

    Returns:
        The object produced by ``yaml.full_load`` on the rendered text.

    Raises:
        AssertionError: If ``self.deployment_template`` is not a ``Template``.
    """
    # Validate the template that is actually rendered below
    # (deployment_template, not the pod template).
    assert isinstance(self.deployment_template, Template)
    deployment_yaml = self.deployment_template.render(job=pod)
    return yaml.full_load(deployment_yaml)


def generate_pod(self, pod):
def generate_pod(self, pod, cmd):
assert(isinstance(self.template, Template))
if self.enable_custom_scheduler:
if "useGPUTopology" in pod and pod["useGPUTopology"]:
Expand Down Expand Up @@ -69,7 +55,11 @@ def generate_pod(self, pod):
pod["gpuLimit"] = 0

pod_yaml = self.template.render(job=pod)
return yaml.full_load(pod_yaml)
# The user's cmd can span multiple lines, so it is added after the YAML is loaded.
pod_obj = yaml.full_load(pod_yaml)
pod_obj["spec"]["containers"][0]["env"].append({"name": "DLWS_LAUNCH_CMD", "value": cmd})

return pod_obj

def generate_pods(self, job):
"""
Expand Down Expand Up @@ -132,9 +122,6 @@ def generate_pods(self, job):

params = enable_cpu_config(params, job.cluster)

local_pod_path = job.get_hostpath(job.job_path, "master")
params["LaunchCMD"] = PodTemplate.generate_launch_script(params["jobId"], local_pod_path, params["userId"], params["resourcegpu"], params["cmd"])

if "envs" not in params:
params["envs"] =[]

Expand Down Expand Up @@ -167,7 +154,6 @@ def generate_pods(self, job):
pod["podName"] = job.job_id
pods.append(pod)


k8s_pods = []
for idx,pod in enumerate(pods):
pod["numps"] = 0
Expand All @@ -185,7 +171,7 @@ def generate_pods(self, job):
pod["mountpoints"].append({"name": "pod", "containerPath": "/pod", "hostPath": pod_path, "enabled": True})
pod["init-container"] = os.environ["INIT_CONTAINER_IMAGE"]

k8s_pod = self.generate_pod(pod)
k8s_pod = self.generate_pod(pod, params["cmd"])
k8s_pods.append(k8s_pod)

if params["jobtrainingtype"] == "InferenceJob":
Expand Down
2 changes: 1 addition & 1 deletion src/Jobs_Templete/pod.yaml.template
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ spec:
- name: {{ job["podName"] }}
image: {{ job["image"] }}
imagePullPolicy: Always
command: {{ job["LaunchCMD"] }}
command: ["bash", "/pod/scripts/bootstrap.sh"]
readinessProbe:
exec:
command: ["ls", "/pod/running/ROLE_READY"]
Expand Down
1 change: 1 addition & 0 deletions src/init-scripts/bootstrap.sh
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ if [ "$DLWS_ROLE_NAME" = "worker" ];
then
runuser -l ${DLWS_USER_NAME} -c "sleep infinity"
else
printenv DLWS_LAUNCH_CMD > /pod/job_command.sh
chmod +x /pod/job_command.sh
runuser -l ${DLWS_USER_NAME} -c /pod/job_command.sh
# Save exit code
Expand Down