fix breaks of the deployment pipeline for on-premise machines and updated kernel #639

Merged: 13 commits, Nov 8, 2019

Changes from 8 commits
2 changes: 1 addition & 1 deletion azure-pipelines.yml
@@ -3,7 +3,7 @@
# Add steps that build, run tests, deploy, and more:
# https://aka.ms/yaml
trigger:
-- citest
+- dltsdev

pool:
name: 'DLTS-Platform'
51 changes: 38 additions & 13 deletions src/ClusterBootstrap/az_tools.py
@@ -377,6 +377,7 @@ def create_nfs_nsg():
print(output)

print type(config["cloud_config"]["nfs_ssh"]["source_ips"]), config["cloud_config"]["nfs_ssh"]["source_ips"],type(source_addresses_prefixes), source_addresses_prefixes
+merged_ip = utils.keep_widest_subnet(config["cloud_config"]["nfs_ssh"]["source_ips"] + source_addresses_prefixes)
cmd = """
az network nsg rule create \
--resource-group %s \
@@ -389,8 +390,10 @@
""" % ( config["azure_cluster"]["resource_group_name"],
config["azure_cluster"]["nfs_nsg_name"],
config["cloud_config"]["nfs_ssh"]["port"],
-" ".join(list(set(config["cloud_config"]["nfs_ssh"]["source_ips"] + source_addresses_prefixes))),
+" ".join(merged_ip),
)
+if verbose:
+    print(cmd)
if not no_execution:
output = utils.exec_cmd_local(cmd)
print(output)
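
Note: keep_widest_subnet is a helper in the repo's utils module whose implementation is not part of this diff. A minimal sketch of what it presumably does (deduplicate the CIDR list and drop any range already covered by a wider one), assuming all entries are IPv4 CIDRs (Python 3.7+ for subnet_of):

import ipaddress

def keep_widest_subnet(cidrs):
    # Deduplicate, then keep only networks not contained in a wider one.
    nets = [ipaddress.ip_network(c, strict=False) for c in set(cidrs)]
    return sorted(str(n) for n in nets
                  if not any(n != o and n.subnet_of(o) for o in nets))

print(keep_widest_subnet(["10.0.0.0/8", "10.1.0.0/16", "192.168.1.0/24"]))
# ['10.0.0.0/8', '192.168.1.0/24']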
@@ -489,6 +492,26 @@ def create_cluster(arm_vm_password=None, parallelism=1):
create_vm_param(i, "nfs", config["azure_cluster"]["nfs_vm_size"], False,
arm_vm_password, config["azure_cluster"]["nfs_vm"][i] if i < len(config["azure_cluster"]["nfs_vm"]) else None )

+def add_workers(arm_vm_password=None, parallelism=1):
+    # assert config["priority"] == "regular" and "vmss cloudinit not supported yet"
+    if config["priority"] == "regular":
+        print("entering")
+        if parallelism > 1:
+            # TODO: Tolerate faults
+            from multiprocessing import Pool
+            args_list = [(i, "worker", config["azure_cluster"]["worker_vm_size"], arm_vm_password is not None, arm_vm_password)
+                         for i in range(int(config["azure_cluster"]["worker_node_num"]))]
+            pool = Pool(processes=parallelism)
+            pool.map(create_vm_param_wrapper, args_list)
+            pool.close()
+        else:
+            for i in range(int(config["azure_cluster"]["worker_node_num"])):
+                create_vm_param(i, "worker", config["azure_cluster"]["worker_vm_size"],
+                                arm_vm_password is not None, arm_vm_password)
+    elif config["priority"] == "low":
+        utils.render_template("./template/vmss/vmss.sh.template", "scripts/vmss.sh",config)
+        utils.exec_cmd_local("chmod +x scripts/vmss.sh;./scripts/vmss.sh")

def create_vm_param_wrapper(arg_tuple):
i, role, vm_size, no_az, arm_vm_password = arg_tuple
return create_vm_param(i, role, vm_size, no_az, arm_vm_password)
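
Note: the wrapper exists because Pool.map passes exactly one argument to the mapped function. A self-contained illustration of the same tuple-unpacking pattern (the function names and VM size below are made up for the demo):

from multiprocessing import Pool

def create(i, role, vm_size):
    return "vm %02d: role=%s size=%s" % (i, role, vm_size)

def create_wrapper(arg_tuple):
    # Pool.map supplies a single argument, so unpack the tuple manually.
    i, role, vm_size = arg_tuple
    return create(i, role, vm_size)

if __name__ == "__main__":
    args_list = [(i, "worker", "Standard_NC6") for i in range(4)]
    pool = Pool(processes=2)
    print(pool.map(create_wrapper, args_list))
    pool.close()
    pool.join()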
@@ -666,7 +689,7 @@ def get_disk_from_vm(vmname):
def gen_cluster_config(output_file_name, output_file=True, no_az=False):
if config["priority"] == "low":
utils.render_template("./template/dns/cname_and_private_ips.sh.template", "scripts/cname_and_ips.sh", config)
-utils.exec_cmd_local("chmod +x scripts/cname_and_ips.sh")
+utils.exec_cmd_local("chmod +x scripts/cname_and_ips.sh;bash scripts/cname_and_ips.sh")
Review comment (Collaborator): add a space after ;
print "\nPlease copy the commands in dns_add_commands and register the DNS records on http://servicebook/dns/self-service.html\n"
bSQLOnly = (config["azure_cluster"]["infra_node_num"] <= 0)
if useAzureFileshare() and not no_az:
Expand Down Expand Up @@ -718,14 +741,13 @@ def gen_cluster_config(output_file_name, output_file=True, no_az=False):
cc["deploydockerETCD"] = False
cc["platform-scripts"] = "ubuntu"
cc["basic_auth"] = "%s,admin,1000" % uuid.uuid4().hex[:16]
-domain_mapping = {"regular":"%s.cloudapp.azure.com" % config["azure_cluster"]["azure_location"], "low": config["domain_name"]}
+domain_mapping = {"regular":"%s.cloudapp.azure.com" % config["azure_cluster"]["azure_location"], "low": "redmond.corp.microsoft.com"}
Review comment (Member): Why do we use a hard-coded "redmond.corp.microsoft.com"?
Reply (Contributor, author): This is an error that happened while I was cherry-picking my changes.

if not bSQLOnly:
cc["network"] = {"domain": domain_mapping[config["priority"]]}

cc["machines"] = {}
for i in range(int(config["azure_cluster"]["infra_node_num"])):
-vmname = "%s-infra%02d" % (config["azure_cluster"]
-                           ["cluster_name"].lower(), i + 1)
+vmname = "{}-infra{:02d}".format(config["azure_cluster"]["cluster_name"], i + 1).lower()
cc["machines"][vmname] = {"role": "infrastructure", "private-ip": get_vm_ip(i, "infra")}
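
Note: the %-formatting and the str.format version produce the same name for ASCII cluster names; the new line also lowercases the whole result in one place instead of only the cluster name. With an illustrative value:

cluster_name = "MyCluster"
i = 0
old = "%s-infra%02d" % (cluster_name.lower(), i + 1)
new = "{}-infra{:02d}".format(cluster_name, i + 1).lower()
print("%s -> %s" % (old == new, new))  # True -> mycluster-infra01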

# Generate the workers in machines.
@@ -747,7 +769,7 @@ def gen_cluster_config(output_file_name, output_file=True, no_az=False):
for l in rf:
worker_machines += l.split()[0],
for vmname in worker_machines:
-cc["machines"][vmname] = {"role": "worker","node-group": config["azure_cluster"]["worker_vm_size"],
+cc["machines"][vmname.lower()] = {"role": "worker","node-group": config["azure_cluster"]["worker_vm_size"],
"gpu-type":sku_mapping[config["azure_cluster"]["worker_vm_size"]]["gpu-type"]}
elif config["priority"] == "regular":
for vm in vm_list:
@@ -756,18 +778,18 @@ def gen_cluster_config(output_file_name, output_file=True, no_az=False):
worker_machines += vmname,
for vmname in worker_machines:
if isNewlyScaledMachine(vmname):
-cc["machines"][vmname] = {
+cc["machines"][vmname.lower()] = {
"role": "worker", "scaled": True,
"node-group": vm["vmSize"],"gpu-type":sku_mapping.get(vm["vmSize"],sku_mapping["default"])["gpu-type"]}
else:
-cc["machines"][vmname] = {
+cc["machines"][vmname.lower()] = {
"role": "worker",
"node-group": vm["vmSize"],"gpu-type":sku_mapping.get(vm["vmSize"],sku_mapping["default"])["gpu-type"]}
nfs_nodes = []
for vm in vm_list:
vmname = vm["name"]
if "-nfs" in vmname:
-cc["machines"][vmname] = {
+cc["machines"][vmname.lower()] = {
"role": "nfs",
"node-group": vm["vmSize"]}

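Note: the recurring vmname.lower() change normalizes the machine keys, presumably because Kubernetes reports node names in lower case, so mixed-case keys would fail to match later lookups. In isolation (VM data illustrative):

vm_list = [{"name": "MyCluster-Worker01", "vmSize": "Standard_NC6"}]
machines = {}
for vm in vm_list:
    # Key by the lower-cased name so lookups by node name succeed.
    machines[vm["name"].lower()] = {"role": "worker", "node-group": vm["vmSize"]}
print(machines["mycluster-worker01"]["node-group"])  # Standard_NC6
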
@@ -924,6 +946,9 @@ def run_command(args, command, nargs, parser):
create_cluster(args.arm_password, args.parallelism)
vm_interconnects()

+elif command == "addworkers":
+    add_workers(args.arm_password, args.parallelism)
+    vm_interconnects()
elif command == "list":
list_vm()

@@ -1089,15 +1114,15 @@ def run_command(args, command, nargs, parser):
if os.path.exists(config_file):
with open(config_file) as cf:
tmpconfig = yaml.load(cf)
-assert tmpconfig["cluster_name"] in tmpconfig["azure_cluster"]
+# assert tmpconfig["cluster_name"] in tmpconfig["azure_cluster"]
merge_config(config, tmpconfig, verbose)
if tmpconfig is not None and "cluster_name" in tmpconfig:
config["azure_cluster"]["cluster_name"] = tmpconfig["cluster_name"]
if tmpconfig is not None and "datasource" in tmpconfig:
config["azure_cluster"]["datasource"] = tmpconfig["datasource"]
-if tmpconfig is not None and "azure_cluster" in tmpconfig and config["azure_cluster"]["cluster_name"] in tmpconfig["azure_cluster"]:
-    merge_config(config["azure_cluster"], tmpconfig["azure_cluster"][
-        config["azure_cluster"]["cluster_name"]], verbose)
+# if tmpconfig is not None and "azure_cluster" in tmpconfig and config["azure_cluster"]["cluster_name"] in tmpconfig["azure_cluster"]:
+#     merge_config(config["azure_cluster"], tmpconfig["azure_cluster"][
+#         config["azure_cluster"]["cluster_name"]], verbose)
if (args.cluster_name is not None):
config["azure_cluster"]["cluster_name"] = args.cluster_name

8 changes: 6 additions & 2 deletions src/ClusterBootstrap/bash_step_by_step_deploy.sh
@@ -9,18 +9,22 @@
./deploy.py genscripts
./deploy.py runscriptonroles infra worker ./scripts/dns.sh
./deploy.py -y deploy
-./deploy.py -y updateworker
+./deploy.py -y updateworkerinparallel
./deploy.py -y kubernetes labels
+./deploy.py -y gpulabel
./deploy.py kubernetes start nvidia-device-plugin
./deploy.py kubernetes start flexvolume
./deploy.py webui
./deploy.py docker push restfulapi
./deploy.py docker push webui
./deploy.py docker push watchdog
+./deploy.py docker push gpu-reporter
+./deploy.py docker push reaper
+./deploy.py docker push job-exporter
./deploy.py mount
./deploy.py kubernetes start mysql
./deploy.py kubernetes start jobmanager
./deploy.py kubernetes start restfulapi
./deploy.py kubernetes start webportal
./deploy.py --sudo runscriptonrandmaster ./scripts/pass_secret.sh
./deploy.py runscriptonroles worker scripts/pre_download_images.sh
92 changes: 67 additions & 25 deletions src/ClusterBootstrap/deploy.py
@@ -513,7 +513,7 @@ def get_domain():
domain = ""
return domain

-# Get a list of nodes from cluster.yaml
+# Get a list of nodes DNS from cluster.yaml
def get_nodes_from_config(machinerole):
machinerole = "infrastructure" if machinerole == "infra" else machinerole
if "machines" not in config:
@@ -572,6 +572,12 @@ def get_ETCD_master_nodes_from_cluster_portal(clusterId):

def get_ETCD_master_nodes_from_config(clusterId):
Nodes = get_nodes_from_config("infrastructure")
+if int(config["etcd_node_num"]) == 1:
+    for nodename in config["machines"]:
+        nodeInfo = config["machines"][nodename]
+        if "role" in nodeInfo and nodeInfo["role"]=="infrastructure":
+            config["etcd_private_ip"] = nodeInfo["private-ip"]
+            break
config["etcd_node"] = Nodes
config["kubernetes_master_node"] = Nodes
return Nodes
@@ -705,8 +711,8 @@ def GetCertificateProperty():
masterdns.append(value)

config["apiserver_ssl_dns"] = "\n".join(["DNS."+str(i+5)+" = "+dns for i,dns in enumerate(masterdns)])
-config["apiserver_ssl_ip"] = "IP.1 = "+config["api-server-ip"]+"\nIP.2 = 127.0.0.1\n"+ "\n".join(["IP."+str(i+3)+" = "+ip for i,ip in enumerate(masterips)])
+config["apiserver_ssl_ip"] = "\n".join(["IP.{} = {}".format(i, sslip) for i, sslip in enumerate([config["api-server-ip"]] + config["ssl_localhost_ips"] + masterips)])
+# config["apiserver_ssl_ip"] = "IP.1 = "+config["api-server-ip"]+"\nIP.2 = 127.0.0.1\n"+ "\n".join(["IP."+str(i+3)+" = "+ip for i,ip in enumerate(masterips)])

# kube-apiserver aggregator use easyrsa to generate crt files, we need to generate a group of master names for it.
# It does not care if it's a DNS name or IP.
@@ -725,7 +731,8 @@ def GetCertificateProperty():
etcddns.append(value)

config["etcd_ssl_dns"] = "\n".join(["DNS."+str(i+5)+" = "+dns for i,dns in enumerate(etcddns)])
-config["etcd_ssl_ip"] = "IP.1 = 127.0.0.1\n" + "\n".join(["IP."+str(i+2)+" = "+ip for i,ip in enumerate(etcdips)])
+config["etcd_ssl_ip"] = "\n".join(["IP.{} = {}".format(i, sslip) for i, sslip in enumerate(config["ssl_localhost_ips"] + etcdips)])
+# config["etcd_ssl_ip"] = "IP.1 = 127.0.0.1\n" + "\n".join(["IP."+str(i+2)+" = "+ip for i,ip in enumerate(etcdips)])
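
Note: both rewritten assignments build the OpenSSL subjectAltName IP entries from a single concatenated list, and the numbering now starts at IP.0 instead of IP.1. With illustrative inputs:

ssl_localhost_ips = ["127.0.0.1"]  # assumed value of config["ssl_localhost_ips"]
etcdips = ["10.0.0.4", "10.0.0.5"]
print("\n".join(["IP.{} = {}".format(i, sslip)
                 for i, sslip in enumerate(ssl_localhost_ips + etcdips)]))
# IP.0 = 127.0.0.1
# IP.1 = 10.0.0.4
# IP.2 = 10.0.0.5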

def gen_worker_certificates():

@@ -747,7 +754,39 @@ def gen_ETCD_certificates():
utils.render_template_directory("./template/ssl", "./deploy/ssl",config)
os.system("cd ./deploy/ssl && bash ./gencerts_etcd.sh")


+def load_az_params_as_default():
+    from az_params import default_az_parameters
+    # need az_params default, in case we don't have the key in config.yaml
+    default_cfg = { k: v for k, v in default_az_parameters.items() }
+    azure_cluster_cfg = { k: v for k, v in config["azure_cluster"].items() } if "azure_cluster" in config else {}
+    merge_config(config["azure_cluster"], default_cfg["azure_cluster"])
+    merge_config(config["azure_cluster"], azure_cluster_cfg)
+    # print config["azure_cluster"], config["network_domain"]
+
+def on_premise_params():
+    print("Warning: remember to set parameters:\ngpu_count_per_node, gpu_type, worker_node_num\n when using on premise machine!")
+
+def load_platform_type():
+    platform_type = list(set(config.keys()) & set(config["supported_platform"]))
+    assert len(platform_type) == 1 and "platform type should be specified explicitly and unique!"
+    platform_type = platform_type[0]
+    config["platform_type"] = platform_type
+
+def gen_platform_wise_config():
+    load_platform_type()
+    azdefault = { 'network_domain':"config['network']['domain']",
+                  'worker_node_num':"config['azure_cluster']['worker_node_num']",
+                  'gpu_count_per_node':'config["sku_mapping"].get(config["azure_cluster"]["worker_vm_size"],config["sku_mapping"]["default"])["gpu-count"]',
+                  'gpu_type':'config["sku_mapping"].get(config["azure_cluster"]["worker_vm_size"],config["sku_mapping"]["default"])["gpu-type"]' }
+    on_premise_default = {'network_domain':"config['network']['domain']"}
+    platform_dict = { 'azure_cluster': azdefault, 'onpremise': on_premise_default }
+    platform_func = { 'azure_cluster': load_az_params_as_default, 'onpremise': on_premise_params }
+    default_dict, default_func = platform_dict[config["platform_type"]], platform_func[config["platform_type"]]
+    default_func()
+    need_val = ['network_domain', 'worker_node_num', 'gpu_count_per_node', 'gpu_type']
+    for ky in need_val:
+        if ky not in config:
+            config[ky] = eval(default_dict[ky])

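Note: gen_platform_wise_config stores its per-platform defaults as strings and evals them only when a key is missing, so a default expression that references an absent config section is never evaluated. A trimmed demo of the same pattern:

config = {"network": {"domain": "example.local"}}  # illustrative
defaults = {"network_domain": "config['network']['domain']",
            "worker_node_num": "4"}
for ky in ("network_domain", "worker_node_num"):
    if ky not in config:
        config[ky] = eval(defaults[ky])
print("%s %s" % (config["network_domain"], config["worker_node_num"]))
# example.local 4
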
def gen_configs():
print "==============================================="
@@ -805,6 +844,7 @@ def gen_configs():
add_ssh_key()

check_config(config)
+gen_platform_wise_config()

utils.render_template_directory("./template/etcd", "./deploy/etcd",config)
utils.render_template_directory("./template/master", "./deploy/master",config)
@@ -950,32 +990,32 @@ def deploy_masters(force = False):
deploycmd = """
until curl -q http://127.0.0.1:8080/version/ ; do
sleep 5;
-echo 'waiting for master...';
+echo 'waiting for master kubernetes service...';
done;

until sudo /opt/bin/kubectl apply -f /opt/addons/kube-addons/weave.yaml --validate=false ; do
sleep 5;
-echo 'waiting for master...';
+echo 'waiting for master kube-addons weave...';
done ;

until sudo /opt/bin/kubectl apply -f /opt/addons/kube-addons/dashboard.yaml --validate=false ; do
sleep 5;
-echo 'waiting for master...';
+echo 'waiting for master kube-addons dashboard...';
done ;

until sudo /opt/bin/kubectl apply -f /opt/addons/kube-addons/dns-addon.yml --validate=false ; do
sleep 5;
-echo 'waiting for master...';
+echo 'waiting for master kube-addons dns-addon...';
done ;

until sudo /opt/bin/kubectl apply -f /opt/addons/kube-addons/kube-proxy.json --validate=false ; do
sleep 5;
-echo 'waiting for master...';
+echo 'waiting for master kube-addons kube-proxy.json...';
done ;

until sudo /opt/bin/kubectl create -f /etc/kubernetes/clusterroles/ ; do
sleep 5;
-echo 'waiting for master...';
+echo 'waiting for master kubernetes clusterroles...';
done ;
sudo ln -s /opt/bin/kubectl /usr/bin/;
"""
@@ -1047,7 +1087,7 @@ def deploy_ETCD_docker():


def deploy_ETCD():
-
+    # this condition would not be satisfied at least when deploying new clusters
if "deploydockerETCD" in config and config["deploydockerETCD"]:
deploy_ETCD_docker()
return
@@ -1110,17 +1150,18 @@ def set_nfs_disk():
"""
we assume there's only 1 cluster.
"""
+load_platform_type()
etcd_server_user = config["nfs_user"]
nfs_servers = config["nfs_node"] if len(config["nfs_node"]) > 0 else config["etcd_node"]
machine_name_2_full = {nm.split('.')[0]:nm for nm in nfs_servers}
for srvr_nm, nfs_cnf in config["nfs_disk_mnt"].items():
nfs_cnf["cloud_config"] = {"vnet_range":config["cloud_config"]["vnet_range"], "samba_range": config["cloud_config"]["samba_range"]}
nfs_cnf["nfs_client_CIDR_ranges"] = config["nfs_client_CIDR"]["node_range"]+config["nfs_client_CIDR"]["samba_range"]
+nfs_cnf["platform_type"] = config["platform_type"]
nfs_server = machine_name_2_full[srvr_nm]
# print nfs_cnf, nfs_server
-utils.render_template("./template/nfs/nfs_config.sh.template","./deploy/scripts/setup_nfs_server.sh",nfs_cnf)
-# os.system("cat ./deploy/scripts/setup_nfs_server.sh")
+utils.render_template("./template/nfs/nfs_config.sh.template","./scripts/setup_nfs_server.sh",nfs_cnf)
+# os.system("cat ./scripts/setup_nfs_server.sh")
# print("------------------>nfs_server<------------------------"+nfs_server)
-utils.SSH_exec_script( config["ssh_cert"], etcd_server_user, nfs_server, "./deploy/scripts/setup_nfs_server.sh")
+utils.SSH_exec_script( config["ssh_cert"], etcd_server_user, nfs_server, "./scripts/setup_nfs_server.sh")
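
Note: machine_name_2_full maps the short host names used as keys in nfs_disk_mnt to their full domain names. In isolation (hostnames illustrative):

nfs_servers = ["nfs01.cluster.example.com", "nfs02.cluster.example.com"]
machine_name_2_full = {nm.split('.')[0]: nm for nm in nfs_servers}
print(machine_name_2_full["nfs01"])  # nfs01.cluster.example.com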

def create_ISO():
imagename = "./deploy/iso/dlworkspace-cluster-deploy-"+config["cluster_name"]+".iso"
@@ -2865,7 +2906,7 @@ def kubernetes_label_nodes( verb, servicelists, force ):
# Label kubernete nodes with gpu types.skip for CPU workers
def kubernetes_label_GpuTypes():
for nodename,nodeInfo in config["machines"].items():
-if nodeInfo["role"] == "worker" and nodeInfo["gpu-type"] != "NULL":
+if nodeInfo["role"] == "worker":
kubernetes_label_node("--overwrite", nodename, "gpuType="+nodeInfo["gpu-type"])
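
Note: dropping the != "NULL" guard means CPU-only workers now get an explicit gpuType=NULL label instead of no gpuType label at all. Assuming kubernetes_label_node ultimately issues a kubectl label command, the effect is roughly:

machines = {"cluster-worker01": {"role": "worker", "gpu-type": "P100"},
            "cluster-worker02": {"role": "worker", "gpu-type": "NULL"}}
for nodename, nodeInfo in machines.items():
    # Before this change the NULL node would have been skipped.
    print("kubectl label --overwrite node %s gpuType=%s"
          % (nodename, nodeInfo["gpu-type"]))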


@@ -3527,7 +3568,7 @@ def run_command( args, command, nargs, parser ):
break
nodes = get_nodes_by_roles(nodeset)
# print(nodes)
-run_script_on_all(nodes, nargs[scripts_start:], sudo = args.sudo )
+run_script_on_all_in_parallel(nodes, nargs[scripts_start:], sudo = args.sudo )

elif command == "runscriptonrandmaster" and len(nargs)>=1:
run_script_on_rand_master(nargs, args)
@@ -3737,6 +3778,7 @@ def run_command( args, command, nargs, parser ):
kubernetes_label_GpuTypes()

elif command == "genscripts":
+gen_platform_wise_config()
gen_dns_config_script()
gen_pass_secret_script()
gen_warm_up_cluster_script()
@@ -3929,32 +3971,32 @@ def upgrade_masters(hypekube_url="gcr.io/google-containers/hyperkube:v1.15.2"):
deploy_cmd = """
until curl -q http://127.0.0.1:8080/version/ ; do
sleep 5;
-echo 'waiting for master...';
+echo 'waiting for master kubernetes service...';
done;

until sudo /opt/bin/kubectl apply -f /opt/addons/kube-addons/weave.yaml --validate=false ; do
sleep 5;
-echo 'waiting for master...';
+echo 'waiting for master kube-addons weave...';
done ;

until sudo /opt/bin/kubectl apply -f /opt/addons/kube-addons/dashboard.yaml --validate=false ; do
sleep 5;
-echo 'waiting for master...';
+echo 'waiting for master kube-addons dashboard...';
done ;

until sudo /opt/bin/kubectl apply -f /opt/addons/kube-addons/dns-addon.yml --validate=false ; do
sleep 5;
-echo 'waiting for master...';
+echo 'waiting for master kube-addons dns-addon...';
done ;

until sudo /opt/bin/kubectl apply -f /opt/addons/kube-addons/kube-proxy.json --validate=false ; do
sleep 5;
-echo 'waiting for master...';
+echo 'waiting for master kube-addons kube-proxy...';
done ;

until sudo /opt/bin/kubectl apply -f /etc/kubernetes/clusterroles/ ; do
sleep 5;
-echo 'waiting for master...';
+echo 'waiting for master kubernetes clusterroles...';
done ;
"""
utils.SSH_exec_cmd(config["ssh_cert"], kubernetes_master_user, kubernetes_masters[0], deploy_cmd , False)
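
Note: every until block in these deploy/upgrade heredocs is the same retry idiom: run the command, and on failure sleep five seconds and try again; the change only makes each wait message name the component being waited on. A Python rendering of the idiom (command and label illustrative):

import subprocess
import time

def wait_until_ok(cmd, what):
    # Retry every 5 seconds until the command exits 0, like the bash until loop.
    while subprocess.call(cmd, shell=True) != 0:
        print("waiting for %s..." % what)
        time.sleep(5)

wait_until_ok("curl -q http://127.0.0.1:8080/version/", "master kubernetes service")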