diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index c6426c750..d822e53be 100755
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -3,7 +3,7 @@
 # Add steps that build, run tests, deploy, and more:
 # https://aka.ms/yaml
 trigger:
-- citest
+- dltsdev
 
 pool:
   name: 'DLTS-Platform'
@@ -24,7 +24,7 @@ steps:
     echo $CONFIG_TYPE
     ./set_config.sh $CONFIG_TYPE
     cd ..
-    ./bash_step_by_step_deploy.sh
+    ./step_by_step.sh azure
   displayName: 'Deploy DLWorkspace'
 
 - script: |
diff --git a/src/ClusterBootstrap/az_params.py b/src/ClusterBootstrap/az_params.py
index d9c50dd7e..6e37a8227 100755
--- a/src/ClusterBootstrap/az_params.py
+++ b/src/ClusterBootstrap/az_params.py
@@ -17,5 +17,8 @@
         "nfs_data_disk_num": 1,
         "nfs_data_disk_path": '/data',
         "nfs_vm": [],
+        "eviction_policy": "Deallocate",
+        "single_placement_group": "false",
+        "default_low_priority_domain": "redmond.corp.microsoft.com",
     },
 }
diff --git a/src/ClusterBootstrap/az_tools.py b/src/ClusterBootstrap/az_tools.py
index 7dda34332..da6617aa1 100755
--- a/src/ClusterBootstrap/az_tools.py
+++ b/src/ClusterBootstrap/az_tools.py
@@ -45,8 +45,6 @@ def init_config():
         config[k] = v
     for k, v in default_az_parameters.iteritems():
         config[k] = v
-    # print config
-    # exit()
     return config
 
 
@@ -377,6 +375,7 @@ def create_nfs_nsg():
             print(output)
 
     print type(config["cloud_config"]["nfs_ssh"]["source_ips"]), config["cloud_config"]["nfs_ssh"]["source_ips"],type(source_addresses_prefixes), source_addresses_prefixes
+    merged_ip = utils.keep_widest_subnet(config["cloud_config"]["nfs_ssh"]["source_ips"] + source_addresses_prefixes)
     cmd = """
         az network nsg rule create \
             --resource-group %s \
@@ -389,8 +388,10 @@ def create_nfs_nsg():
         """ % ( config["azure_cluster"]["resource_group_name"],
                 config["azure_cluster"]["nfs_nsg_name"],
                 config["cloud_config"]["nfs_ssh"]["port"],
-                " ".join(list(set(config["cloud_config"]["nfs_ssh"]["source_ips"] + source_addresses_prefixes))),
+                " ".join(merged_ip),
                 )
+    if verbose:
+        print(cmd)
     if not no_execution:
         output = utils.exec_cmd_local(cmd)
         print(output)
@@ -466,8 +467,15 @@ def create_cluster(arm_vm_password=None, parallelism=1):
         create_vm_param(i, "infra", config["azure_cluster"]["infra_vm_size"],
                         arm_vm_password is not None, arm_vm_password)
 
+    add_workers(arm_vm_password, parallelism)
+
+    # create nfs server if specified.
+    for i in range(int(config["azure_cluster"]["nfs_node_num"])):
+            create_vm_param(i, "nfs", config["azure_cluster"]["nfs_vm_size"], False,
+               arm_vm_password, config["azure_cluster"]["nfs_vm"][i] if i < len(config["azure_cluster"]["nfs_vm"]) else None )
+
+def add_workers(arm_vm_password=None, parallelism=1):
     if config["priority"] == "regular":
-        print("entering")
         if parallelism > 1:
             # TODO: Tolerate faults
             from multiprocessing import Pool
@@ -479,15 +487,10 @@ def create_cluster(arm_vm_password=None, parallelism=1):
         else:
             for i in range(int(config["azure_cluster"]["worker_node_num"])):
                 create_vm_param(i, "worker", config["azure_cluster"]["worker_vm_size"],
-                                arm_vm_password is not None, arm_vm_password)
+                    arm_vm_password is not None, arm_vm_password)
     elif config["priority"] == "low":
         utils.render_template("./template/vmss/vmss.sh.template", "scripts/vmss.sh",config)
-        utils.exec_cmd_local("chmod +x scripts/vmss.sh;./scripts/vmss.sh")
-
-    # create nfs server if specified.
-    for i in range(int(config["azure_cluster"]["nfs_node_num"])):
-            create_vm_param(i, "nfs", config["azure_cluster"]["nfs_vm_size"], False,
-               arm_vm_password, config["azure_cluster"]["nfs_vm"][i] if i < len(config["azure_cluster"]["nfs_vm"]) else None )
+        utils.exec_cmd_local("chmod +x scripts/vmss.sh; ./scripts/vmss.sh")
 
 def create_vm_param_wrapper(arg_tuple):
     i, role, vm_size, no_az, arm_vm_password = arg_tuple
@@ -666,8 +669,8 @@ def get_disk_from_vm(vmname):
 def gen_cluster_config(output_file_name, output_file=True, no_az=False):
     if config["priority"] == "low":
         utils.render_template("./template/dns/cname_and_private_ips.sh.template", "scripts/cname_and_ips.sh", config)    
-        utils.exec_cmd_local("chmod +x scripts/cname_and_ips.sh")
-        print "\nPlease copy the commands in dns_add_commands and register the DNS records on http://servicebook/dns/self-service.html\n"
+        utils.exec_cmd_local("chmod +x scripts/cname_and_ips.sh; bash scripts/cname_and_ips.sh")
+        print "\nPlease copy the commands in dns_add_commands and register the DNS records \n"
     bSQLOnly = (config["azure_cluster"]["infra_node_num"] <= 0)
     if useAzureFileshare() and not no_az:
         # theoretically it could be supported, but would require storage account to be created first in nested template and then
@@ -718,14 +721,13 @@ def gen_cluster_config(output_file_name, output_file=True, no_az=False):
     cc["deploydockerETCD"] = False
     cc["platform-scripts"] = "ubuntu"
     cc["basic_auth"] = "%s,admin,1000" % uuid.uuid4().hex[:16]
-    domain_mapping = {"regular":"%s.cloudapp.azure.com" % config["azure_cluster"]["azure_location"], "low": config["domain_name"]}
+    domain_mapping = {"regular":"%s.cloudapp.azure.com" % config["azure_cluster"]["azure_location"], "low": config.get("domain_name",config["azure_cluster"]["default_low_priority_domain"])}
     if not bSQLOnly:
         cc["network"] = {"domain": domain_mapping[config["priority"]]}
 
     cc["machines"] = {}
     for i in range(int(config["azure_cluster"]["infra_node_num"])):
-        vmname = "%s-infra%02d" % (config["azure_cluster"]
-                                   ["cluster_name"].lower(), i + 1)
+        vmname = "{}-infra{:02d}".format(config["azure_cluster"]["cluster_name"], i + 1).lower()
         cc["machines"][vmname] = {"role": "infrastructure", "private-ip": get_vm_ip(i, "infra")}
 
     # Generate the workers in machines.
@@ -747,7 +749,7 @@ def gen_cluster_config(output_file_name, output_file=True, no_az=False):
             for l in rf:
                 worker_machines += l.split()[0],
         for vmname in worker_machines:
-            cc["machines"][vmname] = {"role": "worker","node-group": config["azure_cluster"]["worker_vm_size"],
+            cc["machines"][vmname.lower()] = {"role": "worker","node-group": config["azure_cluster"]["worker_vm_size"],
                                         "gpu-type":sku_mapping[config["azure_cluster"]["worker_vm_size"]]["gpu-type"]}
     elif config["priority"] == "regular":
         for vm in vm_list:
@@ -756,18 +758,18 @@ def gen_cluster_config(output_file_name, output_file=True, no_az=False):
                 worker_machines += vmname,
         for vmname in worker_machines:          
             if isNewlyScaledMachine(vmname):
-                cc["machines"][vmname] = {
+                cc["machines"][vmname.lower()] = {
                     "role": "worker", "scaled": True,
                     "node-group": vm["vmSize"],"gpu-type":sku_mapping.get(vm["vmSize"],sku_mapping["default"])["gpu-type"]}
             else:
-                cc["machines"][vmname] = {
+                cc["machines"][vmname.lower()] = {
                     "role": "worker",
                     "node-group": vm["vmSize"],"gpu-type":sku_mapping.get(vm["vmSize"],sku_mapping["default"])["gpu-type"]}
     nfs_nodes = []
     for vm in vm_list:
         vmname = vm["name"]
         if "-nfs" in vmname:
-            cc["machines"][vmname] = {
+            cc["machines"][vmname.lower()] = {
                 "role": "nfs",
                 "node-group": vm["vmSize"]}
 
@@ -920,10 +922,12 @@ def run_command(args, command, nargs, parser):
     else:
         check_subscription()
     if command == "create":
-        # print config["azure_cluster"]["infra_vm_size"]
         create_cluster(args.arm_password, args.parallelism)
         vm_interconnects()
 
+    elif command == "addworkers":
+        add_workers(args.arm_password, args.parallelism)
+        vm_interconnects()
     elif command == "list":
         list_vm()
 
@@ -964,7 +968,6 @@ def run_command(args, command, nargs, parser):
 if __name__ == '__main__':
     # the program always run at the current directory.
     dirpath = os.path.dirname(os.path.abspath(os.path.realpath(__file__)))
-    # print "Directory: " + dirpath
     os.chdir(dirpath)
     config = init_config()
     parser = argparse.ArgumentParser(prog='az_utils.py',
@@ -1089,15 +1092,11 @@ def run_command(args, command, nargs, parser):
     if os.path.exists(config_file):
         with open(config_file) as cf:
             tmpconfig = yaml.load(cf)
-            assert tmpconfig["cluster_name"] in tmpconfig["azure_cluster"]
         merge_config(config, tmpconfig, verbose)
         if tmpconfig is not None and "cluster_name" in tmpconfig:
             config["azure_cluster"]["cluster_name"] = tmpconfig["cluster_name"]
         if tmpconfig is not None and "datasource" in tmpconfig:
             config["azure_cluster"]["datasource"] = tmpconfig["datasource"]
-    if tmpconfig is not None and "azure_cluster" in tmpconfig and config["azure_cluster"]["cluster_name"] in tmpconfig["azure_cluster"]:
-        merge_config(config["azure_cluster"], tmpconfig["azure_cluster"][
-                     config["azure_cluster"]["cluster_name"]], verbose)
     if (args.cluster_name is not None):
         config["azure_cluster"]["cluster_name"] = args.cluster_name
 
@@ -1123,7 +1122,6 @@ def run_command(args, command, nargs, parser):
         config["azure_cluster"]["file_share_name"] = args.file_share_name
 
     config = update_config(config)
-    # print (config)
 
     with open(config_cluster, 'w') as outfile:
         yaml.dump(config, outfile, default_flow_style=False)
diff --git a/src/ClusterBootstrap/deploy.py b/src/ClusterBootstrap/deploy.py
index 51d2c7759..5c854b55c 100755
--- a/src/ClusterBootstrap/deploy.py
+++ b/src/ClusterBootstrap/deploy.py
@@ -96,7 +96,6 @@ def expand_path_in_config(key_in_config):
         raise Exception("Error: no %s in config " % key_in_config)
 
 def parse_capacity_in_GB( inp ):
-    # print "match capacity of %s" % inp
     mt = capacityMatch.search(inp)
     if mt is None:
         return 0.0
@@ -500,7 +499,6 @@ def init_deployment():
     utils.render_template( template_file, target_file ,config)
 
 def check_node_availability(ipAddress):
-    # print "Check node availability on: " + str(ipAddress)
-    status = os.system('ssh -o "StrictHostKeyChecking no" -o "UserKnownHostsFile=/dev/null" -i %s -oBatchMode=yes %s@%s hostname > /dev/null' % (config["admin_username"], config["ssh_cert"], ipAddress))
+    status = os.system('ssh -o "StrictHostKeyChecking no" -o "UserKnownHostsFile=/dev/null" -i %s -oBatchMode=yes %s@%s hostname > /dev/null' % (config["ssh_cert"], config["admin_username"], ipAddress))  # -i takes the key file; the login is user@host
     #status = sock.connect_ex((ipAddress,22))
     return status == 0
@@ -513,7 +511,7 @@ def get_domain():
         domain = ""
     return domain
 
-# Get a list of nodes from cluster.yaml
+# Get a list of node DNS names from cluster.yaml
 def get_nodes_from_config(machinerole):
     machinerole = "infrastructure" if machinerole == "infra" else machinerole
     if "machines" not in config:
@@ -572,6 +570,13 @@ def get_ETCD_master_nodes_from_cluster_portal(clusterId):
 
 def get_ETCD_master_nodes_from_config(clusterId):
     Nodes = get_nodes_from_config("infrastructure")
+    if int(config["etcd_node_num"]) == 1:
+        # Single etcd node: record the private IP of the first infrastructure machine found.
+        for nodeInfo in config["machines"].values():
+            if nodeInfo.get("role") == "infrastructure":
+                assert "private-ip" in nodeInfo, "private IP of the infrastructure node is not provided!"
+                config["etcd_private_ip"] = nodeInfo["private-ip"]
+                break
     config["etcd_node"] = Nodes
     config["kubernetes_master_node"] = Nodes
     return Nodes
@@ -705,8 +710,8 @@ def GetCertificateProperty():
             masterdns.append(value)
 
     config["apiserver_ssl_dns"] = "\n".join(["DNS."+str(i+5)+" = "+dns for i,dns in enumerate(masterdns)])
-    config["apiserver_ssl_ip"] = "IP.1 = "+config["api-server-ip"]+"\nIP.2 = 127.0.0.1\n"+ "\n".join(["IP."+str(i+3)+" = "+ip for i,ip in enumerate(masterips)])
-
+    config["apiserver_ssl_ip"] = "\n".join(["IP.{} = {}".format(i, sslip) for i, sslip in enumerate([config["api-server-ip"]] + config["ssl_localhost_ips"] + masterips)])
+    # config["apiserver_ssl_ip"] = "IP.1 = "+config["api-server-ip"]+"\nIP.2 = 127.0.0.1\n"+ "\n".join(["IP."+str(i+3)+" = "+ip for i,ip in enumerate(masterips)])
 
     # kube-apiserver aggregator use easyrsa to generate crt files, we need to generate a group of master names for it.
     # It does not care if it's a DNS name or IP.
@@ -725,7 +730,8 @@ def GetCertificateProperty():
             etcddns.append(value)
 
     config["etcd_ssl_dns"] = "\n".join(["DNS."+str(i+5)+" = "+dns for i,dns in enumerate(etcddns)])
-    config["etcd_ssl_ip"] = "IP.1 = 127.0.0.1\n" + "\n".join(["IP."+str(i+2)+" = "+ip for i,ip in enumerate(etcdips)])
+    config["etcd_ssl_ip"] = "\n".join(["IP.{} = {}".format(i, sslip) for i, sslip in enumerate(config["ssl_localhost_ips"] + etcdips)])
+    # config["etcd_ssl_ip"] = "IP.1 = 127.0.0.1\n" + "\n".join(["IP."+str(i+2)+" = "+ip for i,ip in enumerate(etcdips)])
 
 def gen_worker_certificates():
 
@@ -747,7 +753,38 @@ def gen_ETCD_certificates():
     utils.render_template_directory("./template/ssl", "./deploy/ssl",config)
     os.system("cd ./deploy/ssl && bash ./gencerts_etcd.sh")
 
-
+def load_az_params_as_default():
+    """Merge az_params defaults into config["azure_cluster"], then re-merge the user's own values on top."""
+    import copy  # deep copies keep nested dicts from being shared while merging in place
+    from az_params import default_az_parameters
+    user_cfg = copy.deepcopy(config.setdefault("azure_cluster", {}))
+    merge_config(config["azure_cluster"], copy.deepcopy(default_az_parameters["azure_cluster"]))
+    merge_config(config["azure_cluster"], user_cfg)
+
+def on_premise_params():  # prints a reminder only; no config values are set here
+    print("Warning: remember to set parameters:\ngpu_count_per_node, gpu_type, worker_node_num\n when using on premise machine!")
+
+def load_platform_type():
+    """Store in config["platform_type"] the single key of config that is listed in config["supported_platform"]."""
+    platform_type = sorted(set(config) & set(config["supported_platform"]))
+    assert len(platform_type) == 1, "platform type should be specified explicitly and unique! found: %s" % platform_type
+    config["platform_type"] = platform_type[0]
+
+def gen_platform_wise_config():
+    """Detect the platform, run its default loader, then fill whichever of the four required keys config lacks."""
+    load_platform_type()
+    platform = config["platform_type"]
+    sku = lambda: config["sku_mapping"].get(config["azure_cluster"]["worker_vm_size"], config["sku_mapping"]["default"])  # lazy getters replace eval() of expression strings
+    domain = lambda: config["network"]["domain"]
+    derive = {"azure_cluster": {"network_domain": domain, "worker_node_num": lambda: config["azure_cluster"]["worker_node_num"],
+                                "gpu_count_per_node": lambda: sku()["gpu-count"], "gpu_type": lambda: sku()["gpu-type"]},
+              "onpremise": {"network_domain": domain}}[platform]
+    {"azure_cluster": load_az_params_as_default, "onpremise": on_premise_params}[platform]()
+    for ky in ("network_domain", "worker_node_num", "gpu_count_per_node", "gpu_type"):
+        if ky not in config:
+            if ky not in derive:  # this platform has no way to derive the value
+                raise KeyError("%s must be set in config when platform_type is %s" % (ky, platform))
+            config[ky] = derive[ky]()
 
 def gen_configs():
     print "==============================================="
@@ -805,6 +842,7 @@ def gen_configs():
     add_ssh_key()
 
     check_config(config)
+    gen_platform_wise_config()
 
     utils.render_template_directory("./template/etcd", "./deploy/etcd",config)
     utils.render_template_directory("./template/master", "./deploy/master",config)
@@ -950,32 +988,32 @@ def deploy_masters(force = False):
     deploycmd = """
         until curl -q http://127.0.0.1:8080/version/ ; do
             sleep 5;
-            echo 'waiting for master...';
+            echo 'waiting for master kubernetes service...';
         done;
 
         until sudo /opt/bin/kubectl apply -f /opt/addons/kube-addons/weave.yaml --validate=false ; do
             sleep 5;
-            echo 'waiting for master...';
+            echo 'waiting for master kube-addons weave...';
         done ;
 
         until sudo /opt/bin/kubectl apply -f /opt/addons/kube-addons/dashboard.yaml --validate=false ; do
             sleep 5;
-            echo 'waiting for master...';
+            echo 'waiting for master kube-addons dashboard...';
         done ;
 
         until sudo /opt/bin/kubectl apply -f /opt/addons/kube-addons/dns-addon.yml --validate=false ;  do
             sleep 5;
-            echo 'waiting for master...';
+            echo 'waiting for master kube-addons dns-addon...';
         done ;
 
         until sudo /opt/bin/kubectl apply -f /opt/addons/kube-addons/kube-proxy.json --validate=false ;  do
             sleep 5;
-            echo 'waiting for master...';
+            echo 'waiting for master kube-addons kube-proxy.json...';
         done ;
 
         until sudo /opt/bin/kubectl create -f /etc/kubernetes/clusterroles/ ;  do
             sleep 5;
-            echo 'waiting for master...';
+            echo 'waiting for master kubernetes clusterroles...';
         done ;
         sudo ln -s /opt/bin/kubectl /usr/bin/;
     """
@@ -1047,7 +1085,7 @@ def deploy_ETCD_docker():
 
 
 def deploy_ETCD():
-
+    # this condition would not be satisfied at least when deploying new clusters
     if "deploydockerETCD" in config and config["deploydockerETCD"]:
         deploy_ETCD_docker()
         return
@@ -1110,17 +1148,16 @@ def set_nfs_disk():
     """
     we assume there's only 1 cluster.
     """
+    load_platform_type()
     etcd_server_user = config["nfs_user"]
     nfs_servers = config["nfs_node"] if len(config["nfs_node"]) > 0 else config["etcd_node"]
     machine_name_2_full = {nm.split('.')[0]:nm for nm in nfs_servers}
     for srvr_nm, nfs_cnf in config["nfs_disk_mnt"].items():
-        nfs_cnf["cloud_config"] = {"vnet_range":config["cloud_config"]["vnet_range"], "samba_range": config["cloud_config"]["samba_range"]}
+        nfs_cnf["nfs_client_CIDR"] = config["nfs_client_CIDR"]
+        nfs_cnf["platform_type"] = config["platform_type"]
         nfs_server = machine_name_2_full[srvr_nm]
-        # print nfs_cnf, nfs_server
-        utils.render_template("./template/nfs/nfs_config.sh.template","./deploy/scripts/setup_nfs_server.sh",nfs_cnf)
-        # os.system("cat ./deploy/scripts/setup_nfs_server.sh")
-        # print("------------------>nfs_server<------------------------"+nfs_server)
-        utils.SSH_exec_script( config["ssh_cert"], etcd_server_user, nfs_server, "./deploy/scripts/setup_nfs_server.sh")
+        utils.render_template("./template/nfs/nfs_config.sh.template", "./scripts/setup_nfs_server.sh", nfs_cnf)
+        utils.SSH_exec_script( config["ssh_cert"], etcd_server_user, nfs_server, "./scripts/setup_nfs_server.sh")
 
 def create_ISO():
     imagename = "./deploy/iso/dlworkspace-cluster-deploy-"+config["cluster_name"]+".iso"
@@ -1141,18 +1178,10 @@ def create_PXE():
     os.system("cp -r ./deploy/cloud-config/* ./deploy/pxe/tftp/usr/share/oem")
 
     dockername = push_one_docker("./deploy/pxe", config["dockerprefix"], config["dockertag"], "pxe-coreos", config )
-
-    #tarname = "deploy/docker/dlworkspace-pxe-%s.tar" % config["cluster_name"]
-    # os.system("docker save " + dockername + " > " + tarname )
     print ("A DL workspace docker is built at: "+ dockername)
-    # print ("It is also saved as a tar file to: "+ tarname)
-
-    #os.system("docker rmi dlworkspace-pxe:%s" % config["cluster_name"])
 
 def config_ubuntu():
-    # print config["ubuntuconfig"]
     ubuntuConfig = fetch_config( config, ["ubuntuconfig"] )
-    # print ubuntuConfig
     useversion = fetch_dictionary( ubuntuConfig, [ "version" ] )
     specificConfig = fetch_dictionary( ubuntuConfig, [ useversion ] )
     for key, value in specificConfig.iteritems():
@@ -1166,11 +1195,8 @@ def create_PXE_ubuntu():
     utils.render_template_directory("./template/pxe-ubuntu", "./deploy/pxe-ubuntu",config, verbose=verbose )
 
     dockername = push_one_docker("./deploy/pxe-ubuntu", config["dockerprefix"], config["dockertag"], "pxe-ubuntu", config )
-    # tarname = "deploy/docker/pxe-ubuntu.tar"
 
-    # os.system("docker save " + dockername + " > " + tarname )
     print ("A DL workspace docker is built at: "+ dockername)
-    # print ("It is also saved as a tar file to: "+ tarname)
 
 
 def clean_worker_nodes():
@@ -1256,7 +1282,6 @@ def update_worker_nodes( nargs ):
     os.system('sed "s/##api_servers##/%s/" ./deploy/kubelet/kubelet.service.template > ./deploy/kubelet/kubelet.service' % config["api_servers"].replace("/","\\/"))
     os.system('sed "s/##api_servers##/%s/" ./deploy/kubelet/worker-kubeconfig.yaml.template > ./deploy/kubelet/worker-kubeconfig.yaml' % config["api_servers"].replace("/","\\/"))
 
-    #urllib.urlretrieve ("http://ccsdatarepo.westus.cloudapp.azure.com/data/kube/kubelet/kubelet", "./deploy/bin/kubelet")
     get_hyperkube_docker()
 
     workerNodes = get_worker_nodes(config["clusterId"], False)
@@ -1269,10 +1294,6 @@ def update_worker_nodes( nargs ):
     os.system("rm ./deploy/kubelet/kubelet.service")
     os.system("rm ./deploy/kubelet/worker-kubeconfig.yaml")
 
-    #if len(config["kubernetes_master_node"]) > 0:
-        #utils.SSH_exec_cmd(config["ssh_cert"], config["admin_username"], config["kubernetes_master_node"][0], "sudo /opt/bin/kubelet get nodes")
-
-
 def update_worker_nodes_in_parallel(nargs):
     # TODO: Merge with update_worker_nodes
     utils.render_template_directory("./template/kubelet", "./deploy/kubelet", config)
@@ -1335,12 +1356,9 @@ def deploy_restful_API_on_node(ipAddress):
         utils.SSH_exec_cmd(config["ssh_cert"], config["admin_username"], masterIP, "sudo chown -R %s /etc/kubernetes" % config["admin_username"])
         utils.SSH_exec_cmd(config["ssh_cert"], config["admin_username"], masterIP, "sudo cp /etc/kubernetes/certs/client.crt /etc/kubernetes/ssl/apiserver.pem")
         utils.SSH_exec_cmd(config["ssh_cert"], config["admin_username"], masterIP, "sudo cp /etc/kubernetes/certs/client.key /etc/kubernetes/ssl/apiserver-key.pem")
-        utils.SSH_exec_cmd(config["ssh_cert"], config["admin_username"], masterIP, "sudo cp /etc/kubernetes/certs/ca.crt /etc/kubernetes/ssl/ca.pem")
-        # overwrite ~/.kube/config (to be mounted from /etc/kubernetes/restapi-kubeconfig.yaml)
+        utils.SSH_exec_cmd(config["ssh_cert"], config["admin_username"], masterIP, "sudo cp /etc/kubernetes/certs/ca.crt /etc/kubernetes/ssl/ca.pem")    
         utils.SSH_exec_cmd(config["ssh_cert"], config["admin_username"], masterIP, "sudo cp /home/%s/.kube/config /etc/kubernetes/restapi-kubeconfig.yaml" % config["admin_username"])
 
-    # utils.SSH_exec_cmd(config["ssh_cert"], config["admin_username"], masterIP, "sudo mkdir -p /dlws-data && sudo mount %s /dlws-data ; docker rm -f restfulapi; docker rm -f jobScheduler ; docker pull %s ; docker run -d -p %s:80 --restart always -v /etc/RestfulAPI:/RestfulAPI --name restfulapi %s ; docker run -d -v /dlws-data:/dlws-data -v /etc/RestfulAPI:/RestfulAPI -v /etc/kubernetes/restapi-kubeconfig.yaml:/root/.kube/config -v /etc/kubernetes/ssl:/etc/kubernetes/ssl --restart always --name jobScheduler %s /runScheduler.sh ;" % (config["nfs-server"], dockername,config["restfulapiport"],dockername,dockername))
-
     print "==============================================="
     print "restful api is running at: http://%s:%s" % (masterIP,config["restfulapiport"])
     config["restapi"] = "http://%s:%s" %  (masterIP,config["restfulapiport"])
@@ -1385,7 +1403,6 @@ def deploy_webUI_on_node(ipAddress):
     utils.sudo_scp(config["ssh_cert"],"./deploy/WebUI/dashboardConfig.json","/etc/WebUI/dashboardConfig.json", sshUser, webUIIP )
 
     utils.render_template("./template/WebUI/Master-Templates.json", "./deploy/WebUI/Master-Templates.json", config)
-    #os.system("cp --verbose ./template/WebUI/Master-Templates.json ./deploy/WebUI/Master-Templates.json")
     os.system("cp --verbose ./deploy/WebUI/Master-Templates.json ../WebUI/dotnet/WebPortal/Master-Templates.json")
     utils.sudo_scp(config["ssh_cert"],"./deploy/WebUI/Master-Templates.json","/etc/WebUI/Master-Templates.json", sshUser, webUIIP )
 
@@ -1395,8 +1412,6 @@ def deploy_webUI_on_node(ipAddress):
     utils.sudo_scp(config["ssh_cert"],"./deploy/RestfulAPI/config.yaml","/etc/RestfulAPI/config.yaml", sshUser, webUIIP )
 
 
-    # utils.SSH_exec_cmd(config["ssh_cert"], sshUser, webUIIP, "docker pull %s ; docker rm -f webui ; docker run -d -p %s:80 -v /etc/WebUI:/WebUI --restart always --name webui %s ;" % (dockername,str(config["webuiport"]),dockername))
-
     print "==============================================="
     print "Web UI is running at: http://%s:%s" % (webUIIP,str(config["webuiport"]))
 
@@ -1547,9 +1562,6 @@ def acs_untaint_nodes():
 def acs_post_deploy():
     # set nodes
     get_nodes(config["clusterId"])
-    #print "Master: {0}".format(config["kubernetes_master_node"])
-    #print "Worker: {0}".format(config["worker_node"])
-
     # Label nodes
     acs_label_webui()
     kubernetes_label_nodes("active", [], args.yes)
@@ -1565,9 +1577,6 @@ def acs_post_deploy():
     # get CNI binary
     get_cni_binary()
     # deploy
-    #print config["master_predeploy"]
-    #print config["master_filesdeploy"]
-    #print config["master_postdeploy"]
     deploy_on_nodes(config["master_predeploy"], config["master_filesdeploy"], config["master_postdeploy"],
                     config["kubernetes_master_node"])
     deploy_on_nodes(config["worker_predeploy"], config["worker_filesdeploy"], config["worker_postdeploy"],
@@ -1577,7 +1586,6 @@ def acs_post_deploy():
 def acs_prepare_machines():
     nodes = get_nodes(config["clusterId"])
     for node in nodes:
-        #exec_rmt_cmd(node, "curl -L -sf https://raw.githubusercontent.com/ritazh/acs-k8s-gpu/master/install-nvidia-driver.sh | sudo sh")
         run_script(node, ["./scripts/prepare_ubuntu.sh"], True)
         # restart kubelet incase GPU installed
         utils.SSH_exec_cmd(config["ssh_cert"], config["admin_username"], node, "sudo systemctl restart kubelet.service")
@@ -1603,9 +1611,7 @@ def get_mount_fileshares(curNode = None):
     physicalmountpoint = config["physical-mount-path"]
     storagemountpoint = config["storage-mount-path"]
     mountshares = {}
-    # print(config["mountpoints"])
     for k,v in config["mountpoints"].iteritems():
-        # print("<<<<<<<<<<<<<<<<<<<new mount points:", v)
         if "type" in v:
             if ("mountpoints" in v):
                 if isinstance( v["mountpoints"], basestring):
@@ -1617,7 +1623,6 @@ def get_mount_fileshares(curNode = None):
                     mountpoints = v["mountpoints"]
             else:
                 mountpoints = []
-            # print("-------------mount points---------------:", mountpoints, bHasDefaultMountPoints)
             if len(mountpoints)==0:
                 if bHasDefaultMountPoints:
                     errorMsg = "there are more than one default mount points in configuration. "
@@ -1703,7 +1708,6 @@ def get_mount_fileshares(curNode = None):
                 allmountpoints[k]["mountpoints"] = mountpoints
         else:
             print "Error: fileshare %s with no type" %( k )
-    # print allmountpoints
     return allmountpoints, fstab
 
 def insert_fstab_section( node, secname, content):
@@ -1885,13 +1889,11 @@ def mount_fileshares_by_service(perform_mount=True):
             # insert_fstab_section( node, "DLWS", fstab )
     for k, v in allmountpoints.iteritems():
         allmountpoints[k].pop("accesskey", None)
-    # print mountpoints
     return allmountpoints
 
 def unmount_fileshares_by_service(clean=False):
     all_nodes = get_nodes(config["clusterId"])
     allmountpoints, fstab = get_mount_fileshares()
-    # print fstab
     if True:
         nodes = all_nodes
         for node in nodes:
@@ -1925,7 +1927,6 @@ def del_fileshare_links():
 
 def link_fileshares(allmountpoints, bForce=False):
     all_nodes = get_nodes(config["clusterId"])
-    # print fstab
     if True:
         nodes = all_nodes
         firstdirs = {}
@@ -1987,14 +1988,9 @@ def get_partions_of_node(node, prog):
     if verbose:
         print node
         print output
-    # print output
     drives = prog.search( output )
-    # print(drives)
     drivesInfo = prog.split( output )
-    # print len(drivesInfo)
     ndrives = len(drivesInfo)/2
-    #for i in range(len(drivesInfo)):
-    #    print "Segment %d: %s" %(i, drivesInfo[i])
 
     partinfo = {}
     blockdevice = 1
@@ -2007,10 +2003,7 @@ def get_partions_of_node(node, prog):
         modelName = drivesInfo[i*2][pos_model+7:].splitlines()[0] if pos_model >=0 else "None"
         drivename = drivesInfo[i*2+1] + drivesInfo[i*2+2][:pos_semi]
         driveString = drivesInfo[i*2+2][pos_semi+1:]
-        #print "Drive Name: " + drivename
-        #print "Drive String: " + driveString
         if not (prog.match(drivename) is None):
-            # print driveString
             capacity = parse_capacity_in_GB( driveString )
             lines = driveString.splitlines()
 
@@ -2037,7 +2030,6 @@ def get_partions_of_node(node, prog):
             if capacity > 0 and len(parted)==0:
                 parted[0] = capacity
 
-            # print drivename + " Capacity: " + str(capacity) + " GB, " + str(parted)
             deviceinfo["modelName"] = modelName
             deviceinfo["name"] = drivename
             deviceinfo["capacity"] = capacity
@@ -2078,8 +2070,6 @@ def calculate_partitions( capacity, partitionConfig):
     npart = len(partitionConfig)
     partitionSize = [0.0]*npart
     sumProportion = 0.0
-    #print "Beginning Capacity " + str(capacity)
-    #print partitionSize
     for i in range(npart):
         if partitionConfig[i] < 0.0:
             if capacity > 0.0:
@@ -2089,8 +2079,6 @@ def calculate_partitions( capacity, partitionConfig):
                 partitionSize[i] = 0.0
         else:
             sumProportion += partitionConfig[i]
-    #print "Ending Capacity " + str(capacity)
-    #print partitionSize
     for i in range(npart):
         if partitionConfig[i] >= 0.0:
             if sumProportion == 0.0:
@@ -2112,12 +2100,10 @@ def repartition_nodes(nodes, nodesinfo, partitionConfig):
                 removedPartitions = []
                 for part in existingPartitions:
                     removedPartitions.append(part)
-                # print removedPartitions
                 removedPartitions.sort(reverse=True)
                 for part in removedPartitions:
                     cmd += "sudo parted -s " + deviceinfo["name"] + " rm " + str(part) + "; "
             partitionSize = calculate_partitions( deviceinfo["capacity"], partitionConfig)
-            # print partitionSize
             totalPartitionSize = sum( partitionSize )
             start = 0
             npart = len(partitionSize)
@@ -2170,7 +2156,6 @@ def regmatch_glusterFS( glusterFSargs ):
         regexp = "/dev/[s|h]d[^a]"+str(glusterFSargs)
     else:
         regexp = glusterFSargs
-    #print regexp
     regmatch = re.compile(regexp)
     return regmatch
 
@@ -2180,11 +2165,9 @@ def find_matched_volume( alldeviceinfo, regmatch ):
         deviceinfo = alldeviceinfo[bdevice]
         for part in deviceinfo["parted"]:
             bdevicename = deviceinfo["name"] + str(part)
-            # print bdevicename
             match = regmatch.search(bdevicename)
             if not ( match is None ):
                 deviceList[match.group(0)] = deviceinfo["parted"][part]
-    #print deviceList;
     return deviceList
 
 # Form a configuration file for operation of glusterfs
@@ -2244,8 +2227,6 @@ def stop_glusterFS_endpoint( ):
 
 def format_mount_partition_volume( nodes, deviceSelect, format=True ):
     nodesinfo = get_partitions(nodes, deviceSelect )
-    #if verbose:
-    #    print nodesinfo
     reg = re.compile( deviceSelect )
     for node in nodesinfo:
         alldeviceinfo = nodesinfo[node]
@@ -2269,8 +2250,6 @@ def format_mount_partition_volume( nodes, deviceSelect, format=True ):
 
 def unmount_partition_volume( nodes, deviceSelect ):
     nodesinfo = get_partitions(nodes, deviceSelect )
-    #if verbose:
-    #    print nodesinfo
     reg = re.compile( deviceSelect )
     for node in nodesinfo:
         alldeviceinfo = nodesinfo[node]
@@ -2327,8 +2306,6 @@ def hdfs_config( nodes, deviceSelect):
     if verbose:
         print "HDFS Configuration: %s " % hdfsconfig
     nodesinfo = get_partitions(nodes, deviceSelect )
-    #if verbose:
-    #    print nodesinfo
     reg = re.compile( deviceSelect )
     for node in nodesinfo:
         alldeviceinfo = nodesinfo[node]
@@ -2369,13 +2346,10 @@ def create_glusterFS_volume( nodesinfo, glusterFSargs ):
     utils.render_template_directory("./storage/glusterFS", "./deploy/storage/glusterFS", config, verbose)
     config_glusterFS = write_glusterFS_configuration( nodesinfo, glusterFSargs )
     regmatch = regmatch_glusterFS(glusterFSargs)
-    # print nodesinfo
     for node in nodesinfo:
         alldeviceinfo = nodesinfo[node]
         volumes = find_matched_volume( alldeviceinfo, regmatch )
         print "................. Node %s ................." % node
-        # print volumes
-        # print alldeviceinfo
         remotecmd = ""
         remotecmd += "sudo modprobe dm_thin_pool; "
         remotecmd += "sudo apt-get install -y thin-provisioning-tools; "
@@ -2463,7 +2437,6 @@ def remove_glusterFS_volume( nodesinfo, glusterFSargs ):
             break;
         for volume in volumes:
             remotecmd += "sudo pvremove -y %s; " % volume
-        # print remotecmd
         utils.SSH_exec_cmd( config["ssh_cert"], config["admin_username"], node, remotecmd )
 
 def display_glusterFS_volume( nodesinfo, glusterFSargs ):
@@ -2573,7 +2546,6 @@ def create_mac_dictionary( machineEntry ):
                     add_mac_dictionary(dic, name, mac)
             else:
                 print "Error, machine " + name + ", mac entry is of unknown type: " + str(macs)
-    #print dic
     return dic
 
 def set_host_names_by_lookup():
@@ -2594,7 +2566,6 @@ def set_host_names_by_lookup():
             if len(namelist) > 1:
                 print "Error, machine with mac "+str(macs)+" has more than 1 name entries " +str(namelist)
             elif len(namelist) == 0:
-                # print "Warning, cannot find an entry for machine with mac "+str(macs)
                 hostname = node.split(".")[0]
                 cmd = "sudo hostnamectl set-hostname " + hostname
                 print "Set hostname of node " + node + " to " + hostname
@@ -2699,7 +2670,6 @@ def kubernetes_get_node_name(node):
     if len(domain) < 2:
         kube_node_name = node
     elif domain in node:
-        # print "Remove domain %d" % len(domain)
         kube_node_name = node[:-(len(domain))]
     else:
         kube_node_name = node
@@ -2759,7 +2729,6 @@ def get_service_name(service_config_file):
     except:
         return None
     f.close()
-    # print service_config
     name = fetch_dictionary(service_config, ["metadata","name"])
     if not name is None:
         return name
@@ -2772,13 +2741,11 @@ def get_service_name(service_config_file):
 
 def get_service_yaml( use_service ):
     servicedic = get_all_services()
-    #print    servicedic
     newentries = {}
     for service in servicedic:
         servicename = get_service_name(servicedic[service])
         newentries[servicename] = servicedic[service]
     servicedic.update(newentries)
-    #print servicedic
     fname = servicedic[use_service]
     return fname
 
@@ -2826,17 +2793,12 @@ def get_node_lists_for_service(service):
 # The kubernete node will be marked accordingly to facilitate the running of daemon service.
 def kubernetes_label_nodes( verb, servicelists, force ):
     servicedic = get_all_services()
-    # print servicedic
     get_nodes(config["clusterId"])
     labels = fetch_config(config, ["kubelabels"])
-    # print labels
     for service, serviceinfo in servicedic.iteritems():
         servicename = get_service_name(servicedic[service])
-        # print "Service %s - %s" %(service, servicename )
         if (not service in labels) and (not servicename in labels) and "default" in labels and (not servicename is None):
             labels[servicename] = labels["default"]
-    # print servicelists
-    # print labels
     if len(servicelists)==0:
         servicelists = labels
     else:
@@ -2865,7 +2827,7 @@ def kubernetes_label_nodes( verb, servicelists, force ):
 # Label kubernete nodes with gpu types.skip for CPU workers
 def kubernetes_label_GpuTypes():
     for nodename,nodeInfo in config["machines"].items():
-        if nodeInfo["role"] == "worker" and nodeInfo["gpu-type"] != "NULL":
+        if nodeInfo["role"] == "worker":
             kubernetes_label_node("--overwrite", nodename, "gpuType="+nodeInfo["gpu-type"])
 
 
@@ -2925,7 +2887,6 @@ def stop_one_kube_service(fname):
 
 def start_kube_service( servicename ):
     fname = get_service_yaml( servicename )
-    # print "start service %s with %s" % (servicename, fname)
     dirname = os.path.dirname(fname)
     if os.path.exists(os.path.join(dirname,"launch_order")) and "/" not in servicename:
         with open(os.path.join(dirname,"launch_order"),'r') as f:
@@ -3000,7 +2961,6 @@ def check_buildable_images(nargs):
 def run_docker_image( imagename, native = False, sudo = False ):
     dockerConfig = fetch_config( config, ["docker-run", imagename ])
     full_dockerimage_name, local_dockerimage_name = build_docker_fullname( config, imagename )
-    # print full_dockerimage_name
     matches = find_dockers( full_dockerimage_name )
     if len( matches ) == 0:
         matches = find_dockers( local_dockerimage_name )
@@ -3027,7 +2987,6 @@ def gen_warm_up_cluster_script():
 
 def run_command( args, command, nargs, parser ):
     # If necessary, show parsed arguments.
-    # print args
     global discoverserver
     global homeinserver
     global verbose
@@ -3063,7 +3022,6 @@ def run_command( args, command, nargs, parser ):
 
 
     config_file = os.path.join(dirpath,"config.yaml")
-    # print "Config file: " + config_file
     if not os.path.exists(config_file):
         parser.print_help()
         print "ERROR: config.yaml does not exist!"
@@ -3072,7 +3030,6 @@ def run_command( args, command, nargs, parser ):
     f = open(config_file)
     merge_config(config, yaml.load(f))
     f.close()
-    # print config
     if os.path.exists("./deploy/clusterID.yml"):
         f = open("./deploy/clusterID.yml")
         tmp = yaml.load(f)
@@ -3145,7 +3102,6 @@ def run_command( args, command, nargs, parser ):
     elif command == "connect":
             check_master_ETCD_status()
             role2connect = nargs[0]
-            # print(role2connect, config["ssh_cert"], config["admin_username"])
             if len(nargs) < 1 or role2connect == "master":
                 nodes = config["kubernetes_master_node"]
             elif role2connect in ["etcd", "worker", "nfs", "samba"]:
@@ -3509,7 +3465,6 @@ def run_command( args, command, nargs, parser ):
 
     elif command == "runscriptonall" and len(nargs)>=1:
         nodes = get_nodes(config["clusterId"])
-        # print(nodes)
         run_script_on_all(nodes, nargs, sudo = args.sudo )
 
     elif command == "runscriptonallinparallel" and len(nargs)>=1:
@@ -3526,8 +3481,7 @@ def run_command( args, command, nargs, parser ):
             else:
                 break
         nodes = get_nodes_by_roles(nodeset)
-        # print(nodes)
-        run_script_on_all(nodes, nargs[scripts_start:], sudo = args.sudo )
+        run_script_on_all_in_parallel(nodes, nargs[scripts_start:], sudo = args.sudo )
 
     elif command == "runscriptonrandmaster" and len(nargs)>=1:
         run_script_on_rand_master(nargs, args)
@@ -3680,7 +3634,6 @@ def run_command( args, command, nargs, parser ):
                 servicenames = []
                 for service in allservices:
                     servicenames.append(service)
-                # print servicenames
             generate_hdfs_containermounts()
             configuration( config, verbose )
             if nargs[0] == "start":
@@ -3737,6 +3690,7 @@ def run_command( args, command, nargs, parser ):
         kubernetes_label_GpuTypes()
 
     elif command == "genscripts":
+        gen_platform_wise_config()
         gen_dns_config_script()
         gen_pass_secret_script()
         gen_warm_up_cluster_script()
@@ -3929,32 +3883,32 @@ def upgrade_masters(hypekube_url="gcr.io/google-containers/hyperkube:v1.15.2"):
     deploy_cmd = """
         until curl -q http://127.0.0.1:8080/version/ ; do
             sleep 5;
-            echo 'waiting for master...';
+            echo 'waiting for master kubernetes service...';
         done;
 
         until sudo /opt/bin/kubectl apply -f /opt/addons/kube-addons/weave.yaml --validate=false ; do
             sleep 5;
-            echo 'waiting for master...';
+            echo 'waiting for master kube-addons weave...';
         done ;
 
         until sudo /opt/bin/kubectl apply -f /opt/addons/kube-addons/dashboard.yaml --validate=false ; do
             sleep 5;
-            echo 'waiting for master...';
+            echo 'waiting for master kube-addons dashboard...';
         done ;
 
         until sudo /opt/bin/kubectl apply -f /opt/addons/kube-addons/dns-addon.yml --validate=false ;  do
             sleep 5;
-            echo 'waiting for master...';
+            echo 'waiting for master kube-addons dns-addon...';
         done ;
 
         until sudo /opt/bin/kubectl apply -f /opt/addons/kube-addons/kube-proxy.json --validate=false ;  do
             sleep 5;
-            echo 'waiting for master...';
+            echo 'waiting for master kube-addons kube-proxy...';
         done ;
 
         until sudo /opt/bin/kubectl apply -f /etc/kubernetes/clusterroles/ ;  do
             sleep 5;
-            echo 'waiting for master...';
+            echo 'waiting for master kubernetes clusterroles...';
         done ;
     """
     utils.SSH_exec_cmd(config["ssh_cert"], kubernetes_master_user, kubernetes_masters[0], deploy_cmd , False)
@@ -3962,7 +3916,6 @@ def upgrade_masters(hypekube_url="gcr.io/google-containers/hyperkube:v1.15.2"):
 if __name__ == '__main__':
     # the program always run at the current directory.
     dirpath = os.path.dirname(os.path.abspath(os.path.realpath(__file__)))
-    # print "Directory: " + dirpath
     os.chdir(dirpath)
     parser = argparse.ArgumentParser( prog='deploy.py',
         formatter_class=argparse.RawDescriptionHelpFormatter,
diff --git a/src/ClusterBootstrap/params.py b/src/ClusterBootstrap/params.py
index f0451374b..f843a2499 100755
--- a/src/ClusterBootstrap/params.py
+++ b/src/ClusterBootstrap/params.py
@@ -1,9 +1,12 @@
 # These are the default configuration parameter
 default_config_parameters = {
+    "supported_platform": ["azure_cluster", "onpremise"],
     "allroles": {"infra", "infrastructure", "worker", "nfs", "sql", "dev"},
     # Kubernetes setting
     "service_cluster_ip_range": "10.3.0.0/16",
     "pod_ip_range": "10.2.0.0/16",
+    "ssl_localhost_ips": [ "127.0.0.1", "127.0.1.1" ],
+    "dns_server": {"azure_cluster": '8.8.8.8', 'onpremise':'10.50.10.50'},
     # Home in server, to aide Kubernete setup
     "homeinserver": "http://dlws-clusterportal.westus.cloudapp.azure.com:5000",
     "cloud_influxdb_node": "dlws-influxdb.westus.cloudapp.azure.com",
@@ -626,6 +629,11 @@
         },
     },
 
+    "nfs_client_CIDR": {
+        "node_range": ["192.168.0.0/16"],
+        "samba_range": [],
+    },
+
     "nfs_mnt_setup": [
           {
             "mnt_point": {"rootshare":{"curphysicalmountpoint":"/mntdlws/infranfs","filesharename":"/infradata/share","mountpoints":""}}}
@@ -634,7 +642,6 @@
         "VC-Default":["*"],
     },
     "registry_credential": {},
-    "domain_name": "redmond.corp.microsoft.com",
     "priority": "regular",
     "sku_mapping": {
         "Standard_ND6s":{"gpu-type": "P40","gpu-count": 1},
@@ -671,7 +678,7 @@
         "genscripts",
         "runscriptonroles infra worker ./scripts/dns.sh",
         "-y deploy",
-        "-y updateworker",
+        "-y updateworkerinparallel",
         "-y kubernetes labels",
         "-y gpulabel",
         "kubernetes start nvidia-device-plugin",
diff --git a/src/ClusterBootstrap/scripts/setup_nfs_server.sh b/src/ClusterBootstrap/scripts/setup_nfs_server.sh
deleted file mode 100755
index 48cc90586..000000000
--- a/src/ClusterBootstrap/scripts/setup_nfs_server.sh
+++ /dev/null
@@ -1,10 +0,0 @@
-sudo apt-get update
-sudo apt-get install -y nfs-kernel-server
-
-sudo mkdir -p /data/share 
-sudo chown nobody:nogroup /data/share
-
-echo "/data/share {{cnf["cloud_config"]["vnet_range"]}}(rw,sync,no_subtree_check,no_root_squash)" | sudo tee /etc/exports
-sudo systemctl restart nfs-kernel-server
-
-
diff --git a/src/ClusterBootstrap/bash_step_by_step_deploy.sh b/src/ClusterBootstrap/step_by_step.sh
similarity index 62%
rename from src/ClusterBootstrap/bash_step_by_step_deploy.sh
rename to src/ClusterBootstrap/step_by_step.sh
index a36f4af06..0a2e5443c 100755
--- a/src/ClusterBootstrap/bash_step_by_step_deploy.sh
+++ b/src/ClusterBootstrap/step_by_step.sh
@@ -1,7 +1,12 @@
-./deploy.py -y build
-./az_tools.py create
-./az_tools.py genconfig
-./deploy.py runscriptonroles infra worker ./scripts/prepare_vm_disk.sh
+platform=$1
+if [ $platform == "azure" ]; then
+  ./deploy.py -y build
+  ./az_tools.py create
+  ./az_tools.py genconfig
+  ./deploy.py runscriptonroles infra worker ./scripts/prepare_vm_disk.sh
+elif [ $platform == "onpremise" ]; then
+  echo "make sure that you've run ./deploy.py build and set the correct ssh keys in deploy/sshkey before run this script"
+fi
 ./deploy.py nfs-server create
 ./deploy.py runscriptonroles infra worker ./scripts/prepare_ubuntu.sh
 ./deploy.py runscriptonroles infra worker ./scripts/disable_kernel_auto_updates.sh
@@ -9,7 +14,7 @@
 ./deploy.py genscripts
 ./deploy.py runscriptonroles infra worker ./scripts/dns.sh
 ./deploy.py -y deploy
-./deploy.py -y updateworker
+./deploy.py -y updateworkerinparallel
 ./deploy.py -y kubernetes labels
 ./deploy.py -y gpulabel
 ./deploy.py kubernetes start nvidia-device-plugin
@@ -17,6 +22,10 @@
 ./deploy.py webui
 ./deploy.py docker push restfulapi
 ./deploy.py docker push webui
+./deploy.py docker push watchdog
+./deploy.py docker push gpu-reporter
+./deploy.py docker push reaper
+./deploy.py docker push job-exporter
 ./deploy.py mount
 ./deploy.py kubernetes start mysql
 ./deploy.py kubernetes start jobmanager
diff --git a/src/ClusterBootstrap/template/RestfulAPI/config.yaml b/src/ClusterBootstrap/template/RestfulAPI/config.yaml
index 92f2575f5..26c47b435 100755
--- a/src/ClusterBootstrap/template/RestfulAPI/config.yaml
+++ b/src/ClusterBootstrap/template/RestfulAPI/config.yaml
@@ -33,10 +33,9 @@ webportal_node: {{cnf["webportal_node"]}}
 datasource : {{cnf["datasource"]}}
 kube_custom_scheduler: {{cnf["kube_custom_scheduler"]}}
 WinbindServers: {{cnf["WinbindServers"]}}
-azure_cluster :
-  worker_node_num : {{cnf["azure_cluster"][cnf["cluster_name"]]["worker_node_num"]}}
-  worker_vm_size : {{cnf["azure_cluster"][cnf["cluster_name"]]["worker_vm_size"]}}
-sku_mapping: {{cnf["sku_mapping"]}}
+gpu_count_per_node: {{cnf["gpu_count_per_node"]}}
+worker_node_num: {{cnf["worker_node_num"]}}
+gpu_type: {{cnf["gpu_type"]}}
 defalt_virtual_cluster_name: {{cnf["defalt_virtual_cluster_name"]}}
 {% if cnf["job-manager"] %}
 job-manager:
diff --git a/src/ClusterBootstrap/template/dns/cname_and_private_ips.sh.template b/src/ClusterBootstrap/template/dns/cname_and_private_ips.sh.template
new file mode 100755
index 000000000..09062eb86
--- /dev/null
+++ b/src/ClusterBootstrap/template/dns/cname_and_private_ips.sh.template
@@ -0,0 +1,40 @@
+# Use an immediately reachable IP/DNS name, since DNS records take time to propagate. For workers, the Azure DNS name in fqdns_sorted is preferred; infra machines have fixed public IPs.
+
+# Sort the worker machine names and write their <redmond-renamed DNS name, Azure DNS name> pairs to dns_add_commands
+
+az vmss list-instance-public-ips --name {{cnf["cluster_name"]}}-worker --resource-group {{cnf["cluster_name"]}}ResGrp | grep "fqdn" | awk '{print $2}' | sed 's/[",]//g' > fqdns
+first_line=$(head -n 1 fqdns)
+cat fqdns | cut -d'.' -f1 | cut -d'm' -f2 | sort -n | awk '{printf("vm%s.%s\n", $1, DOMAIN_SUFFIX)}' DOMAIN_SUFFIX=$(echo ${first_line#*.}) > fqdns_sorted
+# get host name (string, like lowpr68f3000000) of each worker node
+rm -rf hostnames && for fqdn in `cat fqdns_sorted`; do ssh -oStrictHostKeyChecking=no -i ./deploy/sshkey/id_rsa core@${fqdn} hostname >> hostnames; done
+paste -d' ' hostnames fqdns_sorted > hostname_fqdn_map
+cat hostname_fqdn_map | awk '{ printf("add %s.{{cnf["domain_name"]}} CNAME %s DLTSPAdmin\n", $1, $2) }' > dns_add_commands
+
+# get infra machine names and public ips, append their <redmond-renamed DNS name, ip> pairs to dns_add_commands
+
+# Use [?contains(virtualMachine.name,'infra')] in the query if only infra machines are wanted.
+# However, since we cannot ping redmond domain names when /etc/resolv.conf is 8.8.8.8, we need to add both infra and nfs DNS entries to hosts.
+az vm list-ip-addresses -g {{cnf["cluster_name"]}}ResGrp --query "[].{Name:virtualMachine.name,pubIP:virtualMachine.network.publicIpAddresses[0].ipAddress, privIP:virtualMachine.network.privateIpAddresses[0]}" -o table | tail -n +3 > infra_ips
+cat infra_ips | awk '{ printf("add %s.{{cnf["domain_name"]}} A %s DLTSPAdmin\n", $1, $2) }' >> dns_add_commands
+
+# ADD HOSTS to all nodes
+
+# get private IP of worker nodes
+# This would not work because the order cannot be guaranteed: az vmss nic list -g lowpriResGrp --vmss-name lowpri-worker --query [].{ip:ipConfigurations[0].privateIpAddress} -o tsv > private_ips
+
+rm -rf private_ips && for fqdn in `cat fqdns_sorted`; do ssh -oStrictHostKeyChecking=no -i ./deploy/sshkey/id_rsa core@${fqdn} hostname -I | awk '{print $1}' >> private_ips;done
+# add worker <private IP, fullname, machine name> triplets
+paste -d' ' private_ips hostnames > worksheet_hosts
+rm -rf  hosts && cat worksheet_hosts | awk '{ printf("%s %s.{{cnf["domain_name"]}} %s\n", $1, $2, $2) }' > hosts
+# add infra <private IP, fullname, machine name> triplets
+cat infra_ips | awk '{ printf("%s %s.{{cnf["domain_name"]}} %s\n", $3, $1, $1) }' >> hosts
+# set hosts file on workers
+workernum=$(wc -l < fqdns_sorted)
+parallel-scp -t 0 -p $workernum -h fqdns_sorted  -x "-oStrictHostKeyChecking=no -oUserKnownHostsFile=/dev/null -i ./deploy/sshkey/id_rsa" -l {{cnf["admin_username"]}} hosts /home/{{cnf["admin_username"]}}
+parallel-ssh -o pssh-log/stdout -e pssh-log/stderr -t 0 -p $workernum -h fqdns_sorted -x "-oStrictHostKeyChecking=no -oUserKnownHostsFile=/dev/null -i deploy/sshkey/id_rsa" -l {{cnf["admin_username"]}} "sudo chmod 666 /etc/hosts && cat /home/{{cnf["admin_username"]}}/hosts >> /etc/hosts && sudo chmod 644 /etc/hosts"
+
+# set hosts file on infras
+infranum=$(wc -l < infra_ips)
+cat infra_ips | awk '{print $2}' > infra_ipv4
+parallel-scp -t 0 -p $infranum -h infra_ipv4  -x "-oStrictHostKeyChecking=no -oUserKnownHostsFile=/dev/null -i ./deploy/sshkey/id_rsa" -l {{cnf["admin_username"]}} hosts /home/{{cnf["admin_username"]}}
+parallel-ssh -o pssh-log/stdout -e pssh-log/stderr -t 0 -p $infranum -h infra_ipv4 -x "-oStrictHostKeyChecking=no -oUserKnownHostsFile=/dev/null -i deploy/sshkey/id_rsa" -l {{cnf["admin_username"]}} "sudo chmod 666 /etc/hosts && cat /home/{{cnf["admin_username"]}}/hosts >> /etc/hosts && sudo chmod 644 /etc/hosts"
\ No newline at end of file
diff --git a/src/ClusterBootstrap/template/dns/dns.sh.template b/src/ClusterBootstrap/template/dns/dns.sh.template
index f0d910661..06600d739 100755
--- a/src/ClusterBootstrap/template/dns/dns.sh.template
+++ b/src/ClusterBootstrap/template/dns/dns.sh.template
@@ -2,6 +2,6 @@ sudo systemctl disable systemd-resolved.service
 sudo systemctl stop systemd-resolved
 # echo "dns=default" | sudo tee -a /etc/NetworkManager/NetworkManager.conf
 sudo rm /etc/resolv.conf
-echo "nameserver 8.8.8.8" | sudo tee -a /etc/resolv.conf
-echo 'search {{cnf["azure_cluster"][cnf["cluster_name"]]["azure_location"]}}.cloudapp.azure.com' | sudo tee -a /etc/resolv.conf
+echo "nameserver {{cnf["dns_server"][cnf["platform_type"]]}}" | sudo tee -a /etc/resolv.conf
+echo 'search {{cnf["network_domain"]}}' | sudo tee -a /etc/resolv.conf
 # sudo service network-manager restart
\ No newline at end of file
diff --git a/src/ClusterBootstrap/template/etcd/docker_etcd.sh b/src/ClusterBootstrap/template/etcd/docker_etcd.sh
index 9097f69eb..04a2b1888 100755
--- a/src/ClusterBootstrap/template/etcd/docker_etcd.sh
+++ b/src/ClusterBootstrap/template/etcd/docker_etcd.sh
@@ -6,10 +6,16 @@ docker run -d -v /usr/share/ca-certificates/:/etc/ssl/certs -v /var/etcd:/var/et
  --restart always \
  --name etcd dlws/etcd:3.1.10 /usr/local/bin/etcd \
  -name $HOSTNAME \
- -advertise-client-urls http://{{cnf["etcd_node_ip"]}}:{{cnf["etcd3port1"]}} \
- -listen-client-urls http://0.0.0.0:{{cnf["etcd3port1"]}} \
- -initial-advertise-peer-urls http://{{cnf["etcd_node_ip"]}}:{{cnf["etcd3portserver"]}} \
- -listen-peer-urls http://0.0.0.0:2380 \
+ {% if cnf["etcd_node_num"] == 1 %}-initial-cluster {{cnf["hostname"]}}=https://{{cnf["etcd_private_ip"]}}:{{cnf["etcd3portserver"]}} \
+ -initial-cluster-state new \
+ -initial-cluster-token {{cnf["clusterId"]}} \
+ -advertise-client-urls https://{{cnf["etcd_private_ip"]}}:{{cnf["etcd3port1"]}} \
+ -listen-client-urls https://0.0.0.0:{{cnf["etcd3port1"]}} \
+ -initial-advertise-peer-urls https://{{cnf["etcd_private_ip"]}}:{{cnf["etcd3portserver"]}} \
+ -listen-peer-urls https://0.0.0.0:{{cnf["etcd3portserver"]}} \
+ {% else %}-advertise-client-urls https://{{cnf["etcd_node_ip"]}}:{{cnf["etcd3port1"]}} \
+ -listen-client-urls https://0.0.0.0:{{cnf["etcd3port1"]}} \
+ -initial-advertise-peer-urls https://{{cnf["etcd_node_ip"]}}:{{cnf["etcd3portserver"]}} \
+ -listen-peer-urls https://0.0.0.0:{{cnf["etcd3portserver"]}} \
  -discovery {{cnf["discovery_url"]}} \
- -data-dir /var/etcd/data 
-
+ {% endif %}-data-dir /var/etcd/data \
\ No newline at end of file
diff --git a/src/ClusterBootstrap/template/etcd/docker_etcd_ssl.sh b/src/ClusterBootstrap/template/etcd/docker_etcd_ssl.sh
index 285866890..1d640dd8b 100755
--- a/src/ClusterBootstrap/template/etcd/docker_etcd_ssl.sh
+++ b/src/ClusterBootstrap/template/etcd/docker_etcd_ssl.sh
@@ -9,12 +9,19 @@ docker run -d -v /usr/share/ca-certificates/mozilla:/etc/ssl/certs -v /etc/etcd/
  --restart always \
  --name philly-etcd3 dlws/etcd:3.1.10 /usr/local/bin/etcd \
  -name $HOSTNAME \
- -advertise-client-urls https://{{cnf["etcd_node_ip"]}}:{{cnf["etcd3port1"]}} \
+ {% if cnf["etcd_node_num"] == 1 %}-initial-cluster {{cnf["hostname"]}}=https://{{cnf["etcd_private_ip"]}}:{{cnf["etcd3portserver"]}} \
+ -initial-cluster-state new \
+ -initial-cluster-token {{cnf["clusterId"]}} \
+ -advertise-client-urls https://{{cnf["etcd_private_ip"]}}:{{cnf["etcd3port1"]}} \
+ -listen-client-urls https://0.0.0.0:{{cnf["etcd3port1"]}} \
+ -initial-advertise-peer-urls https://{{cnf["etcd_private_ip"]}}:{{cnf["etcd3portserver"]}} \
+ -listen-peer-urls https://0.0.0.0:{{cnf["etcd3portserver"]}} \
+ {% else %}-advertise-client-urls https://{{cnf["etcd_node_ip"]}}:{{cnf["etcd3port1"]}} \
  -listen-client-urls https://0.0.0.0:{{cnf["etcd3port1"]}} \
  -initial-advertise-peer-urls https://{{cnf["etcd_node_ip"]}}:{{cnf["etcd3portserver"]}} \
  -listen-peer-urls https://0.0.0.0:{{cnf["etcd3portserver"]}} \
  -discovery {{cnf["discovery_url"]}} \
- -data-dir /var/etcd/data \
+ {% endif %}-data-dir /var/etcd/data \
  -client-cert-auth \
  -trusted-ca-file=/etc/etcd/ssl/ca.pem \
  -cert-file=/etc/etcd/ssl/etcd.pem \
@@ -22,4 +29,4 @@ docker run -d -v /usr/share/ca-certificates/mozilla:/etc/ssl/certs -v /etc/etcd/
  -peer-client-cert-auth \
  -peer-trusted-ca-file=/etc/etcd/ssl/ca.pem \
  -peer-cert-file=/etc/etcd/ssl/etcd.pem \
- -peer-key-file=/etc/etcd/ssl/etcd-key.pem
+ -peer-key-file=/etc/etcd/ssl/etcd-key.pem
\ No newline at end of file
diff --git a/src/ClusterBootstrap/template/etcd/etcd3.service b/src/ClusterBootstrap/template/etcd/etcd3.service
index 07aee7caf..5784ebe40 100755
--- a/src/ClusterBootstrap/template/etcd/etcd3.service
+++ b/src/ClusterBootstrap/template/etcd/etcd3.service
@@ -1,28 +1,35 @@
-[Service]
-ExecStart=/usr/bin/docker run -v /usr/share/ca-certificates/mozilla:/etc/ssl/certs -v /etc/etcd/ssl:/etc/etcd/ssl -v /var/etcd:/var/etcd -p {{cnf["etcd3port1"]}}:{{cnf["etcd3port1"]}} -p {{cnf["etcd3portserver"]}}:{{cnf["etcd3portserver"]}} \
-  --net=host \
-  --name etcd3 {{cnf["dockers"]["container"]["etcd"]["fullname"]}} /usr/local/bin/etcd \
-  -name {{cnf["hostname"]}} \
-  -advertise-client-urls https://{{cnf["etcd_node_ip"]}}:{{cnf["etcd3port1"]}} \
-  -listen-client-urls https://0.0.0.0:{{cnf["etcd3port1"]}} \
-  -initial-advertise-peer-urls https://{{cnf["etcd_node_ip"]}}:{{cnf["etcd3portserver"]}} \
-  -listen-peer-urls https://0.0.0.0:{{cnf["etcd3portserver"]}} \
-  -discovery {{cnf["discovery_url"]}} \
-  -data-dir /var/etcd/data \
-  -client-cert-auth \
-  -trusted-ca-file=/etc/etcd/ssl/ca.pem \
-  -cert-file=/etc/etcd/ssl/etcd.pem \
-  -key-file=/etc/etcd/ssl/etcd-key.pem \
-  -peer-client-cert-auth \
-  -peer-trusted-ca-file=/etc/etcd/ssl/ca.pem \
-  -peer-cert-file=/etc/etcd/ssl/etcd.pem \
-  -peer-key-file=/etc/etcd/ssl/etcd-key.pem
-
-
-
-
-Restart=always
-RestartSec=5
-
-[Install]
+[Service]
+ExecStart=/usr/bin/docker run -v /usr/share/ca-certificates/mozilla:/etc/ssl/certs -v /etc/etcd/ssl:/etc/etcd/ssl -v /var/etcd:/var/etcd -p {{cnf["etcd3port1"]}}:{{cnf["etcd3port1"]}} -p {{cnf["etcd3portserver"]}}:{{cnf["etcd3portserver"]}} \
+  --net=host \
+  --name etcd3 {{cnf["dockers"]["container"]["etcd"]["fullname"]}} /usr/local/bin/etcd \
+  -name {{cnf["hostname"]}} \
+  {% if cnf["etcd_node_num"] == 1 %}-initial-cluster {{cnf["hostname"]}}=https://{{cnf["etcd_private_ip"]}}:{{cnf["etcd3portserver"]}} \
+  -initial-cluster-state new \
+  -initial-cluster-token {{cnf["clusterId"]}} \
+  -advertise-client-urls https://{{cnf["etcd_private_ip"]}}:{{cnf["etcd3port1"]}} \
+  -listen-client-urls https://0.0.0.0:{{cnf["etcd3port1"]}} \
+  -initial-advertise-peer-urls https://{{cnf["etcd_private_ip"]}}:{{cnf["etcd3portserver"]}} \
+  -listen-peer-urls https://0.0.0.0:{{cnf["etcd3portserver"]}} \
+  {% else %}-advertise-client-urls https://{{cnf["etcd_node_ip"]}}:{{cnf["etcd3port1"]}} \
+  -listen-client-urls https://0.0.0.0:{{cnf["etcd3port1"]}} \
+  -initial-advertise-peer-urls https://{{cnf["etcd_node_ip"]}}:{{cnf["etcd3portserver"]}} \
+  -listen-peer-urls https://0.0.0.0:{{cnf["etcd3portserver"]}} \
+  -discovery {{cnf["discovery_url"]}} \
+  {% endif %}-data-dir /var/etcd/data \
+  -client-cert-auth \
+  -trusted-ca-file=/etc/etcd/ssl/ca.pem \
+  -cert-file=/etc/etcd/ssl/etcd.pem \
+  -key-file=/etc/etcd/ssl/etcd-key.pem \
+  -peer-client-cert-auth \
+  -peer-trusted-ca-file=/etc/etcd/ssl/ca.pem \
+  -peer-cert-file=/etc/etcd/ssl/etcd.pem \
+  -peer-key-file=/etc/etcd/ssl/etcd-key.pem
+
+
+
+
+Restart=always
+RestartSec=5
+
+[Install]
 WantedBy=multi-user.target
\ No newline at end of file
diff --git a/src/ClusterBootstrap/template/kube-addons/weave.yaml b/src/ClusterBootstrap/template/kube-addons/weave.yaml
index 9ef1d9021..929cd5bb8 100755
--- a/src/ClusterBootstrap/template/kube-addons/weave.yaml
+++ b/src/ClusterBootstrap/template/kube-addons/weave.yaml
@@ -17,7 +17,7 @@ items:
       labels:
         name: weave-net
       namespace: kube-system
-  - apiVersion: rbac.authorization.k8s.io/v1beta1
+  - apiVersion: rbac.authorization.k8s.io/v1
     kind: ClusterRole
     metadata:
       name: weave-net
@@ -58,7 +58,7 @@ items:
         verbs:
           - patch
           - update
-  - apiVersion: rbac.authorization.k8s.io/v1beta1
+  - apiVersion: rbac.authorization.k8s.io/v1
     kind: ClusterRoleBinding
     metadata:
       name: weave-net
@@ -81,7 +81,7 @@ items:
       - kind: ServiceAccount
         name: weave-net
         namespace: kube-system
-  - apiVersion: rbac.authorization.k8s.io/v1beta1
+  - apiVersion: rbac.authorization.k8s.io/v1
     kind: Role
     metadata:
       name: weave-net
diff --git a/src/ClusterBootstrap/template/kubelet/daemon.json b/src/ClusterBootstrap/template/kubelet/daemon.json
index 78fb7af09..6a9bbeb7c 100755
--- a/src/ClusterBootstrap/template/kubelet/daemon.json
+++ b/src/ClusterBootstrap/template/kubelet/daemon.json
@@ -1,4 +1,4 @@
-{% if cnf["azure_cluster"][cnf["cluster_name"]]["worker_vm_size"] in cnf["sku_mapping"] %}
+{% if cnf["gpu_type"] != "None" %}
 {
     "default-runtime": "nvidia",
     "runtimes": {
@@ -10,5 +10,4 @@
 }
 {% else %}
 {}
-{% endif %}
-
+{% endif %}
\ No newline at end of file
diff --git a/src/ClusterBootstrap/template/nfs/nfs_config.sh.template b/src/ClusterBootstrap/template/nfs/nfs_config.sh.template
index fa4ba2b17..64c7d4555 100755
--- a/src/ClusterBootstrap/template/nfs/nfs_config.sh.template
+++ b/src/ClusterBootstrap/template/nfs/nfs_config.sh.template
@@ -1,3 +1,4 @@
+{% if cnf["platform_type"] == "azure_cluster" %}
 sudo parted -l 2>&1 >/dev/null | awk -F': ' '{print $2}' > unlabeled_disk_file
 # Partition
 for disk in `cat unlabeled_disk_file`; do printf "n\n1\n\n\n8e00\nw\nY\n" | sudo gdisk ${disk}; done
@@ -9,6 +10,7 @@ sudo vgcreate dlts-data-lvm ${pv_list}
 sudo lvcreate -l 100%FREE -n dlts-data-lvm-vol1 dlts-data-lvm
 sudo mkfs.ext4 /dev/mapper/dlts--data--lvm-dlts--data--lvm--vol1
 echo "UUID=$(sudo blkid | grep dlts | sed -n 's/.*UUID=\"\(.*\)\" TYPE.*/\1/p')     {{cnf["path"]}}   ext4   defaults,discard      0 0" | sudo tee -a /etc/fstab
+{% endif %}
 sudo mkdir -p {{cnf["path"]}}
 sudo mount {{cnf["path"]}}
 
@@ -20,8 +22,12 @@ sudo apt-get install -y nfs-kernel-server
 sudo mkdir -p {{ fileshare }}
 sudo chmod -R 777 {{ fileshare }}
 sudo chown nobody:nogroup {{fileshare}}
-echo "{{ fileshare }} {{cnf["cloud_config"]["vnet_range"]}}(rw,sync,no_subtree_check,no_root_squash)" | sudo tee -a /etc/exports
-echo "{{ fileshare }} {{cnf["cloud_config"]["samba_range"]}}(rw,fsid=1,nohide,insecure,sync,no_subtree_check,no_root_squash)" | sudo tee -a /etc/exports
+{% for range in cnf["nfs_client_CIDR"]["node_range"] %}
+echo "{{ fileshare }} {{range}}(rw,sync,no_subtree_check,no_root_squash)" | sudo tee -a /etc/exports
+{% endfor %}
+{% for range in cnf["nfs_client_CIDR"]["samba_range"] %}
+echo "{{ fileshare }} {{range}}(rw,fsid=1,nohide,insecure,sync,no_subtree_check,no_root_squash)" | sudo tee -a /etc/exports
+{% endfor %}
 {% endfor %}
 
 # Get number of CPU
diff --git a/src/ClusterBootstrap/template/secret/pass_secret.sh.template b/src/ClusterBootstrap/template/secret/pass_secret.sh.template
index 1f198bc65..ddf9bb32e 100755
--- a/src/ClusterBootstrap/template/secret/pass_secret.sh.template
+++ b/src/ClusterBootstrap/template/secret/pass_secret.sh.template
@@ -1,6 +1,6 @@
 {% for regi_name, regi_cred in cnf["registry_credential"].items() %}   
 docker login {{ regi_name }} -u {{ regi_cred["username"] }} -p {{ regi_cred["password"] }}
 {% endfor %}
-chown -R {{cnf["cloud_config"]["default_admin_username"]}}:{{cnf["cloud_config"]["default_admin_username"]}} /home/{{cnf["cloud_config"]["default_admin_username"]}}/.docker/
-chown -R {{cnf["cloud_config"]["default_admin_username"]}}:{{cnf["cloud_config"]["default_admin_username"]}} /home/{{cnf["cloud_config"]["default_admin_username"]}}/.kube/
-/opt/bin/kubectl create secret generic regcred --from-file=.dockerconfigjson=/home/{{cnf["cloud_config"]["default_admin_username"]}}/.docker/config.json --type=kubernetes.io/dockerconfigjson --dry-run -o yaml | /opt/bin/kubectl apply -f -
\ No newline at end of file
+chown -R {{cnf["admin_username"]}}:{{cnf["admin_username"]}} /home/{{cnf["admin_username"]}}/.docker/  # give the admin user ownership of the docker login state written above
+chown -R {{cnf["admin_username"]}}:{{cnf["admin_username"]}} /home/{{cnf["admin_username"]}}/.kube/  # same ownership change for the admin user's .kube directory
+/opt/bin/kubectl create secret generic regcred --from-file=.dockerconfigjson=/home/{{cnf["admin_username"]}}/.docker/config.json --type=kubernetes.io/dockerconfigjson --dry-run -o yaml | /opt/bin/kubectl apply -f -  # render the regcred secret from docker's config.json as YAML and pipe it to kubectl apply
\ No newline at end of file
diff --git a/src/ClusterBootstrap/template/vmss/vmss.sh.template b/src/ClusterBootstrap/template/vmss/vmss.sh.template
new file mode 100755
index 000000000..99313c778
--- /dev/null
+++ b/src/ClusterBootstrap/template/vmss/vmss.sh.template
@@ -0,0 +1,25 @@
+# Create the worker VM scale set for this cluster with `az vmss create`.
+# Option order: identity/image/size, login, networking, storage, scaling/priority.
+az vmss create \
+    --resource-group {{cnf["cluster_name"]}}ResGrp \
+    --name {{cnf["cluster_name"].lower()}}-worker \
+    --image {{cnf["azure_cluster"]["vm_image"]}} \
+    --vm-sku {{cnf["azure_cluster"]["worker_vm_size"]}} \
+    --admin-username {{cnf["admin_username"]}} \
+    --generate-ssh-keys \
+    --ssh-key-values ./deploy/sshkey/id_rsa.pub \
+    --vnet-name {{cnf["cluster_name"]}}-VNet \
+    --subnet mySubnet \
+    --nsg {{cnf["cluster_name"]}}-nsg \
+    --public-ip-per-vm \
+    --public-ip-address-dns-name {{cnf["cluster_name"].lower()}}-worker \
+    --public-ip-address-allocation static \
+    --vm-domain-name {{cnf["cluster_name"].lower()}} \
+    --storage-sku Premium_LRS \
+    --data-disk-sizes-gb {{cnf["azure_cluster"]["worker_local_storage_sz"]}} \
+    --data-disk-caching ReadWrite \
+    --instance-count {{cnf["azure_cluster"]["worker_node_num"]}} \
+    --disable-overprovision \
+    --priority {{cnf["priority"]}} \
+    --eviction-policy {{cnf["azure_cluster"]["eviction_policy"]}} \
+    --single-placement-group {{cnf["azure_cluster"]["single_placement_group"]}}
\ No newline at end of file
diff --git a/src/ClusterBootstrap/utils.py b/src/ClusterBootstrap/utils.py
index 6742b6854..ba636e727 100755
--- a/src/ClusterBootstrap/utils.py
+++ b/src/ClusterBootstrap/utils.py
@@ -351,13 +351,16 @@ def SSH_exec_script( identity_file, user, host, script, supressWarning = False,
 
 
 def get_ETCD_discovery_URL(size):
+    if size == 1:  # single-node etcd: skip the request to discovery.etcd.io entirely
+            output = "we don't use discovery url for 1 node etcd"  # placeholder text returned in place of a discovery URL
+    else:
         try:
             output = urllib.urlopen("https://discovery.etcd.io/new?size=%d" % size ).read()
             if not "https://discovery.etcd.io" in output:
                 raise Exception("ERROR: we cannot get etcd discovery url from 'https://discovery.etcd.io/new?size=%d', got message %s" % (size,output)) 
         except Exception as e:
             raise Exception("ERROR: we cannot get etcd discovery url from 'https://discovery.etcd.io/new?size=%d'" % size) 
-        return output
+    return output
 
 
 def get_cluster_ID_from_file():
@@ -622,3 +625,44 @@ def mergeDict(configDst, configSrc, bOverwrite):
         elif isinstance(configSrc[entry], dict) and isinstance(configDst[entry], dict):
             mergeDict(configDst[entry], configSrc[entry], bOverwrite)
 
+def ip2int(addr):
+    """Convert a dotted-quad IPv4 string into its unsigned 32-bit integer value."""
+    import socket  # local import keeps this helper self-contained
+    import struct
+    return struct.unpack("!I", socket.inet_aton(addr))[0]
+
+def mask_num(valid_bit):
+    """Return the netmask integer whose top valid_bit bits are 1 (prefix length -> mask)."""
+    return int('1'*valid_bit+'0'*(32 - valid_bit), 2)
+
+def remain_num(valid_bit):
+    """Return the host-part mask: the bits left over after the top valid_bit bits."""
+    return int('0'*valid_bit+'1'*(32 - valid_bit), 2)
+
+def check_covered_by_ipvals(ipvals, masked2check):
+    """Return True if masked2check equals one of the network values in ipvals."""
+    return any(wider_ipval == masked2check for wider_ipval in ipvals)
+
+def check_covered_by_wider_ips(mask2ip, ipval2check, mask4ipval):
+    """Return True if a recorded network with a shorter prefix contains ipval2check."""
+    for msk in mask2ip:
+        # a shorter prefix length means a wider subnet
+        if msk < mask4ipval and check_covered_by_ipvals(mask2ip[msk], ipval2check & mask_num(msk)):
+            return True
+    return False
+
+def keep_widest_subnet(ips):
+    """Return the CIDR strings of ips not contained in a wider one, widest first, no duplicates."""
+    kept = []
+    mask2ip = {}
+    # Sort on the number after "/": the last two characters of "10.0.0.0/8" are "/8", not a number.
+    for ip in sorted(ips, key=lambda x: int(x.rsplit("/", 1)[1])):
+        ipv4, mask = ip.split("/")
+        mask = int(mask)
+        ipval = ip2int(ipv4)
+        assert (remain_num(mask) & ipval) == 0, "invalid ip/mask {}!".format(ip)
+        if ip in kept or check_covered_by_wider_ips(mask2ip, ipval, mask):
+            continue
+        mask2ip.setdefault(mask, set()).add(ipval)
+        kept.append(ip)
+    return kept
\ No newline at end of file
diff --git a/src/utils/MySQLDataHandler.py b/src/utils/MySQLDataHandler.py
index 4a4395527..a8bfadcc4 100755
--- a/src/utils/MySQLDataHandler.py
+++ b/src/utils/MySQLDataHandler.py
@@ -185,11 +185,10 @@ def CreateTable(self):
             # when the VC has vm of same GPU type but different VMsizes, e.g., when VC has Standard_NC6s_v3 and Standard_NC12s_v3 both?
             # impossible since there's no way to do it with current config mechanism
 
-            worker_cnt = int(config["azure_cluster"]["worker_node_num"])
-            sku_mapping = config["sku_mapping"]
-            sku = sku_mapping.get(config["azure_cluster"]["worker_vm_size"],sku_mapping["default"])
-            n_gpu_pernode = sku["gpu-count"]
-            gpu_type = sku["gpu-type"]
+            # Coerce the counts to int so the quota below is an arithmetic product even if config holds strings.
+            gpu_count_per_node = int(config["gpu_count_per_node"])
+            worker_node_num = int(config["worker_node_num"])
+            gpu_type = config["gpu_type"]
             sql = """
                 CREATE TABLE IF NOT EXISTS  `%s`
                 (
@@ -203,7 +202,7 @@ def CreateTable(self):
                     CONSTRAINT `hierarchy` FOREIGN KEY (`parent`) REFERENCES `%s` (`vcName`)
                 )
                 AS SELECT \'%s\' AS vcName, NULL AS parent, '{\\\"%s\\\":%s}' AS quota, '{\\\"%s\\\":{\\\"num_gpu_per_node\\\":%s}}' AS metadata;
-                """ % (self.vctablename, self.vctablename, config['defalt_virtual_cluster_name'], gpu_type, n_gpu_pernode*worker_cnt, gpu_type,n_gpu_pernode)
+                """ % (self.vctablename, self.vctablename, config['defalt_virtual_cluster_name'], gpu_type, gpu_count_per_node*worker_node_num, gpu_type,gpu_count_per_node)
 
             cursor = self.conn.cursor()
             cursor.execute(sql)