Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

fix kubeflow pipeline #4767

Merged
merged 29 commits into from
Apr 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
4779d7f
add namespace for kubeflow
J-shang Apr 13, 2022
dc1c7da
update
J-shang Apr 13, 2022
a3b4270
update
J-shang Apr 13, 2022
f18b2f3
update pipeline
J-shang Apr 13, 2022
e276f5e
fix kubeflow reuse mode
J-shang Apr 13, 2022
5bee4af
extend aml waiting time
J-shang Apr 14, 2022
afa8865
fix comments & remove resnet18 from cifar10 search space
J-shang Apr 14, 2022
85acc83
fix comments
J-shang Apr 14, 2022
596438c
revert aml port waiting time & fix bug
J-shang Apr 14, 2022
fa91271
fix card link
J-shang Apr 14, 2022
6cccfb1
fix comments
J-shang Apr 14, 2022
8d9eddf
Merge branch 'v2.7' into namespace
J-shang Apr 14, 2022
4231de8
fix comments
J-shang Apr 14, 2022
6f28569
Merge branch 'namespace' of https://github.com/J-shang/nni into namespace
J-shang Apr 14, 2022
de3b660
update chinese link
J-shang Apr 14, 2022
ece7e38
fix framework controller
J-shang Apr 14, 2022
f262e5c
update fc reuse
J-shang Apr 14, 2022
1090416
update pipeline config
J-shang Apr 14, 2022
578eeed
update print trial log
J-shang Apr 14, 2022
9a88a3e
unify trial log folder
J-shang Apr 14, 2022
952eb13
disable frameworkbarrier
J-shang Apr 15, 2022
c2db770
test openpai
J-shang Apr 15, 2022
d99caab
update
J-shang Apr 15, 2022
fd710e0
rm test code
J-shang Apr 16, 2022
6f97880
add comments for disable frameworkbarrier
J-shang Apr 18, 2022
3c7f759
Merge remote-tracking branch 'upstream/v2.7' into namespace
J-shang Apr 18, 2022
77bfd8c
update release date
J-shang Apr 18, 2022
35aa66a
revert frameworkbarrier
J-shang Apr 18, 2022
30b65dc
revert test code
J-shang Apr 18, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ NNI makes AutoML techniques plug-and-play
.. codesnippetcard::
:icon: ../img/thumbnails/quantization-small.svg
:title: Quantization
:link: tutorials/quantization_speedup
:link: tutorials/quantization_quick_start_mnist

.. code-block::

Expand Down
3 changes: 2 additions & 1 deletion docs/source/index_zh.rst
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
.. f2a86f83def6c4b2e35ba50ce2487deb
.. dbd41cab307bcd76cc747b3d478709b8


NNI 文档
=================
Expand Down
2 changes: 1 addition & 1 deletion docs/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
Change Log
==========

Release 2.7 - 4/14/2022
Release 2.7 - 4/18/2022
-----------------------

Documentation
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ class FrameworkControllerConfig(TrainingServiceConfig):
service_account_name: Optional[str]
task_roles: List[FrameworkControllerRoleConfig]
reuse_mode: Optional[bool] = True
namespace: str = 'default'

def _canonicalize(self, parents):
super()._canonicalize(parents)
Expand Down
1 change: 1 addition & 0 deletions nni/experiment/config/training_services/kubeflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ class KubeflowConfig(TrainingServiceConfig):
ps: Optional[KubeflowRoleConfig] = None
master: Optional[KubeflowRoleConfig] = None
reuse_mode: Optional[bool] = True #set reuse mode as true for v2 config
namespace: str = 'default'

def _canonicalize(self, parents):
super()._canonicalize(parents)
Expand Down
2 changes: 2 additions & 0 deletions nni/tools/nnictl/config_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,7 @@ def validate(self, data):
'path': setType('path', str)
},
Optional('reuse'): setType('reuse', bool),
Optional('namespace'): setType('namespace', str),
}, {
'operator': setChoice('operator', 'tf-operator', 'pytorch-operator'),
'apiVersion': setType('apiVersion', str),
Expand All @@ -377,6 +378,7 @@ def validate(self, data):
},
Optional('uploadRetryCount'): setNumberRange('uploadRetryCount', int, 1, 99999),
Optional('reuse'): setType('reuse', bool),
Optional('namespace'): setType('namespace', str),
})
}

Expand Down
2 changes: 1 addition & 1 deletion test/config/examples/cifar10_search_space.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"lr":{"_type":"choice", "_value":[0.1, 0.01, 0.001, 0.0001]},
"optimizer":{"_type":"choice", "_value":["SGD", "Adadelta", "Adagrad", "Adam", "Adamax"]},
"model":{"_type":"choice", "_value":["vgg", "resnet18"]}
"model":{"_type":"choice", "_value":["vgg"]}
liuzhe-lz marked this conversation as resolved.
Show resolved Hide resolved
}
4 changes: 3 additions & 1 deletion test/config/training_service.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ kubeflow:
azureStorage:
accountName:
azureShare:
namespace: kubeflow
trial:
worker:
replicas: 1
Expand All @@ -35,14 +36,15 @@ frameworkcontroller:
maxTrialNum: 2
trialConcurrency: 2
frameworkcontrollerConfig:
serviceAccountName: frameworkbarrier
serviceAccountName: frameworkcontroller
storage: azureStorage
keyVault:
vaultName:
name:
azureStorage:
accountName:
azureShare:
namespace: kubeflow
trial:
taskRoles:
- name: worker
Expand Down
2 changes: 2 additions & 0 deletions test/config/training_service_v2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ kubeflow:
trainingService:
reuseMode: true
platform: kubeflow
namespace: kubeflow
worker:
command:
code_directory:
Expand All @@ -44,6 +45,7 @@ frameworkcontroller:
trainingService:
reuseMode: true
platform: frameworkcontroller
namespace: kubeflow
serviceAccountName: frameworkcontroller
taskRoles:
- name: worker
Expand Down
14 changes: 10 additions & 4 deletions test/nni_test/nnitest/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,12 +122,18 @@ def print_file_content(filepath):
print(content, flush=True)

def print_trial_job_log(training_service, trial_jobs_url):
trial_jobs = get_trial_jobs(trial_jobs_url)
for trial_job in trial_jobs:
trial_log_dir = os.path.join(get_experiment_dir(EXPERIMENT_URL), 'trials', trial_job['trialJobId'])
trial_log_root = os.path.join(get_experiment_dir(EXPERIMENT_URL), 'trials')
if not os.path.exists(trial_log_root):
print('trial log folder does not exist: {}'.format(trial_log_root), flush=True)
return
folders = os.listdir(trial_log_root)
for name in folders:
trial_log_dir = os.path.join(trial_log_root, name)
log_files = ['stderr', 'trial.log'] if training_service == 'local' else ['stdout_log_collection.log']
for log_file in log_files:
print_file_content(os.path.join(trial_log_dir, log_file))
log_file_path = os.path.join(trial_log_dir, log_file)
if os.path.exists(log_file_path):
print_file_content(log_file_path)

def print_experiment_log(experiment_id):
log_dir = get_nni_log_dir(experiment_id=experiment_id)
Expand Down
3 changes: 2 additions & 1 deletion ts/nni_manager/common/experimentConfig.ts
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ export interface KubeflowConfig extends TrainingServiceConfig {
master?: KubeflowRoleConfig;
reuseMode: boolean;
maxTrialNumberPerGpu?: number;
namespace?: string;
}

export interface FrameworkControllerTaskRoleConfig {
Expand All @@ -156,7 +157,7 @@ export interface FrameworkControllerConfig extends TrainingServiceConfig {
taskRoles: FrameworkControllerTaskRoleConfig[];
reuseMode: boolean;
maxTrialNumberPerGpu?: number;
namespace?: 'default';
namespace?: string;
apiVersion?: string;
}

Expand Down
3 changes: 2 additions & 1 deletion ts/nni_manager/config/aml/amlUtil.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@
print('stop_result:failed')
exit(0)
loop_count += 1
time.sleep(500)
time.sleep(5)
liuzhe-lz marked this conversation as resolved.
Show resolved Hide resolved
status = run.get_status()
print('stop_result:success')
exit(0)
elif line == 'receive':
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ class AdlClientV1 extends KubernetesCRDClient {
/**
* constructor, to initialize adl CRD definition
*/
protected readonly namespace: string;
public readonly namespace: string;

public constructor(namespace: string) {
super();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
} else {
configTaskRoles = this.parseCustomTaskRoles(this.fcTemplate.spec.taskRoles)
}
const namespace = this.fcClusterConfig.namespace ? this.fcClusterConfig.namespace : "default";
const namespace = this.fcClusterConfig.namespace ?? "default";
this.genericK8sClient.setNamespace = namespace;

if (this.kubernetesRestServerPort === undefined) {
Expand All @@ -134,7 +134,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
const trialJobId: string = uniqueString(5);
// Set trial's NFS working folder
const trialWorkingFolder: string = path.join(this.CONTAINER_MOUNT_PATH, 'nni', getExperimentId(), trialJobId);
const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials-local', trialJobId);
const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials', trialJobId);
let frameworkcontrollerJobName: string = `nniexp${this.experimentId}trial${trialJobId}`.toLowerCase();

let frameworkcontrollerJobConfig: any;
Expand Down Expand Up @@ -204,6 +204,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
let namespace: string | undefined;
this.fcClusterConfig = FrameworkControllerClusterConfigFactory
.generateFrameworkControllerClusterConfig(frameworkcontrollerClusterJsonObject);
this.genericK8sClient.setNamespace = this.fcClusterConfig.namespace ?? "default";
if (this.fcClusterConfig.storageType === 'azureStorage') {
const azureFrameworkControllerClusterConfig: FrameworkControllerClusterConfigAzure =
<FrameworkControllerClusterConfigAzure>this.fcClusterConfig;
Expand Down Expand Up @@ -346,8 +347,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
for (const taskRole of configTaskRoles) {
const runScriptContent: string =
await this.generateRunScript('frameworkcontroller', trialJobId, trialWorkingFolder,
this.generateCommandScript(configTaskRoles, taskRole.command), form.sequenceId.toString(),
taskRole.name, taskRole.gpuNum ? taskRole.gpuNum : 0);
this.generateCommandScript(configTaskRoles, taskRole.command),
form.sequenceId.toString(), taskRole.name, taskRole.gpuNum ? taskRole.gpuNum : 0);
await fs.promises.writeFile(path.join(trialLocalTempFolder, `run_${taskRole.name}.sh`), runScriptContent, {encoding: 'utf8'});
}

Expand Down Expand Up @@ -439,7 +440,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
kind: 'Framework',
metadata: {
name: frameworkcontrollerJobName,
namespace: this.fcClusterConfig.namespace ? this.fcClusterConfig.namespace : "default",
namespace: this.fcClusterConfig.namespace ?? "default",
labels: {
app: this.NNI_KUBERNETES_TRIAL_LABEL,
expId: getExperimentId(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class TFOperatorClientV1Alpha2 extends KubernetesCRDClient {
}

protected get operator(): any {
return this.client.apis['kubeflow.org'].v1alpha2.namespaces('default').tfjobs;
return this.client.apis['kubeflow.org'].v1alpha2.namespaces(this.namespace).tfjobs;
}

public get containerName(): string {
Expand All @@ -36,7 +36,7 @@ class TFOperatorClientV1Beta1 extends KubernetesCRDClient {
}

protected get operator(): any {
return this.client.apis['kubeflow.org'].v1beta1.namespaces('default').tfjobs;
return this.client.apis['kubeflow.org'].v1beta1.namespaces(this.namespace).tfjobs;
}

public get containerName(): string {
Expand All @@ -55,7 +55,7 @@ class TFOperatorClientV1Beta2 extends KubernetesCRDClient {
}

protected get operator(): any {
return this.client.apis['kubeflow.org'].v1beta2.namespaces('default').tfjobs;
return this.client.apis['kubeflow.org'].v1beta2.namespaces(this.namespace).tfjobs;
}

public get containerName(): string {
Expand All @@ -74,7 +74,7 @@ class TFOperatorClientV1 extends KubernetesCRDClient {
}

protected get operator(): any {
return this.client.apis['kubeflow.org'].v1.namespaces('default').tfjobs;
return this.client.apis['kubeflow.org'].v1.namespaces(this.namespace).tfjobs;
}

public get containerName(): string {
Expand All @@ -92,7 +92,7 @@ class PyTorchOperatorClientV1 extends KubernetesCRDClient {
}

protected get operator(): any {
return this.client.apis['kubeflow.org'].v1.namespaces('default').pytorchjobs;
return this.client.apis['kubeflow.org'].v1.namespaces(this.namespace).pytorchjobs;
}

public get containerName(): string {
Expand All @@ -110,7 +110,7 @@ class PyTorchOperatorClientV1Alpha2 extends KubernetesCRDClient {
}

protected get operator(): any {
return this.client.apis['kubeflow.org'].v1alpha2.namespaces('default').pytorchjobs;
return this.client.apis['kubeflow.org'].v1alpha2.namespaces(this.namespace).pytorchjobs;
}

public get containerName(): string {
Expand All @@ -129,7 +129,7 @@ class PyTorchOperatorClientV1Beta1 extends KubernetesCRDClient {
}

protected get operator(): any {
return this.client.apis['kubeflow.org'].v1beta1.namespaces('default').pytorchjobs;
return this.client.apis['kubeflow.org'].v1beta1.namespaces(this.namespace).pytorchjobs;
}

public get containerName(): string {
Expand All @@ -148,7 +148,7 @@ class PyTorchOperatorClientV1Beta2 extends KubernetesCRDClient {
}

protected get operator(): any {
return this.client.apis['kubeflow.org'].v1beta2.namespaces('default').pytorchjobs;
return this.client.apis['kubeflow.org'].v1beta2.namespaces(this.namespace).pytorchjobs;
}

public get containerName(): string {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ export type OperatorApiVersion = 'v1alpha2' | 'v1beta1' | 'v1beta2' | 'v1';
*/
export class KubeflowClusterConfig extends KubernetesClusterConfig {
public readonly operator: KubeflowOperator;
constructor(apiVersion: string, operator: KubeflowOperator) {
super(apiVersion);
constructor(apiVersion: string, operator: KubeflowOperator, namespace?: string) {
super(apiVersion, undefined, namespace);
this.operator = operator;
}
}
Expand All @@ -30,9 +30,10 @@ export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS {
operator: KubeflowOperator,
apiVersion: string,
nfs: NFSConfig,
storage?: KubernetesStorageKind
storage?: KubernetesStorageKind,
namespace?: string
) {
super(apiVersion, nfs, storage);
super(apiVersion, nfs, storage, namespace);
this.operator = operator;
}

Expand All @@ -48,7 +49,8 @@ export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS {
kubeflowClusterConfigObjectNFS.operator,
kubeflowClusterConfigObjectNFS.apiVersion,
kubeflowClusterConfigObjectNFS.nfs,
kubeflowClusterConfigObjectNFS.storage
kubeflowClusterConfigObjectNFS.storage,
kubeflowClusterConfigObjectNFS.namespace
);
}
}
Expand All @@ -61,9 +63,10 @@ export class KubeflowClusterConfigAzure extends KubernetesClusterConfigAzure {
apiVersion: string,
keyVault: KeyVaultConfig,
azureStorage: AzureStorage,
storage?: KubernetesStorageKind
storage?: KubernetesStorageKind,
namespace?: string
) {
super(apiVersion, keyVault, azureStorage, storage);
super(apiVersion, keyVault, azureStorage, storage, undefined, namespace);
this.operator = operator;
}

Expand All @@ -79,7 +82,8 @@ export class KubeflowClusterConfigAzure extends KubernetesClusterConfigAzure {
kubeflowClusterConfigObjectAzure.apiVersion,
kubeflowClusterConfigObjectAzure.keyVault,
kubeflowClusterConfigObjectAzure.azureStorage,
kubeflowClusterConfigObjectAzure.storage
kubeflowClusterConfigObjectAzure.storage,
kubeflowClusterConfigObjectAzure.namespace
);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ export class KubeflowJobRestServer extends KubernetesJobRestServer {
/**
* constructor to provide NNIRestServer's own rest property, e.g. port
*/
constructor() {
super(component.get(KubeflowTrainingService));
constructor(kubeflowTrainingService: KubeflowTrainingService) {
super(kubeflowTrainingService);
}
}
Loading