Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Remove paiYarn mode #3327

Merged
merged 40 commits into from
Jan 26, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
dcd2ffd
Merge pull request #251 from microsoft/master
SparkSnail May 29, 2020
3b8b6fb
Merge pull request #252 from microsoft/master
SparkSnail Jun 7, 2020
916e444
Merge pull request #253 from microsoft/master
SparkSnail Jun 15, 2020
caeffb8
Merge pull request #254 from microsoft/master
SparkSnail Jun 17, 2020
57c300e
Merge pull request #255 from microsoft/master
SparkSnail Jun 28, 2020
65660e6
Merge pull request #257 from microsoft/master
SparkSnail Jun 30, 2020
9376d6a
Merge pull request #258 from microsoft/master
SparkSnail Jul 1, 2020
5fef3cf
Merge pull request #259 from microsoft/master
SparkSnail Jul 3, 2020
5544ae8
Merge pull request #261 from microsoft/master
SparkSnail Jul 10, 2020
f9fdfee
Merge pull request #262 from microsoft/master
SparkSnail Jul 16, 2020
c5e26ef
add trial job detail link
SparkSnail Jul 19, 2020
10a04ba
Merge branch 'master' of https://github.com/SparkSnail/nni
SparkSnail Jul 23, 2020
aa64fe6
Merge pull request #263 from microsoft/master
SparkSnail Jul 27, 2020
4ed907f
Merge branch 'master' of https://github.com/SparkSnail/nni
SparkSnail Jul 27, 2020
c6a5f8c
Merge pull request #264 from microsoft/master
SparkSnail Jul 31, 2020
68abe2f
Merge pull request #265 from microsoft/master
SparkSnail Aug 4, 2020
c2b50d2
Merge branch 'master' of https://github.com/SparkSnail/nni
SparkSnail Aug 6, 2020
14e9619
Merge pull request #266 from microsoft/master
SparkSnail Aug 13, 2020
f69e206
Merge pull request #267 from microsoft/master
SparkSnail Aug 13, 2020
a5bb753
Merge branch 'master' of https://github.com/SparkSnail/nni
SparkSnail Aug 21, 2020
12ef0aa
Merge pull request #270 from microsoft/master
SparkSnail Sep 10, 2020
7600a0f
Merge branch 'master' of https://github.com/SparkSnail/nni
SparkSnail Sep 10, 2020
ddcf229
Merge pull request #271 from microsoft/master
SparkSnail Sep 15, 2020
bd327d4
Merge branch 'master' of https://github.com/SparkSnail/nni
SparkSnail Sep 15, 2020
c4f6e66
Merge pull request #272 from microsoft/master
SparkSnail Sep 21, 2020
da2d1c4
Merge branch 'master' of https://github.com/SparkSnail/nni
SparkSnail Sep 21, 2020
88f8c1b
Merge pull request #273 from microsoft/master
SparkSnail Sep 22, 2020
b59d0e2
Merge branch 'master' of https://github.com/SparkSnail/nni
SparkSnail Sep 22, 2020
7eb15f8
Merge pull request #274 from microsoft/master
SparkSnail Oct 27, 2020
f73367f
Merge pull request #275 from microsoft/master
SparkSnail Nov 16, 2020
765bc33
Merge pull request #276 from microsoft/master
SparkSnail Nov 29, 2020
cff51cc
Merge pull request #277 from microsoft/master
SparkSnail Dec 2, 2020
4232fea
Merge pull request #278 from microsoft/master
SparkSnail Dec 8, 2020
cb9efcc
Merge pull request #279 from microsoft/master
SparkSnail Dec 11, 2020
ee71f16
Merge pull request #280 from microsoft/master
SparkSnail Dec 14, 2020
c3921ed
Merge pull request #281 from microsoft/master
SparkSnail Dec 24, 2020
2acbee3
Merge branch 'master' of https://github.com/SparkSnail/nni
SparkSnail Dec 29, 2020
561f1ad
Merge pull request #284 from microsoft/master
SparkSnail Jan 22, 2021
21afa8c
Merge branch 'master' of https://github.com/SparkSnail/nni
SparkSnail Jan 22, 2021
0809c88
init
SparkSnail Jan 22, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion nni/runtime/platform/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from .standalone import *
elif trial_env_vars.NNI_PLATFORM == 'unittest':
from .test import *
elif trial_env_vars.NNI_PLATFORM in ('local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn', 'dlts', 'aml', 'adl', 'hybrid'):
elif trial_env_vars.NNI_PLATFORM in ('local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'dlts', 'aml', 'adl', 'hybrid'):
from .local import *
else:
raise RuntimeError('Unknown platform %s' % trial_env_vars.NNI_PLATFORM)
17 changes: 2 additions & 15 deletions nni/tools/nnictl/config_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def validate(self, data):
Optional('maxExecDuration'): And(Regex(r'^[1-9][0-9]*[s|m|h|d]$', error='ERROR: maxExecDuration format is [digit]{s,m,h,d}')),
Optional('maxTrialNum'): setNumberRange('maxTrialNum', int, 1, 99999),
'trainingServicePlatform': setChoice(
'trainingServicePlatform', 'remote', 'local', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn', 'dlts', 'aml', 'adl', 'hybrid'),
'trainingServicePlatform', 'remote', 'local', 'pai', 'kubeflow', 'frameworkcontroller', 'dlts', 'aml', 'adl', 'hybrid'),
Optional('searchSpacePath'): And(os.path.exists, error=SCHEMA_PATH_ERROR % 'searchSpacePath'),
Optional('multiPhase'): setType('multiPhase', bool),
Optional('multiThread'): setType('multiThread', bool),
Expand Down Expand Up @@ -178,18 +178,6 @@ def validate(self, data):
}
}

# Schema for the 'paiYarnConfig' section of an experiment config.
# Two alternative shapes are listed under Or(...):
#   1. userName + passWord + host
#   2. userName + token + host
# Every field is declared through setType(<name>, str).
pai_yarn_config_schema = {
'paiYarnConfig': Or({
'userName': setType('userName', str),
'passWord': setType('passWord', str),
'host': setType('host', str)
}, {
'userName': setType('userName', str),
'token': setType('token', str),
'host': setType('host', str)
})
}


pai_trial_schema = {
'trial': {
Expand Down Expand Up @@ -456,7 +444,6 @@ def validate(self, data):
'local': Schema({**common_schema, **common_trial_schema}),
'remote': Schema({**common_schema, **common_trial_schema, **machine_list_schema, **remote_config_schema}),
'pai': Schema({**common_schema, **pai_trial_schema, **pai_config_schema}),
'paiYarn': Schema({**common_schema, **pai_yarn_trial_schema, **pai_yarn_config_schema}),
'kubeflow': Schema({**common_schema, **kubeflow_trial_schema, **kubeflow_config_schema}),
'frameworkcontroller': Schema({**common_schema, **frameworkcontroller_trial_schema, **frameworkcontroller_config_schema}),
'aml': Schema({**common_schema, **aml_trial_schema, **aml_config_schema}),
Expand Down Expand Up @@ -569,7 +556,7 @@ def validate_pai_config_path(self, experiment_config):

def validate_pai_trial_conifg(self, experiment_config):
'''validate the trial config in pai platform'''
if experiment_config.get('trainingServicePlatform') in ['pai', 'paiYarn']:
if experiment_config.get('trainingServicePlatform') in ['pai']:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems this `if` check is no longer needed.

if experiment_config.get('trial').get('shmMB') and \
experiment_config['trial']['shmMB'] > experiment_config['trial']['memoryMB']:
raise SchemaError('shmMB should be no more than memoryMB!')
Expand Down
26 changes: 0 additions & 26 deletions nni/tools/nnictl/launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,25 +205,6 @@ def set_pai_config(experiment_config, port, config_file_name):
#set trial_config
return set_trial_config(experiment_config, port, config_file_name), err_message

def set_pai_yarn_config(experiment_config, port, config_file_name):
    '''Send the paiYarn cluster settings, then the NNI manager IP and the trial config.

    The value of experiment_config['paiYarnConfig'] is wrapped under the key
    'pai_yarn_config', serialized as JSON and PUT to cluster_metadata_url(port).

    Returns a (result, message) tuple:
      * (False, None) when the PUT fails and no response object is available;
      * (False, response.text) when a response is available but the PUT failed;
        in that case the response text is also re-serialized as pretty-printed
        JSON and appended to the stderr log returned by get_log_path();
      * whatever setNNIManagerIp() returned, when its result is falsy;
      * (set_trial_config(...), None) otherwise.
    '''
    response = rest_put(
        cluster_metadata_url(port),
        json.dumps({'pai_yarn_config': experiment_config['paiYarnConfig']}),
        REST_TIME_OUT
    )

    if not response or not response.status_code == 200:
        # No response object at all: nothing to log, nothing to report.
        if response is None:
            return False, None
        failure_text = response.text
        _, stderr_full_path = get_log_path(config_file_name)
        with open(stderr_full_path, 'a+') as log_file:
            pretty = json.dumps(json.loads(failure_text), indent=4, sort_keys=True, separators=(',', ':'))
            log_file.write(pretty)
        return False, failure_text

    ip_ok, ip_message = setNNIManagerIp(experiment_config, port, config_file_name)
    if not ip_ok:
        return ip_ok, ip_message

    # Cluster metadata and manager IP are in place; finish with the trial config.
    return set_trial_config(experiment_config, port, config_file_name), None

def set_kubeflow_config(experiment_config, port, config_file_name):
'''set kubeflow configuration'''
kubeflow_config_data = dict()
Expand Down Expand Up @@ -394,11 +375,6 @@ def set_experiment(experiment_config, mode, port, config_file_name):
{'key': 'pai_config', 'value': experiment_config['paiConfig']})
request_data['clusterMetaData'].append(
{'key': 'trial_config', 'value': experiment_config['trial']})
elif experiment_config['trainingServicePlatform'] == 'paiYarn':
request_data['clusterMetaData'].append(
{'key': 'pai_yarn_config', 'value': experiment_config['paiYarnConfig']})
request_data['clusterMetaData'].append(
{'key': 'trial_config', 'value': experiment_config['trial']})
elif experiment_config['trainingServicePlatform'] == 'kubeflow':
request_data['clusterMetaData'].append(
{'key': 'kubeflow_config', 'value': experiment_config['kubeflowConfig']})
Expand Down Expand Up @@ -452,8 +428,6 @@ def set_platform_config(platform, experiment_config, port, config_file_name, res
config_result, err_msg = set_remote_config(experiment_config, port, config_file_name)
elif platform == 'pai':
config_result, err_msg = set_pai_config(experiment_config, port, config_file_name)
elif platform == 'paiYarn':
config_result, err_msg = set_pai_yarn_config(experiment_config, port, config_file_name)
elif platform == 'kubeflow':
config_result, err_msg = set_kubeflow_config(experiment_config, port, config_file_name)
elif platform == 'frameworkcontroller':
Expand Down
2 changes: 0 additions & 2 deletions nni/tools/trial_tool/trial_keeper.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,8 +254,6 @@ def _handler(signum, frame):
exit(1)
check_version(args)
try:
if NNI_PLATFORM == 'paiYarn' and is_multi_phase():
fetch_parameter_file(args)
if NNI_PLATFORM == 'adl':
_set_adaptdl_signal_handler()
main_loop(args)
Expand Down
16 changes: 0 additions & 16 deletions test/config/training_service.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,22 +59,6 @@ frameworkcontroller:

local:
trainingServicePlatform: local
paiYarn:
nniManagerIp:
maxExecDuration: 15m
paiYarnConfig:
host:
passWord:
userName:
trainingServicePlatform: paiYarn
trial:
gpuNum: 1
cpuNum: 1
dataDir:
image:
memoryMB: 8192
outputDir:
virtualCluster:
pai:
nniManagerIp:
maxExecDuration: 15m
Expand Down
15 changes: 0 additions & 15 deletions test/nni_test/nnitest/generate_ts_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,21 +13,6 @@ def update_training_service_config(args):
config = get_yml_content(TRAINING_SERVICE_FILE)
if args.nni_manager_ip is not None:
config[args.ts]['nniManagerIp'] = args.nni_manager_ip
if args.ts == 'paiYarn':
if args.pai_user is not None:
config[args.ts]['paiYarnConfig']['userName'] = args.pai_user
if args.pai_pwd is not None:
config[args.ts]['paiYarnConfig']['passWord'] = args.pai_pwd
if args.pai_host is not None:
config[args.ts]['paiYarnConfig']['host'] = args.pai_host
if args.nni_docker_image is not None:
config[args.ts]['trial']['image'] = args.nni_docker_image
if args.data_dir is not None:
config[args.ts]['trial']['dataDir'] = args.data_dir
if args.output_dir is not None:
config[args.ts]['trial']['outputDir'] = args.output_dir
if args.vc is not None:
config[args.ts]['trial']['virtualCluster'] = args.vc
if args.ts == 'pai':
if args.pai_user is not None:
config[args.ts]['paiConfig']['userName'] = args.pai_user
Expand Down
9 changes: 2 additions & 7 deletions ts/nni_manager/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ import { AdlTrainingService } from './training_service/kubernetes/adl/adlTrainin
import { KubeflowTrainingService } from './training_service/kubernetes/kubeflow/kubeflowTrainingService';
import { LocalTrainingService } from './training_service/local/localTrainingService';
import { RouterTrainingService } from './training_service/reusable/routerTrainingService';
import { PAIYarnTrainingService } from './training_service/pai/paiYarn/paiYarnTrainingService';
import { DLTSTrainingService } from './training_service/dlts/dltsTrainingService';


Expand All @@ -46,10 +45,6 @@ async function initContainer(foreground: boolean, platformMode: string, logFileN
Container.bind(TrainingService)
.to(LocalTrainingService)
.scope(Scope.Singleton);
} else if (platformMode === 'paiYarn') {
Container.bind(TrainingService)
.to(PAIYarnTrainingService)
.scope(Scope.Singleton);
} else if (platformMode === 'kubeflow') {
Container.bind(TrainingService)
.to(KubeflowTrainingService)
Expand Down Expand Up @@ -97,7 +92,7 @@ async function initContainer(foreground: boolean, platformMode: string, logFileN

function usage(): void {
console.info('usage: node main.js --port <port> --mode \
<local/remote/pai/kubeflow/frameworkcontroller/paiYarn/aml/adl/hybrid> --start_mode <new/resume> --experiment_id <id> --foreground <true/false>');
<local/remote/pai/kubeflow/frameworkcontroller/aml/adl/hybrid> --start_mode <new/resume> --experiment_id <id> --foreground <true/false>');
}

const strPort: string = parseArg(['--port', '-p']);
Expand All @@ -117,7 +112,7 @@ const foreground: boolean = foregroundArg.toLowerCase() === 'true' ? true : fals
const port: number = parseInt(strPort, 10);

const mode: string = parseArg(['--mode', '-m']);
if (!['local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn', 'dlts', 'aml', 'adl', 'hybrid'].includes(mode)) {
if (!['local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'dlts', 'aml', 'adl', 'hybrid'].includes(mode)) {
console.log(`FATAL: unknown mode: ${mode}`);
usage();
process.exit(1);
Expand Down
35 changes: 35 additions & 0 deletions ts/nni_manager/training_service/pai/paiConfig.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
'use strict';

import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';
import {TrialConfig} from '../common/trialConfig';

export class PAIClusterConfig {
public readonly userName: string;
Expand Down Expand Up @@ -71,3 +72,37 @@ export class PAITrialJobDetail implements TrialJobDetail {
this.paiJobDetailUrl = paiJobDetailUrl;
}
}

/**
 * Shell command template for starting a trial with NNI_PLATFORM=pai.
 *
 * Positional placeholders:
 *   {0} NNI_SYS_DIR, {1} NNI_OUTPUT_DIR, {2} NNI_TRIAL_JOB_ID, {3} NNI_EXP_ID,
 *   {4} NNI_TRIAL_SEQ_ID, {5} MULTI_PHASE, {6} NNI_CODE_DIR,
 *   {7} --trial_command, {8} --nnimanager_ip, {9} --nnimanager_port,
 *   {10} --nni_manager_version, {11} --log_collection.
 * NOTE(review): the placeholders are presumably filled in by a format helper
 * elsewhere in the project - not visible from here, confirm at the call site.
 *
 * Steps performed by the command, in order: export the NNI_* variables, set
 * NNI_CODE_DIR (a plain shell assignment, not part of the export), create
 * $NNI_SYS_DIR/code and copy the code directory into it, run install_nni.sh,
 * change into the copied code directory, and run the trial_keeper module with
 * its output piped through tee into $NNI_OUTPUT_DIR/trial.log.
 *
 * The trailing backslashes are template-literal line continuations, so the
 * resulting string contains no newline characters.
 */
export const PAI_TRIAL_COMMAND_FORMAT: string =
`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} MULTI_PHASE={5} \
&& NNI_CODE_DIR={6} && mkdir -p $NNI_SYS_DIR/code && cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR/code && sh $NNI_SYS_DIR/install_nni.sh \
&& cd $NNI_SYS_DIR/code && python3 -m nni.tools.trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}' \
--nni_manager_version '{10}' --log_collection '{11}' | tee $NNI_OUTPUT_DIR/trial.log`;

/**
 * Trial configuration for the PAI training service.
 *
 * Extends TrialConfig: `command`, `codeDir` and `gpuNum` are passed straight
 * to the base constructor, and the PAI-specific values are stored on the
 * fields declared here.
 */
export class NNIPAITrialConfig extends TrialConfig {
    public readonly cpuNum: number;
    public readonly memoryMB: number;
    public readonly image: string;
    /** Optional; the only field not marked readonly, so it may be reassigned after construction. */
    public virtualCluster?: string;
    public readonly nniManagerNFSMountPath: string;
    public readonly containerNFSMountPath: string;
    public readonly paiStorageConfigName: string;
    /** Optional; stays undefined when the caller omits it. */
    public readonly paiConfigPath?: string;

    /**
     * @param command - forwarded to the TrialConfig constructor
     * @param codeDir - forwarded to the TrialConfig constructor
     * @param gpuNum - forwarded to the TrialConfig constructor
     * @param cpuNum - stored as `cpuNum`
     * @param memoryMB - stored as `memoryMB`
     * @param image - stored as `image`
     * @param nniManagerNFSMountPath - stored as `nniManagerNFSMountPath`
     * @param containerNFSMountPath - stored as `containerNFSMountPath`
     * @param paiStorageConfigName - stored as `paiStorageConfigName`
     * @param virtualCluster - optional, stored as `virtualCluster`
     * @param paiConfigPath - optional, stored as `paiConfigPath`
     */
    constructor(
        command: string,
        codeDir: string,
        gpuNum: number,
        cpuNum: number,
        memoryMB: number,
        image: string,
        nniManagerNFSMountPath: string,
        containerNFSMountPath: string,
        paiStorageConfigName: string,
        virtualCluster?: string,
        paiConfigPath?: string
    ) {
        super(command, codeDir, gpuNum);

        // Resource and image settings.
        this.cpuNum = cpuNum;
        this.memoryMB = memoryMB;
        this.image = image;
        this.virtualCluster = virtualCluster;

        // Storage / mount settings.
        this.nniManagerNFSMountPath = nniManagerNFSMountPath;
        this.containerNFSMountPath = containerNFSMountPath;
        this.paiStorageConfigName = paiStorageConfigName;
        this.paiConfigPath = paiConfigPath;
    }
}
33 changes: 0 additions & 33 deletions ts/nni_manager/training_service/pai/paiK8S/paiK8SConfig.ts

This file was deleted.

20 changes: 0 additions & 20 deletions ts/nni_manager/training_service/pai/paiK8S/paiK8SData.ts

This file was deleted.

Loading