Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Commit

Permalink
Remove paiYarn mode (#3327)
Browse files Browse the repository at this point in the history
  • Loading branch information
SparkSnail authored Jan 26, 2021
1 parent f11aea0 commit 6f27204
Show file tree
Hide file tree
Showing 21 changed files with 350 additions and 1,469 deletions.
2 changes: 1 addition & 1 deletion nni/runtime/platform/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from .standalone import *
elif trial_env_vars.NNI_PLATFORM == 'unittest':
from .test import *
elif trial_env_vars.NNI_PLATFORM in ('local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn', 'dlts', 'aml', 'adl', 'hybrid'):
elif trial_env_vars.NNI_PLATFORM in ('local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'dlts', 'aml', 'adl', 'hybrid'):
from .local import *
else:
raise RuntimeError('Unknown platform %s' % trial_env_vars.NNI_PLATFORM)
17 changes: 2 additions & 15 deletions nni/tools/nnictl/config_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def validate(self, data):
Optional('maxExecDuration'): And(Regex(r'^[1-9][0-9]*[s|m|h|d]$', error='ERROR: maxExecDuration format is [digit]{s,m,h,d}')),
Optional('maxTrialNum'): setNumberRange('maxTrialNum', int, 1, 99999),
'trainingServicePlatform': setChoice(
'trainingServicePlatform', 'remote', 'local', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn', 'dlts', 'aml', 'adl', 'hybrid'),
'trainingServicePlatform', 'remote', 'local', 'pai', 'kubeflow', 'frameworkcontroller', 'dlts', 'aml', 'adl', 'hybrid'),
Optional('searchSpacePath'): And(os.path.exists, error=SCHEMA_PATH_ERROR % 'searchSpacePath'),
Optional('multiPhase'): setType('multiPhase', bool),
Optional('multiThread'): setType('multiThread', bool),
Expand Down Expand Up @@ -178,18 +178,6 @@ def validate(self, data):
}
}

pai_yarn_config_schema = {
'paiYarnConfig': Or({
'userName': setType('userName', str),
'passWord': setType('passWord', str),
'host': setType('host', str)
}, {
'userName': setType('userName', str),
'token': setType('token', str),
'host': setType('host', str)
})
}


pai_trial_schema = {
'trial': {
Expand Down Expand Up @@ -456,7 +444,6 @@ def validate(self, data):
'local': Schema({**common_schema, **common_trial_schema}),
'remote': Schema({**common_schema, **common_trial_schema, **machine_list_schema, **remote_config_schema}),
'pai': Schema({**common_schema, **pai_trial_schema, **pai_config_schema}),
'paiYarn': Schema({**common_schema, **pai_yarn_trial_schema, **pai_yarn_config_schema}),
'kubeflow': Schema({**common_schema, **kubeflow_trial_schema, **kubeflow_config_schema}),
'frameworkcontroller': Schema({**common_schema, **frameworkcontroller_trial_schema, **frameworkcontroller_config_schema}),
'aml': Schema({**common_schema, **aml_trial_schema, **aml_config_schema}),
Expand Down Expand Up @@ -569,7 +556,7 @@ def validate_pai_config_path(self, experiment_config):

def validate_pai_trial_conifg(self, experiment_config):
'''validate the trial config in pai platform'''
if experiment_config.get('trainingServicePlatform') in ['pai', 'paiYarn']:
if experiment_config.get('trainingServicePlatform') in ['pai']:
if experiment_config.get('trial').get('shmMB') and \
experiment_config['trial']['shmMB'] > experiment_config['trial']['memoryMB']:
raise SchemaError('shmMB should be no more than memoryMB!')
Expand Down
26 changes: 0 additions & 26 deletions nni/tools/nnictl/launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,25 +205,6 @@ def set_pai_config(experiment_config, port, config_file_name):
#set trial_config
return set_trial_config(experiment_config, port, config_file_name), err_message

def set_pai_yarn_config(experiment_config, port, config_file_name):
'''set paiYarn configuration'''
pai_yarn_config_data = dict()
pai_yarn_config_data['pai_yarn_config'] = experiment_config['paiYarnConfig']
response = rest_put(cluster_metadata_url(port), json.dumps(pai_yarn_config_data), REST_TIME_OUT)
err_message = None
if not response or not response.status_code == 200:
if response is not None:
err_message = response.text
_, stderr_full_path = get_log_path(config_file_name)
with open(stderr_full_path, 'a+') as fout:
fout.write(json.dumps(json.loads(err_message), indent=4, sort_keys=True, separators=(',', ':')))
return False, err_message
result, message = setNNIManagerIp(experiment_config, port, config_file_name)
if not result:
return result, message
#set trial_config
return set_trial_config(experiment_config, port, config_file_name), err_message

def set_kubeflow_config(experiment_config, port, config_file_name):
'''set kubeflow configuration'''
kubeflow_config_data = dict()
Expand Down Expand Up @@ -395,11 +376,6 @@ def set_experiment(experiment_config, mode, port, config_file_name):
{'key': 'pai_config', 'value': experiment_config['paiConfig']})
request_data['clusterMetaData'].append(
{'key': 'trial_config', 'value': experiment_config['trial']})
elif experiment_config['trainingServicePlatform'] == 'paiYarn':
request_data['clusterMetaData'].append(
{'key': 'pai_yarn_config', 'value': experiment_config['paiYarnConfig']})
request_data['clusterMetaData'].append(
{'key': 'trial_config', 'value': experiment_config['trial']})
elif experiment_config['trainingServicePlatform'] == 'kubeflow':
request_data['clusterMetaData'].append(
{'key': 'kubeflow_config', 'value': experiment_config['kubeflowConfig']})
Expand Down Expand Up @@ -453,8 +429,6 @@ def set_platform_config(platform, experiment_config, port, config_file_name, res
config_result, err_msg = set_remote_config(experiment_config, port, config_file_name)
elif platform == 'pai':
config_result, err_msg = set_pai_config(experiment_config, port, config_file_name)
elif platform == 'paiYarn':
config_result, err_msg = set_pai_yarn_config(experiment_config, port, config_file_name)
elif platform == 'kubeflow':
config_result, err_msg = set_kubeflow_config(experiment_config, port, config_file_name)
elif platform == 'frameworkcontroller':
Expand Down
2 changes: 0 additions & 2 deletions nni/tools/trial_tool/trial_keeper.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,8 +254,6 @@ def _handler(signum, frame):
exit(1)
check_version(args)
try:
if NNI_PLATFORM == 'paiYarn' and is_multi_phase():
fetch_parameter_file(args)
if NNI_PLATFORM == 'adl':
_set_adaptdl_signal_handler()
main_loop(args)
Expand Down
16 changes: 0 additions & 16 deletions test/config/training_service.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,22 +59,6 @@ frameworkcontroller:

local:
trainingServicePlatform: local
paiYarn:
nniManagerIp:
maxExecDuration: 15m
paiYarnConfig:
host:
passWord:
userName:
trainingServicePlatform: paiYarn
trial:
gpuNum: 1
cpuNum: 1
dataDir:
image:
memoryMB: 8192
outputDir:
virtualCluster:
pai:
nniManagerIp:
maxExecDuration: 15m
Expand Down
15 changes: 0 additions & 15 deletions test/nni_test/nnitest/generate_ts_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,21 +13,6 @@ def update_training_service_config(args):
config = get_yml_content(TRAINING_SERVICE_FILE)
if args.nni_manager_ip is not None:
config[args.ts]['nniManagerIp'] = args.nni_manager_ip
if args.ts == 'paiYarn':
if args.pai_user is not None:
config[args.ts]['paiYarnConfig']['userName'] = args.pai_user
if args.pai_pwd is not None:
config[args.ts]['paiYarnConfig']['passWord'] = args.pai_pwd
if args.pai_host is not None:
config[args.ts]['paiYarnConfig']['host'] = args.pai_host
if args.nni_docker_image is not None:
config[args.ts]['trial']['image'] = args.nni_docker_image
if args.data_dir is not None:
config[args.ts]['trial']['dataDir'] = args.data_dir
if args.output_dir is not None:
config[args.ts]['trial']['outputDir'] = args.output_dir
if args.vc is not None:
config[args.ts]['trial']['virtualCluster'] = args.vc
if args.ts == 'pai':
if args.pai_user is not None:
config[args.ts]['paiConfig']['userName'] = args.pai_user
Expand Down
9 changes: 2 additions & 7 deletions ts/nni_manager/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ import { AdlTrainingService } from './training_service/kubernetes/adl/adlTrainin
import { KubeflowTrainingService } from './training_service/kubernetes/kubeflow/kubeflowTrainingService';
import { LocalTrainingService } from './training_service/local/localTrainingService';
import { RouterTrainingService } from './training_service/reusable/routerTrainingService';
import { PAIYarnTrainingService } from './training_service/pai/paiYarn/paiYarnTrainingService';
import { DLTSTrainingService } from './training_service/dlts/dltsTrainingService';


Expand All @@ -46,10 +45,6 @@ async function initContainer(foreground: boolean, platformMode: string, logFileN
Container.bind(TrainingService)
.to(LocalTrainingService)
.scope(Scope.Singleton);
} else if (platformMode === 'paiYarn') {
Container.bind(TrainingService)
.to(PAIYarnTrainingService)
.scope(Scope.Singleton);
} else if (platformMode === 'kubeflow') {
Container.bind(TrainingService)
.to(KubeflowTrainingService)
Expand Down Expand Up @@ -97,7 +92,7 @@ async function initContainer(foreground: boolean, platformMode: string, logFileN

function usage(): void {
console.info('usage: node main.js --port <port> --mode \
<local/remote/pai/kubeflow/frameworkcontroller/paiYarn/aml/adl/hybrid> --start_mode <new/resume> --experiment_id <id> --foreground <true/false>');
<local/remote/pai/kubeflow/frameworkcontroller/aml/adl/hybrid> --start_mode <new/resume> --experiment_id <id> --foreground <true/false>');
}

const strPort: string = parseArg(['--port', '-p']);
Expand All @@ -117,7 +112,7 @@ const foreground: boolean = foregroundArg.toLowerCase() === 'true' ? true : fals
const port: number = parseInt(strPort, 10);

const mode: string = parseArg(['--mode', '-m']);
if (!['local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn', 'dlts', 'aml', 'adl', 'hybrid'].includes(mode)) {
if (!['local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'dlts', 'aml', 'adl', 'hybrid'].includes(mode)) {
console.log(`FATAL: unknown mode: ${mode}`);
usage();
process.exit(1);
Expand Down
35 changes: 35 additions & 0 deletions ts/nni_manager/training_service/pai/paiConfig.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
'use strict';

import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';
import {TrialConfig} from '../common/trialConfig';

export class PAIClusterConfig {
public readonly userName: string;
Expand Down Expand Up @@ -71,3 +72,37 @@ export class PAITrialJobDetail implements TrialJobDetail {
this.paiJobDetailUrl = paiJobDetailUrl;
}
}

export const PAI_TRIAL_COMMAND_FORMAT: string =
`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} MULTI_PHASE={5} \
&& NNI_CODE_DIR={6} && mkdir -p $NNI_SYS_DIR/code && cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR/code && sh $NNI_SYS_DIR/install_nni.sh \
&& cd $NNI_SYS_DIR/code && python3 -m nni.tools.trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}' \
--nni_manager_version '{10}' --log_collection '{11}' | tee $NNI_OUTPUT_DIR/trial.log`;

/**
* PAI trial configuration
*/
export class NNIPAITrialConfig extends TrialConfig {
public readonly cpuNum: number;
public readonly memoryMB: number;
public readonly image: string;
public virtualCluster?: string;
public readonly nniManagerNFSMountPath: string;
public readonly containerNFSMountPath: string;
public readonly paiStorageConfigName: string;
public readonly paiConfigPath?: string;

constructor(command: string, codeDir: string, gpuNum: number, cpuNum: number, memoryMB: number,
image: string, nniManagerNFSMountPath: string, containerNFSMountPath: string,
paiStorageConfigName: string, virtualCluster?: string, paiConfigPath?: string) {
super(command, codeDir, gpuNum);
this.cpuNum = cpuNum;
this.memoryMB = memoryMB;
this.image = image;
this.virtualCluster = virtualCluster;
this.nniManagerNFSMountPath = nniManagerNFSMountPath;
this.containerNFSMountPath = containerNFSMountPath;
this.paiStorageConfigName = paiStorageConfigName;
this.paiConfigPath = paiConfigPath;
}
}
33 changes: 0 additions & 33 deletions ts/nni_manager/training_service/pai/paiK8S/paiK8SConfig.ts

This file was deleted.

20 changes: 0 additions & 20 deletions ts/nni_manager/training_service/pai/paiK8S/paiK8SData.ts

This file was deleted.

Loading

0 comments on commit 6f27204

Please sign in to comment.