Skip to content
This repository was archived by the owner on Sep 18, 2024. It is now read-only.

Commit

Permalink
Add maxTrialDuration (#3863)
Browse files Browse the repository at this point in the history
  • Loading branch information
acured authored Jul 11, 2021
1 parent 3943239 commit dde4d86
Show file tree
Hide file tree
Showing 10 changed files with 52 additions and 2 deletions.
Empty file added 2096200
Empty file.
10 changes: 9 additions & 1 deletion docs/en_US/Tutorial/ExperimentConfig.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ This document describes the rules to write the config file, and provides some ex
* `versionCheck <#versioncheck>`__
* `debug <#debug>`__
* `maxTrialNum <#maxtrialnum>`__
* `maxTrialDuration <#maxtrialduration>`__
* `trainingServicePlatform <#trainingserviceplatform>`__
* `searchSpacePath <#searchspacepath>`__
* `useAnnotation <#useannotation>`__
Expand Down Expand Up @@ -254,7 +255,7 @@ maxExecDuration

Optional. String. Default: 999d.

**maxExecDuration** specifies the max duration time of an experiment. The unit of the time is {**s**\ **m**\ , **h**\ , **d**\ }, which means {*seconds*\ , *minutes*\ , *hours*\ , *days*\ }.
**maxExecDuration** specifies the max duration time of an experiment. The unit of the time is {**s**\ , **m**\ , **h**\ , **d**\ }, which means {*seconds*\ , *minutes*\ , *hours*\ , *days*\ }.

Note: The maxExecDuration spec sets the time limit of an experiment, not of a trial job. If the experiment reaches the max duration time, the experiment will not stop, but it cannot submit new trial jobs any more.

Expand All @@ -279,6 +280,13 @@ Optional. Integer between 1 and 99999. Default: 99999.

Specifies the max number of trial jobs created by NNI, including succeeded and failed jobs.

maxTrialDuration
^^^^^^^^^^^^^^^^

Optional. String. Default: 999d.

**maxTrialDuration** specifies the max duration time of each trial job. The unit of the time is {**s**\ , **m**\ , **h**\ , **d**\ }, which means {*seconds*\ , *minutes*\ , *hours*\ , *days*\ }. If a trial job reaches the max duration time, it will be stopped.

trainingServicePlatform
^^^^^^^^^^^^^^^^^^^^^^^

Expand Down
14 changes: 14 additions & 0 deletions docs/en_US/reference/experiment_config.rst
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,20 @@ type: ``Optional[int]``
When the budget runs out, the experiment will stop creating trials but continue to serve WebUI.


maxTrialDuration
---------------------

Limit the duration of each trial job if specified.

type: ``Optional[str]``

format: ``number + s|m|h|d``

examples: ``"10m"``, ``"0.5h"``

When time runs out, the current trial job will stop.


nniManagerIp
------------

Expand Down
2 changes: 2 additions & 0 deletions nni/experiment/config/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ class ExperimentConfig(ConfigBase):
trial_gpu_number: Optional[int] = None # TODO: in openpai cannot be None
max_experiment_duration: Optional[str] = None
max_trial_number: Optional[int] = None
# Duration string such as "10m"; validated with util.parse_time, so it must be a str, not an int.
max_trial_duration: Optional[str] = None
nni_manager_ip: Optional[str] = None
use_annotation: bool = False
debug: bool = False
Expand Down Expand Up @@ -153,6 +154,7 @@ def _validation_rules(self):
'trial_gpu_number': lambda value: value >= 0,
'max_experiment_duration': lambda value: util.parse_time(value) > 0,
'max_trial_number': lambda value: value > 0,
'max_trial_duration': lambda value: util.parse_time(value) > 0,
'log_level': lambda value: value in ["trace", "debug", "info", "warning", "error", "fatal"],
'tuner_gpu_indices': lambda value: all(i >= 0 for i in value) and len(value) == len(set(value)),
'training_service': lambda value: (type(value) is not TrainingServiceConfig, 'cannot be abstract base class')
Expand Down
3 changes: 3 additions & 0 deletions nni/experiment/config/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ def to_v2(v1) -> ExperimentConfig:
if isinstance(v2.max_experiment_duration, (int, float)):
v2.max_experiment_duration = str(v2.max_experiment_duration) + 's'
_move_field(v1, v2, 'maxTrialNum', 'max_trial_number')
_move_field(v1, v2, 'maxTrialDuration', 'max_trial_duration')
if isinstance(v2.max_trial_duration, (int, float)):
v2.max_trial_duration = str(v2.max_trial_duration) + 's'
_move_field(v1, v2, 'searchSpacePath', 'search_space_file')
assert not v1.pop('multiPhase', None), 'Multi-phase is no longer supported'
_deprecate(v1, v2, 'multiThread')
Expand Down
1 change: 1 addition & 0 deletions nni/tools/nnictl/config_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ def validate(self, data):
Optional('description'): setType('description', str),
'trialConcurrency': setNumberRange('trialConcurrency', int, 1, 99999),
Optional('maxExecDuration'): And(Regex(r'^[1-9][0-9]*[s|m|h|d]$', error='ERROR: maxExecDuration format is [digit]{s,m,h,d}')),
Optional('maxTrialDuration'): And(Regex(r'^[1-9][0-9]*[s|m|h|d]$', error='ERROR: maxTrialDuration format is [digit]{s,m,h,d}')),
Optional('maxTrialNum'): setNumberRange('maxTrialNum', int, 1, 99999),
'trainingServicePlatform': setChoice(
'trainingServicePlatform', 'remote', 'local', 'pai', 'kubeflow', 'frameworkcontroller', 'dlts', 'aml', 'adl', 'hybrid'),
Expand Down
1 change: 1 addition & 0 deletions nni/tools/nnictl/launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,7 @@ def set_experiment_v1(experiment_config, mode, port, config_file_name):
request_data['maxExecDuration'] = experiment_config['maxExecDuration']
request_data['maxExperimentDuration'] = str(experiment_config['maxExecDuration']) + 's'
request_data['maxTrialNum'] = experiment_config['maxTrialNum']
request_data['maxTrialDuration'] = experiment_config['maxTrialDuration']
request_data['maxTrialNumber'] = experiment_config['maxTrialNum']
request_data['searchSpace'] = experiment_config.get('searchSpace')
request_data['trainingServicePlatform'] = experiment_config.get('trainingServicePlatform')
Expand Down
4 changes: 4 additions & 0 deletions nni/tools/nnictl/launcher_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,8 @@ def set_default_values(experiment_config):
experiment_config['maxExecDuration'] = '999d'
if experiment_config.get('maxTrialNum') is None:
experiment_config['maxTrialNum'] = 99999
if experiment_config.get('maxTrialDuration') is None:
experiment_config['maxTrialDuration'] = '999d'
if experiment_config['trainingServicePlatform'] == 'remote' or \
experiment_config['trainingServicePlatform'] == 'hybrid' and \
'remote' in experiment_config['hybridConfig']['trainingServicePlatforms']:
Expand All @@ -126,3 +128,5 @@ def validate_all_content(experiment_config, config_path):

if 'maxExecDuration' in experiment_config:
experiment_config['maxExecDuration'] = parse_time(experiment_config['maxExecDuration'])
if 'maxTrialDuration' in experiment_config:
experiment_config['maxTrialDuration'] = parse_time(experiment_config['maxTrialDuration'])
1 change: 1 addition & 0 deletions ts/nni_manager/common/experimentConfig.ts
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ export interface ExperimentConfig {
trialConcurrency: number;
trialGpuNumber?: number;
maxExperimentDuration?: string;
maxTrialDuration?: string;
maxTrialNumber?: number;
nniManagerIp?: string;
//useAnnotation: boolean; // dealed inside nnictl
Expand Down
18 changes: 17 additions & 1 deletion ts/nni_manager/core/nnimanager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,6 @@ class NNIManager implements Manager {
this.log.debug(`dispatcher command: ${dispatcherCommand}`);
const checkpointDir: string = await this.createCheckpointDir();
this.setupTuner(dispatcherCommand, undefined, 'start', checkpointDir);

this.setStatus('RUNNING');
await this.storeExperimentProfile();
this.run().catch((err: Error) => {
Expand Down Expand Up @@ -433,6 +432,11 @@ class NNIManager implements Manager {
return (value === undefined ? Infinity : value);
}

/**
 * Per-trial duration limit taken from the experiment profile.
 *
 * Returns `Infinity` when the profile does not set `maxTrialDuration`;
 * otherwise returns the configured value converted by `toSeconds`.
 */
private get maxTrialDuration(): number {
    const configured = this.experimentProfile.params.maxTrialDuration;
    if (configured === undefined) {
        // No limit configured.
        return Infinity;
    }
    return toSeconds(configured);
}

private async initTrainingService(config: ExperimentConfig): Promise<TrainingService> {
let platform: string;
if (Array.isArray(config.trainingService)) {
Expand Down Expand Up @@ -539,6 +543,17 @@ class NNIManager implements Manager {
}
}

/**
 * Cancel a trial job whose max-duration timer has fired.
 *
 * The job is cancelled (flagged as an early stop) only when it is still
 * tracked in `this.trialJobs`, its status is `RUNNING`, and it has a
 * recorded start time; in every other case the call does nothing.
 *
 * A failed cancellation is logged rather than rethrown, so a rejected
 * `cancelTrialJob` cannot surface as an unhandled promise rejection when
 * this method is invoked fire-and-forget.
 *
 * NOTE(review): the elapsed run time is not compared against the limit
 * here; the method relies on being invoked at the right moment — confirm
 * against the scheduling code.
 *
 * @param trialJobId ID of the trial job to stop.
 */
private async stopTrialJobIfOverMaxDurationTimer(trialJobId: string): Promise<void> {
    const trialJobDetail: TrialJobDetail | undefined = this.trialJobs.get(trialJobId);
    if (trialJobDetail === undefined ||
        trialJobDetail.status !== 'RUNNING' ||
        trialJobDetail.startTime === undefined) {
        // Unknown, not running, or never started: nothing to cancel.
        return;
    }
    const isEarlyStopped = true;
    try {
        await this.trainingService.cancelTrialJob(trialJobId, isEarlyStopped);
        this.log.info(`Trial job ${trialJobId} has stopped because it is over maxTrialDuration.`);
    } catch (err) {
        this.log.error(`Failed to stop trial job ${trialJobId} that is over maxTrialDuration: ${err}`);
    }
}

private async requestTrialJobsStatus(): Promise<number> {
let finishedTrialJobNum: number = 0;
if (this.dispatcher === undefined) {
Expand Down Expand Up @@ -662,6 +677,7 @@ class NNIManager implements Manager {
this.currSubmittedTrialNum++;
this.log.info('submitTrialJob: form:', form);
const trialJobDetail: TrialJobDetail = await this.trainingService.submitTrialJob(form);
setTimeout(async ()=> this.stopTrialJobIfOverMaxDurationTimer(trialJobDetail.id), 1000 * this.maxTrialDuration);
const Snapshot: TrialJobDetail = Object.assign({}, trialJobDetail);
await this.storeExperimentProfile();
this.trialJobs.set(trialJobDetail.id, Snapshot);
Expand Down

0 comments on commit dde4d86

Please sign in to comment.