
Commit c6a5f8c

Merge pull request #264 from microsoft/master: merge master
2 parents: aa64fe6 + 143c661

36 files changed: +1771 lines, -280 lines

deployment/docker/Dockerfile (+2, -2)

```diff
@@ -44,9 +44,9 @@ RUN python3 -m pip --no-cache-dir install \
     numpy==1.14.3 scipy==1.1.0

 #
-# Tensorflow 1.10.0
+# Tensorflow 1.15
 #
-RUN python3 -m pip --no-cache-dir install tensorflow-gpu==1.10.0
+RUN python3 -m pip --no-cache-dir install tensorflow-gpu==1.15

 #
 # Keras 2.1.6
```

deployment/docker/README.md (+8, -6)

````diff
@@ -4,15 +4,17 @@ Dockerfile
 This is the Dockerfile of the NNI project. It includes several popular deep learning frameworks and NNI. It is tested on `Ubuntu 16.04 LTS`:

 ```
-CUDA 9.0, CuDNN 7.0
-numpy 1.14.3, scipy 1.1.0
-TensorFlow-gpu 1.10.0
-Keras 2.1.6
-PyTorch 0.4.1
+CUDA 9.0
+CuDNN 7.0
+numpy 1.14.3
+scipy 1.1.0
+tensorflow-gpu 1.15.0
+keras 2.1.6
+torch 1.4.0
 scikit-learn 0.20.0
 pandas 0.23.4
 lightgbm 2.2.2
-NNI v0.7
+nni
 ```
 You can take this Dockerfile as a reference for your own customized Dockerfile.
````

docs/en_US/TrainingService/AMLMode.md (+2, -2)

````diff
@@ -22,8 +22,8 @@ Step 6. Create an AML cluster as the computeTarget.

 Step 7. Open a command line and install AML package environment.
 ```
-python3 -m pip install azureml --user
-python3 -m pip install azureml-sdk --user
+python3 -m pip install azureml
+python3 -m pip install azureml-sdk
 ```

 ## Run an experiment
````

src/nni_manager/common/utils.ts (+4, -3)

```diff
@@ -222,15 +222,16 @@ function getIPV4Address(): string {
         return cachedipv4Address;
     }

-    if (os.networkInterfaces().eth0) {
-        for (const item of os.networkInterfaces().eth0) {
+    const networkInterfaces = os.networkInterfaces();
+    if (networkInterfaces.eth0) {
+        for (const item of networkInterfaces.eth0) {
             if (item.family === 'IPv4') {
                 cachedipv4Address = item.address;
                 return cachedipv4Address;
             }
         }
     } else {
-        throw Error('getIPV4Address() failed because os.networkInterfaces().eth0 is undefined.');
+        throw Error(`getIPV4Address() failed because os.networkInterfaces().eth0 is undefined. Please specify NNI manager IP in config.`);
     }

     throw Error('getIPV4Address() failed because no valid IPv4 address found.')
```
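The fix hoists `os.networkInterfaces()` into a local so the interface table is read once instead of twice, and the error message now tells the user how to work around a missing `eth0`. A minimal standalone sketch of the same lookup pattern (names and the `undefined` fallback are illustrative, not the NNI source):

```typescript
import * as os from 'os';

// Read the interface table once, then scan the named interface for an
// IPv4 entry. Returns undefined instead of throwing; the caller decides
// whether that is fatal or whether a manually configured IP takes over.
function firstIPv4(interfaceName: string = 'eth0'): string | undefined {
    const interfaces = os.networkInterfaces();
    const entries = interfaces[interfaceName];
    if (entries === undefined) {
        return undefined;
    }
    const match = entries.find((item) => item.family === 'IPv4');
    return match === undefined ? undefined : match.address;
}
```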

src/nni_manager/package.json (+1)

```diff
@@ -39,6 +39,7 @@
     "@types/express": "^4.16.0",
     "@types/glob": "^7.1.1",
     "@types/js-base64": "^2.3.1",
+    "@types/js-yaml": "^3.12.5",
     "@types/mocha": "^5.2.5",
     "@types/node": "10.12.18",
     "@types/request": "^2.47.1",
```

src/nni_manager/rest_server/restValidationSchemas.ts (+5)

```diff
@@ -107,6 +107,11 @@ export namespace ValidationSchemas {
             token: joi.string().min(1),
             host: joi.string().min(1).required(),
             reuse: joi.boolean(),
+            cpuNum: joi.number().min(1),
+            memoryMB: joi.number().min(100),
+            gpuNum: joi.number().min(1),
+            maxTrialNumPerGpu: joi.number(),
+            useActiveGpu: joi.boolean(),
         }),
         kubeflow_config: joi.object({ // eslint-disable-line @typescript-eslint/camelcase
             operator: joi.string().min(1).required(),
```
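The five new optional fields let a PAI cluster config carry resource defaults through REST validation. A hypothetical round trip through such a schema (standalone sketch with the field set copied from the diff; a joi version exposing `schema.validate()` is assumed):

```typescript
import * as joi from 'joi';

// Standalone replica of the relevant part of the extended pai_config schema.
const paiConfigSchema = joi.object({
    host: joi.string().min(1).required(),
    token: joi.string().min(1),
    reuse: joi.boolean(),
    cpuNum: joi.number().min(1),
    memoryMB: joi.number().min(100),
    gpuNum: joi.number().min(1),
    maxTrialNumPerGpu: joi.number(),
    useActiveGpu: joi.boolean(),
});

// The new resource fields validate cleanly alongside the existing ones.
const { error } = paiConfigSchema.validate({
    host: '10.0.0.1',
    cpuNum: 4,
    memoryMB: 8192,
    gpuNum: 1,
});
console.log(error);  // undefined

// A value below the declared minimum is rejected.
console.log(paiConfigSchema.validate({ host: 'x', memoryMB: 50 }).error !== undefined);  // true
```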

src/nni_manager/training_service/common/gpuData.ts (+24)

```diff
@@ -3,6 +3,17 @@

 'use strict';

+export enum ScheduleResultType {
+    // Schedule succeeded
+    SUCCEED,
+
+    // Temporarily, not enough available GPUs right now
+    TMP_NO_AVAILABLE_GPU,
+
+    // Cannot match the requirement even if all GPUs are available
+    REQUIRE_EXCEED_TOTAL
+}
+
 /**
  * GPU Information class
  * Representing the dynamic and static information retrieved from Nvidia-smi
@@ -52,6 +63,19 @@ export class GPUSummary {
     }
 }

+
+export function parseGpuIndices(gpuIndices?: string): Set<number> | undefined {
+    if (gpuIndices !== undefined) {
+        const indices: number[] = gpuIndices.split(',')
+            .map((x: string) => parseInt(x, 10));
+        if (indices.length > 0) {
+            return new Set(indices);
+        } else {
+            throw new Error('gpuIndices can not be empty if specified.');
+        }
+    }
+}
+
 export const GPU_INFO_COLLECTOR_FORMAT_WINDOWS: string =
 `
 $env:METRIC_OUTPUT_DIR="{0}"
```
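Moving `ScheduleResultType` and `parseGpuIndices` into the shared `gpuData` module lets both the remote-machine and PAI training services import them without depending on each other. A small usage sketch (import path assumed relative to a training service file):

```typescript
import { parseGpuIndices, ScheduleResultType } from '../common/gpuData';

// A comma-separated list restricts scheduling to those GPU indices.
const allowed = parseGpuIndices('0,1,3');        // Set { 0, 1, 3 }

// Undefined input means no restriction was configured.
const unrestricted = parseGpuIndices(undefined); // undefined

// Scheduling outcomes can now be branched on the shared enum anywhere.
function describe(result: ScheduleResultType): string {
    switch (result) {
        case ScheduleResultType.SUCCEED:
            return 'trial scheduled';
        case ScheduleResultType.TMP_NO_AVAILABLE_GPU:
            return 'no free GPU right now, retry later';
        default:
            return 'request exceeds the total GPUs in the cluster';
    }
}
```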

src/nni_manager/training_service/common/trialConfig.ts (+4)

```diff
@@ -17,6 +17,10 @@ export class TrialConfig {
     // Required GPU number for trial job. The number should be in [0,100]
     public readonly gpuNum: number;

+    // This flag is used for unit tests for now.
+    // In the future, all environments should be reusable, and this can be made configurable by the user.
+    public reuseEnvironment: boolean | undefined = true;
+
     /**
      * Constructor
      * @param command Trial command
```

src/nni_manager/training_service/pai/paiConfig.ts (+16, -3)

```diff
@@ -3,7 +3,7 @@

 'use strict';

-import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';
+import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';

 export class PAIClusterConfig {
     public readonly userName: string;
@@ -12,6 +12,13 @@ export class PAIClusterConfig {
     public readonly token?: string;
     public readonly reuse?: boolean;

+    public cpuNum?: number;
+    public memoryMB?: number;
+    public gpuNum?: number;
+
+    public useActiveGpu?: boolean;
+    public maxTrialNumPerGpu?: number;
+
     /**
      * Constructor
      * @param userName User name of PAI Cluster
@@ -20,12 +27,16 @@ export class PAIClusterConfig {
      * @param token PAI token of PAI Cluster
      * @param reuse If job is reusable for multiple trials
      */
-    constructor(userName: string, host: string, passWord?: string, token?: string, reuse?: boolean) {
+    constructor(userName: string, host: string, passWord?: string, token?: string, reuse?: boolean,
+        cpuNum?: number, memoryMB?: number, gpuNum?: number) {
         this.userName = userName;
         this.passWord = passWord;
         this.host = host;
         this.token = token;
         this.reuse = reuse;
+        this.cpuNum = cpuNum;
+        this.memoryMB = memoryMB;
+        this.gpuNum = gpuNum;
     }
 }

@@ -45,9 +56,10 @@ export class PAITrialJobDetail implements TrialJobDetail {
     public form: TrialJobApplicationForm;
     public logPath: string;
     public isEarlyStopped?: boolean;
+    public paiJobDetailUrl?: string;

     constructor(id: string, status: TrialJobStatus, paiJobName: string,
-        submitTime: number, workingDirectory: string, form: TrialJobApplicationForm, logPath: string) {
+        submitTime: number, workingDirectory: string, form: TrialJobApplicationForm, logPath: string, paiJobDetailUrl?: string) {
         this.id = id;
         this.status = status;
         this.paiJobName = paiJobName;
@@ -56,5 +68,6 @@ export class PAITrialJobDetail implements TrialJobDetail {
         this.form = form;
         this.tags = [];
         this.logPath = logPath;
+        this.paiJobDetailUrl = paiJobDetailUrl;
     }
 }
```

(The import change in the first hunk appears to be whitespace-only; the line text is unchanged.)
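The constructor keeps its original five parameters and appends the optional resource settings, so existing call sites stay valid. A hypothetical call site (values invented; import path assumed):

```typescript
import { PAIClusterConfig } from './paiConfig';

// Token-based login with explicit per-trial resource defaults.
const config = new PAIClusterConfig(
    'alice',            // userName
    '10.0.0.1',         // host
    undefined,          // passWord (token auth used instead)
    'my-pai-token',     // token
    true,               // reuse
    4,                  // cpuNum
    8192,               // memoryMB
    1                   // gpuNum
);
```

Note that `useActiveGpu` and `maxTrialNumPerGpu` are not constructor parameters; they are plain public fields assigned after construction.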

src/nni_manager/training_service/pai/paiJobInfoCollector.ts (+1, -1)

```diff
@@ -84,7 +84,7 @@ export class PAIJobInfoCollector {
                 if (response.body.jobStatus.appTrackingUrl) {
                     paiTrialJob.url = response.body.jobStatus.appTrackingUrl;
                 } else {
-                    paiTrialJob.url = paiTrialJob.logPath;
+                    paiTrialJob.url = paiTrialJob.paiJobDetailUrl;
                 }
             }
             break;
```

src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts (+3, -1)

```diff
@@ -124,14 +124,16 @@ class PAIK8STrainingService extends PAITrainingService {
         const trialWorkingFolder: string = path.join(this.expRootDir, 'trials', trialJobId);
         const paiJobName: string = `nni_exp_${this.experimentId}_trial_${trialJobId}`;
         const logPath: string = path.join(this.paiTrialConfig.nniManagerNFSMountPath, this.experimentId, trialJobId);
+        const paiJobDetailUrl: string = `${this.protocol}://${this.paiClusterConfig.host}/job-detail.html?username=${this.paiClusterConfig.userName}&jobName=${paiJobName}`;
         const trialJobDetail: PAITrialJobDetail = new PAITrialJobDetail(
             trialJobId,
             'WAITING',
             paiJobName,
             Date.now(),
             trialWorkingFolder,
             form,
-            logPath);
+            logPath,
+            paiJobDetailUrl);

         this.trialJobsMap.set(trialJobId, trialJobDetail);
         this.jobQueue.push(trialJobId);
```
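Building the URL at submission time gives the collector in `paiJobInfoCollector.ts` above a clickable job-detail page to fall back on, rather than a bare NFS log path. For illustration only (all values invented), the template resolves to:

```typescript
const protocol = 'https';
const host = 'pai.example.com';
const userName = 'alice';
const paiJobName = 'nni_exp_EXP01_trial_Ab3Cd';

// Same template string as in the diff, with invented values.
const paiJobDetailUrl = `${protocol}://${host}/job-detail.html?username=${userName}&jobName=${paiJobName}`;
// -> "https://pai.example.com/job-detail.html?username=alice&jobName=nni_exp_EXP01_trial_Ab3Cd"
```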

src/nni_manager/training_service/remote_machine/gpuScheduler.ts (+12, -14)

```diff
@@ -6,10 +6,8 @@
 import * as assert from 'assert';
 import { getLogger, Logger } from '../../common/log';
 import { randomSelect } from '../../common/utils';
-import { GPUInfo } from '../common/gpuData';
-import {
-    parseGpuIndices, RemoteMachineMeta, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail, ScheduleResultType, ExecutorManager
-} from './remoteMachineData';
+import { GPUInfo, parseGpuIndices, ScheduleResultType } from '../common/gpuData';
+import { ExecutorManager, RemoteMachineMeta, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail } from './remoteMachineData';

 type SCHEDULE_POLICY_NAME = 'random' | 'round-robin';

@@ -39,7 +37,7 @@ export class GPUScheduler {
      * @param requiredGPUNum required GPU number
      */
     public scheduleMachine(requiredGPUNum: number | undefined, trialJobDetail: RemoteMachineTrialJobDetail): RemoteMachineScheduleResult {
-        if(requiredGPUNum === undefined) {
+        if (requiredGPUNum === undefined) {
             requiredGPUNum = 0;
         }
         assert(requiredGPUNum >= 0);
@@ -48,7 +46,7 @@ export class GPUScheduler {

         // Step 1: Check if required GPU number not exceeds the total GPU number in all machines
         const eligibleRM: RemoteMachineMeta[] = allRMs.filter((rmMeta: RemoteMachineMeta) =>
-            rmMeta.gpuSummary === undefined || requiredGPUNum === 0 || (requiredGPUNum !== undefined && rmMeta.gpuSummary.gpuCount >= requiredGPUNum));
+                rmMeta.gpuSummary === undefined || requiredGPUNum === 0 || (requiredGPUNum !== undefined && rmMeta.gpuSummary.gpuCount >= requiredGPUNum));
         if (eligibleRM.length === 0) {
             // If the required gpu number exceeds the upper limit of all machine's GPU number
             // Return REQUIRE_EXCEED_TOTAL directly
@@ -75,8 +73,8 @@ export class GPUScheduler {
         this.log.warning(`Scheduler: trialJob id ${trialJobDetail.id}, no machine can be scheduled, return TMP_NO_AVAILABLE_GPU `);

         return {
-            resultType : ScheduleResultType.TMP_NO_AVAILABLE_GPU,
-            scheduleInfo : undefined
+            resultType: ScheduleResultType.TMP_NO_AVAILABLE_GPU,
+            scheduleInfo: undefined
         };
     }
@@ -159,7 +157,7 @@ export class GPUScheduler {
                 const num: number | undefined = rmMeta.occupiedGpuIndexMap.get(gpuInfo.index);
                 const maxTrialNumPerGpu: number = rmMeta.maxTrialNumPerGpu ? rmMeta.maxTrialNumPerGpu : 1;
                 if ((num === undefined && (!rmMeta.useActiveGpu && gpuInfo.activeProcessNum === 0 || rmMeta.useActiveGpu)) ||
-                    (num !== undefined && num < maxTrialNumPerGpu)) {
+                        (num !== undefined && num < maxTrialNumPerGpu)) {
                     availableGPUs.push(gpuInfo);
                 }
             } else {
@@ -200,7 +198,7 @@ export class GPUScheduler {
     }

     private allocateHost(requiredGPUNum: number, rmMeta: RemoteMachineMeta,
-        gpuInfos: GPUInfo[], trialJobDetail: RemoteMachineTrialJobDetail): RemoteMachineScheduleResult {
+                         gpuInfos: GPUInfo[], trialJobDetail: RemoteMachineTrialJobDetail): RemoteMachineScheduleResult {
         assert(gpuInfos.length >= requiredGPUNum);
         const allocatedGPUs: GPUInfo[] = this.selectGPUsForTrial(gpuInfos, requiredGPUNum);
         allocatedGPUs.forEach((gpuInfo: GPUInfo) => {
@@ -222,10 +220,10 @@ export class GPUScheduler {
             scheduleInfo: {
                 rmMeta: rmMeta,
                 cudaVisibleDevice: allocatedGPUs
-                    .map((gpuInfo: GPUInfo) => {
-                        return gpuInfo.index;
-                    })
-                    .join(',')
+                        .map((gpuInfo: GPUInfo) => {
+                            return gpuInfo.index;
+                        })
+                        .join(',')
             }
         };
     }
```
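Most of this file is import cleanup and mechanical re-indentation, but the availability test in the middle hunk is the heart of the scheduler. A paraphrase of that condition as a standalone predicate (not a new API, just the same boolean logic extracted for readability):

```typescript
// A GPU is schedulable when either no trial occupies it yet and it is idle
// (or busy GPUs are explicitly allowed via useActiveGpu), or the trials
// already placed on it stay below maxTrialNumPerGpu.
function isGpuAvailable(occupiedTrials: number | undefined,
                        activeProcessNum: number,
                        useActiveGpu: boolean,
                        maxTrialNumPerGpu: number): boolean {
    if (occupiedTrials === undefined) {
        return useActiveGpu || activeProcessNum === 0;
    }
    return occupiedTrials < maxTrialNumPerGpu;
}
```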

src/nni_manager/training_service/remote_machine/remoteMachineData.ts (+1, -24)

```diff
@@ -4,7 +4,7 @@
 'use strict';

 import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';
-import { GPUInfo, GPUSummary } from '../common/gpuData';
+import { GPUInfo, GPUSummary, ScheduleResultType } from '../common/gpuData';
 import { ShellExecutor } from './shellExecutor';

 /**
@@ -25,18 +25,6 @@ export class RemoteMachineMeta {
     public readonly useActiveGpu?: boolean = false;
 }

-export function parseGpuIndices(gpuIndices?: string): Set<number> | undefined {
-    if (gpuIndices !== undefined) {
-        const indices: number[] = gpuIndices.split(',')
-            .map((x: string) => parseInt(x, 10));
-        if (indices.length > 0) {
-            return new Set(indices);
-        } else {
-            throw new Error('gpuIndices can not be empty if specified.');
-        }
-    }
-}
-
 /**
  * The execution result for command executed on remote machine
  */
@@ -168,14 +156,3 @@ export class ExecutorManager {
 export type RemoteMachineScheduleResult = { scheduleInfo: RemoteMachineScheduleInfo | undefined; resultType: ScheduleResultType };

 export type RemoteMachineScheduleInfo = { rmMeta: RemoteMachineMeta; cudaVisibleDevice: string };
-
-export enum ScheduleResultType {
-    // Schedule succeeded
-    SUCCEED,
-
-    // Temporarily, not enough available GPUs right now
-    TMP_NO_AVAILABLE_GPU,
-
-    // Cannot match the requirement even if all GPUs are available
-    REQUIRE_EXCEED_TOTAL
-}
```

src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts (+4, -5)

```diff
@@ -7,6 +7,7 @@ import * as assert from 'assert';
 import { EventEmitter } from 'events';
 import * as fs from 'fs';
 import * as path from 'path';
+import { ShellExecutor } from 'training_service/remote_machine/shellExecutor';
 import { Deferred } from 'ts-deferred';
 import * as component from '../../common/component';
 import { NNIError, NNIErrorNames } from '../../common/errors';
@@ -22,18 +23,16 @@ import {
     getVersion, uniqueString
 } from '../../common/utils';
 import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData';
-import { GPUSummary } from '../common/gpuData';
+import { GPUSummary, ScheduleResultType } from '../common/gpuData';
 import { TrialConfig } from '../common/trialConfig';
 import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
 import { execMkdir, validateCodeDir } from '../common/util';
 import { GPUScheduler } from './gpuScheduler';
 import {
-    RemoteMachineMeta,
-    RemoteMachineScheduleInfo, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail,
-    ScheduleResultType, ExecutorManager
+    ExecutorManager, RemoteMachineMeta,
+    RemoteMachineScheduleInfo, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail
 } from './remoteMachineData';
 import { RemoteMachineJobRestServer } from './remoteMachineJobRestServer';
-import { ShellExecutor } from 'training_service/remote_machine/shellExecutor';

 /**
  * Training Service implementation for Remote Machine (Linux)
```
