Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Pai training service bug fix and enhancement #136

Merged
merged 81 commits into from
Sep 28, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
81 commits
Select commit Hold shift + click to select a range
9a8ac16
PAI Training service implementation, v1
Sep 19, 2018
8983045
update trial package directory in setup.py
Sep 19, 2018
248d0eb
Update setup.py package info
Sep 19, 2018
43fca76
Update trial keeper module, use IP address for pai training service ma…
Sep 20, 2018
4fe49de
Update metrics file path in reader
Sep 20, 2018
66a54e1
Fix metrics file path issue
Sep 21, 2018
65709d3
Update pai integration, full implementation of pai training service
Sep 21, 2018
c1a3d34
Do not send metrics if it is empty
Sep 21, 2018
232d0e8
Update nnictl, to support pai configuration
Sep 21, 2018
a5d4a20
fix repo
Sep 24, 2018
cd64e5f
add hdfs_output_dir
Sep 24, 2018
889e066
add copy logic
Sep 24, 2018
de9c374
debug
Sep 24, 2018
e98b0ac
update hdfsUtility
Sep 24, 2018
272411a
debug
Sep 24, 2018
4cba4d1
debug
Sep 24, 2018
45d1031
fix setup.py bug
Sep 24, 2018
e63ffc0
fix bug
Sep 24, 2018
954d640
debug
Sep 24, 2018
0410d05
debug
Sep 24, 2018
e3788d2
add exception handler
Sep 25, 2018
793cbf1
fix bug
Sep 25, 2018
b14c108
debug
Sep 25, 2018
0ae9f6d
fix bug
Sep 25, 2018
5938310
fix bug
Sep 25, 2018
c756188
fix bug
Sep 25, 2018
b6ce813
split metrics into single line, and read metrics no matter if subproc…
Sep 25, 2018
dc0f96b
add unit test for hdfsClientUtility
Sep 25, 2018
55b6e08
fix bug
Sep 25, 2018
2529376
Add experiment id in update metrics url to differentiate trials
Sep 25, 2018
0f7d40c
add default outputdir
Sep 25, 2018
43d7ab7
update
Sep 25, 2018
9c53f47
fix trial_keeper
Sep 25, 2018
beac29c
fix bug
Sep 25, 2018
c54ad7a
add default value for nnioutputdir
Sep 25, 2018
c214362
fix bug
Sep 25, 2018
60bf770
remove unused code
Sep 25, 2018
fad2ba3
PAI Training service implementation, v1 (#1)
yds05 Sep 25, 2018
7f06762
fix conflict
Sep 25, 2018
aa4f306
fix conflict
Sep 25, 2018
45c9600
fix conflict
Sep 25, 2018
24dd1b6
Remove unused import and paiTrialConfig file
Sep 25, 2018
84d278c
Merge branch 'master' into dev-pai
yds05 Sep 25, 2018
9febbd3
Merge pull request #3 from yds05/dev-pai
yds05 Sep 25, 2018
7a43c54
fix conflict
Sep 25, 2018
3e0cce2
refactor code
Sep 26, 2018
ef1eaf8
fix comments
Sep 26, 2018
7f9baea
fix comment
Sep 26, 2018
4af5c60
Implement cancel job API for pai training service
Sep 26, 2018
eb548cf
fix default value for outputDir
Sep 26, 2018
4d24e87
fix comments
Sep 26, 2018
6325bd3
Merge pull request #4 from yds05/dev-pai-desy
yds05 Sep 26, 2018
1db913c
Merge pull request #2 from yds05/dev-pai-t-shya2
SparkSnail Sep 26, 2018
5487975
Merge branch 'master' of https://github.com/Microsoft/nni into Micros…
Sep 26, 2018
88b1876
Merge branch 'Microsoft-master'
Sep 26, 2018
90c9e69
Merge pull request #6 from yds05/master
SparkSnail Sep 26, 2018
b714a8f
fix pip install to master
Sep 26, 2018
b6a233a
change pip install branch in paiData.ts
Sep 27, 2018
9511174
fix conflict
Sep 27, 2018
52b1cc8
fix log path
Sep 27, 2018
76bd378
fix conflict
Sep 27, 2018
c27d146
add logpath logic
Sep 27, 2018
449a4f3
add log path
Sep 27, 2018
1d9f23e
refactor schema
Sep 27, 2018
aa552c0
Fix bug that all trials use the same hdfs log path
Sep 27, 2018
9edfb34
Merge pull request #7 from yds05/dev-pai-t-shya2
yds05 Sep 27, 2018
cb46266
Update PAI training service PR comments
Sep 27, 2018
f09a651
Remove unused nnits-tool in uninstallation
Sep 27, 2018
94c92c3
Remove unused training_service_tool package in setup.py
Sep 27, 2018
2eca5d9
Update setup.py version to 0.2.0
Sep 27, 2018
717856e
Change pip install repo to Microsoft/nni
Sep 27, 2018
c32cd52
Update NNI v0.2 release notes
Sep 27, 2018
a549a16
Merge pull request #8 from Microsoft/master
yds05 Sep 27, 2018
76c10e8
Fix typo based on PR comments
Sep 27, 2018
5442251
Add NNI installation scripts
Sep 28, 2018
09fd234
Merge pull request #9 from Microsoft/v0.2
yds05 Sep 28, 2018
73556cc
Update pai script, update NNI_out_dir
Sep 28, 2018
a0da600
Update NNI dir in nni sdk local.py
Sep 28, 2018
8b644d8
Create .nni folder in nni sdk local.py
Sep 28, 2018
fb6e57f
Add check before creating .nni folder
Sep 28, 2018
567bf09
Fix typo for PAI_INSTALL_NNI_SHELL_FORMAT
Sep 28, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 14 additions & 5 deletions src/nni_manager/training_service/pai/paiData.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,21 @@ export class PAITrialJobDetail implements TrialJobDetail {
}
}

/**
 * Contents of the shell script that makes sure the `nni` Python package is
 * available before a trial starts.
 *
 * The script probes for the package with `python3 -c 'import nni'`; when the
 * import succeeds it stops immediately, otherwise it installs NNI v0.2 from
 * GitHub with `pip3 install --user`.
 *
 * The "already installed" branch ends with `exit 0` rather than `return`:
 * `return` is only valid inside a function or a sourced script, so when the
 * file is executed (e.g. `sh install_nni.sh && ...`) it can fail with a
 * non-zero status and stop the rest of the command chain.
 */
export const PAI_INSTALL_NNI_SHELL_FORMAT: string =
`#!/bin/bash
if python3 -c 'import nni' > /dev/null 2>&1; then
# nni module is already installed, skip
exit 0
else
# Install nni
pip3 install -v --user git+https://github.com/Microsoft/nni.git@v0.2
fi`;

export const PAI_TRIAL_COMMAND_FORMAT: string =
`pip3 install -v --user git+https://github.com/Microsoft/nni.git@master
&& export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={0} NNI_TRIAL_JOB_ID={1} NNI_EXP_ID={2}
&& cd $NNI_SYS_DIR && mkdir .nni
&& python3 -m trial_tool.trial_keeper --trial_command '{3}' --nnimanager_ip '{4}' --pai_hdfs_output_dir '{5}'
--pai_hdfs_host '{6}' --pai_user_name {7}`;
`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3}
&& cd $NNI_SYS_DIR && sh install_nni.sh
&& python3 -m trial_tool.trial_keeper --trial_command '{4}' --nnimanager_ip '{5}' --pai_hdfs_output_dir '{6}'
--pai_hdfs_host '{7}' --pai_user_name {8}`;

export const PAI_OUTPUT_DIR_FORMAT: string =
`hdfs://{0}:9000/`;
Expand Down
7 changes: 6 additions & 1 deletion src/nni_manager/training_service/pai/paiTrainingService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ import {
} from '../../common/trainingService';
import { delay, getExperimentRootDir, getIPV4Address, uniqueString } from '../../common/utils';
import { PAIJobRestServer } from './paiJobRestServer'
import { PAITrialJobDetail, PAI_TRIAL_COMMAND_FORMAT, PAI_OUTPUT_DIR_FORMAT, PAI_LOG_PATH_FORMAT } from './paiData';
import { PAITrialJobDetail, PAI_INSTALL_NNI_SHELL_FORMAT, PAI_TRIAL_COMMAND_FORMAT, PAI_OUTPUT_DIR_FORMAT, PAI_LOG_PATH_FORMAT } from './paiData';
import { PAIJobInfoCollector } from './paiJobInfoCollector';
import { String } from 'typescript-string-operations';
import { NNIPAITrialConfig, PAIClusterConfig, PAIJobConfig, PAITaskRole } from './paiConfig';
Expand Down Expand Up @@ -142,6 +142,10 @@ class PAITrainingService implements TrainingService {
//create tmp trial working folder locally.
await cpp.exec(`mkdir -p ${path.dirname(trialLocalTempFolder)}`);
await cpp.exec(`cp -r ${this.paiTrialConfig.codeDir} ${trialLocalTempFolder}`);

const runScriptContent : string = PAI_INSTALL_NNI_SHELL_FORMAT;
// Write NNI installation file to local tmp files
await fs.promises.writeFile(path.join(trialLocalTempFolder, 'install_nni.sh'), runScriptContent, { encoding: 'utf8' });

// Write file content ( parameter.cfg ) to local tmp folders
const trialForm : TrialJobApplicationForm = (<TrialJobApplicationForm>form)
Expand Down Expand Up @@ -188,6 +192,7 @@ class PAITrainingService implements TrainingService {
PAI_TRIAL_COMMAND_FORMAT,
// PAI will copy job's codeDir into /root directory
`/root/${trialJobId}`,
`/root/${trialJobId}/nnioutput`,
trialJobId,
this.experimentId,
this.paiTrialConfig.command,
Expand Down
12 changes: 7 additions & 5 deletions src/sdk/pynni/nni/platform/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,18 @@

from ..common import init_logger

_sysdir = os.environ['NNI_SYS_DIR']
if not os.path.exists(os.path.join(_sysdir, '.nni')):
os.makedirs(os.path.join(_sysdir, '.nni'))
_metric_file = open(os.path.join(_sysdir, '.nni', 'metrics'), 'wb')

_dir = os.environ['NNI_SYS_DIR']
_metric_file = open(os.path.join(_dir, '.nni', 'metrics'), 'wb')

_log_file_path = os.path.join(_dir, 'trial.log')
_outputdir = os.environ['NNI_OUTPUT_DIR']
_log_file_path = os.path.join(_outputdir, 'trial.log')
init_logger(_log_file_path)


def get_parameters():
params_file = open(os.path.join(_dir, 'parameter.cfg'), 'r')
params_file = open(os.path.join(_sysdir, 'parameter.cfg'), 'r')
return json.load(params_file)

def send_metric(string):
Expand Down
2 changes: 1 addition & 1 deletion tools/trial_tool/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@

HOME_DIR = os.path.join(os.environ['HOME'], 'nni')

LOG_DIR = os.path.join(HOME_DIR, 'trial-keeper', 'log')
LOG_DIR = os.environ['NNI_OUTPUT_DIR']

STDOUT_FULL_PATH = os.path.join(LOG_DIR, 'stdout')

Expand Down
7 changes: 5 additions & 2 deletions tools/trial_tool/metrics_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,12 @@ class TrialMetricsReader():
Read metrics data from a trial job
'''
def __init__(self, rest_port = DEFAULT_REST_PORT):
self.offset_filename = os.path.join(NNI_SYS_DIR, '.nni', 'metrics_offset')
self.metrics_filename = os.path.join(NNI_SYS_DIR, '.nni', 'metrics')
metrics_base_dir = os.path.join(NNI_SYS_DIR, '.nni')
self.offset_filename = os.path.join(metrics_base_dir, 'metrics_offset')
self.metrics_filename = os.path.join(metrics_base_dir, 'metrics')
self.rest_port = rest_port
if not os.path.exists(metrics_base_dir):
os.makedirs(metrics_base_dir)

def _metrics_file_is_empty(self):
if not os.path.isfile(self.metrics_filename):
Expand Down