-
Notifications
You must be signed in to change notification settings - Fork 1.8k
Support remote training service use reuse mode #2923
Changes from 34 commits
dcd2ffd
3b8b6fb
916e444
caeffb8
57c300e
65660e6
9376d6a
5fef3cf
5544ae8
f9fdfee
c5e26ef
10a04ba
aa64fe6
4ed907f
c6a5f8c
68abe2f
c2b50d2
14e9619
f69e206
a5bb753
12ef0aa
7600a0f
ddcf229
bd327d4
c4f6e66
da2d1c4
529c29f
2a386d3
169e65f
88f8c1b
870b2d0
60ff833
c4fa1c3
4e56975
a428853
3b57f94
8d106ba
492ff8e
41e3ebd
9b5b3f7
1dabc88
c68a7f3
abd660c
d998599
c8ec30a
ebc12d2
e772871
863100c
1387f38
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -592,6 +592,14 @@ Specifies the pre-command that will be executed before the remote machine execut | |
|
||
__Note__: Because __preCommand__ is executed before other commands every time, it is strongly recommended not to set a __preCommand__ that makes changes to the system, e.g. `mkdir` or `touch`. | ||
|
||
### remoteConfig | ||
|
||
Optional field in remote mode. Sets the configuration related to the remote machines. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the description is not clear There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. updated |
||
|
||
#### reuse | ||
|
||
Optional. Sets whether to use trial_runner to maintain multiple trials. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is also not clear. you can describe the benefit when There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. updated. |
||
|
||
### kubeflowConfig | ||
|
||
#### operator | ||
|
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
// Copyright (c) Microsoft Corporation. | ||
// Licensed under the MIT license. | ||
|
||
import { EnvironmentInformation } from '../environment'; | ||
import { RemoteMachineTrialJobDetail } from '../../remote_machine/remoteMachineData'; | ||
import { TrialJobApplicationForm } from '../../../common/trainingService'; | ||
|
||
|
||
/**
 * Workaround: scheduling machines needs the RemoteMachineTrialJobDetail data
 * structure, so this subclass builds one populated with placeholder values.
 */
export class RemoteMachineMetaDetail extends RemoteMachineTrialJobDetail {
    constructor() {
        // The form is only a placeholder that satisfies the base constructor.
        const placeholderForm: TrialJobApplicationForm = {
            sequenceId: 0,
            hyperParameters: { value: '', index: 0 }
        };
        super('', 'WAITING', 1, '', placeholderForm);
    }
}
|
||
/**
 * EnvironmentInformation extended with an optional RemoteMachineMetaDetail.
 */
export class RemoteMachineEnvironmentInformation extends EnvironmentInformation {
    // Left undefined until a RemoteMachineMetaDetail is assigned.
    public rmMachineMetaDetail?: RemoteMachineMetaDetail;
}
|
||
/**
 * Immutable holder for the `reuse` flag.
 */
export class RemoteConfig {
    /**
     * Stores the flag as a public read-only property via a parameter property.
     * @param reuse If job is reusable for multiple trials
     */
    constructor(public readonly reuse: boolean) {}
}
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,6 +27,10 @@ def main_loop(args): | |
gpu_refresh_last_time = datetime.now() - timedelta(minutes=1) | ||
|
||
try: | ||
if args.job_pid_file: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If use same way like openpai, it doesn't need here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This pid file is used to check trial_runner process status for environment. In openPAI, we use restful API to get trial_runner status, but in remote mode, we need to maintain the pid file to get trial_runner status. |
||
with open(args.job_pid_file, 'w') as job_file: | ||
job_file.write("%d" % os.getpid()) | ||
|
||
trials = dict() | ||
|
||
command_channel = args.command_channel | ||
|
@@ -143,6 +147,7 @@ def check_version(args): | |
PARSER.add_argument('--nni_manager_version', type=str, help='the nni version transmitted from nniManager') | ||
PARSER.add_argument('--log_collection', type=str, help='set the way to collect log in trial runner') | ||
PARSER.add_argument('--node_count', type=int, help='number of nodes, it determines how to consume command and save code file') | ||
PARSER.add_argument('--job_pid_file', type=str, help='save trial runner process pid') | ||
args, unknown = PARSER.parse_known_args() | ||
|
||
setting_file = "settings.json" | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
it is a little strange to have both "machineList" and "remoteConfig" in the same level
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
machineList is a list-type field. I considered merging machineList under remoteConfig, but that may cause compatibility problems.