Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Support Kubeflow pytorch-operator #406

Merged
merged 216 commits into from
Dec 7, 2018
Merged
Show file tree
Hide file tree
Changes from 215 commits
Commits
Show all changes
216 commits
Select commit Hold shift + click to select a range
dc780cd
Merge pull request #1 from Microsoft/master
SparkSnail Sep 14, 2018
86243e7
Merge pull request #2 from Microsoft/master
SparkSnail Sep 14, 2018
3d1e4e9
fix nnictl bug
Sep 14, 2018
6d09780
Merge pull request #4 from Microsoft/master
SparkSnail Sep 17, 2018
0d24158
Merge branch 'master' of https://github.com/SparkSnail/nni
Sep 18, 2018
6d669c6
Merge pull request #6 from Microsoft/master
SparkSnail Sep 19, 2018
af2615d
Merge pull request #8 from Microsoft/master
SparkSnail Sep 20, 2018
f6b7c0a
Merge pull request #9 from Microsoft/master
SparkSnail Sep 24, 2018
a74febc
Merge pull request #10 from Microsoft/master
SparkSnail Sep 25, 2018
334b0a4
Merge pull request #12 from Microsoft/master
SparkSnail Sep 27, 2018
efe93df
Merge pull request #13 from Microsoft/master
SparkSnail Sep 27, 2018
0d9b074
Merge branch 'master' of https://github.com/SparkSnail/nni
Sep 28, 2018
421ad1a
Merge pull request #16 from Microsoft/master
SparkSnail Sep 30, 2018
660a8f8
Merge branch 'master' of https://github.com/SparkSnail/nni
Sep 30, 2018
2b01089
fix install.sh
Sep 30, 2018
951e80e
Merge pull request #17 from Microsoft/master
SparkSnail Oct 1, 2018
90fe674
Merge pull request #18 from Microsoft/master
SparkSnail Oct 7, 2018
2ccf0ed
Merge pull request #19 from Microsoft/master
SparkSnail Oct 8, 2018
77aacee
Merge pull request #20 from Microsoft/master
SparkSnail Oct 8, 2018
9e23dfe
Merge pull request #22 from Microsoft/master
SparkSnail Oct 8, 2018
ca7bbe4
Merge pull request #24 from Microsoft/master
SparkSnail Oct 10, 2018
346badd
add desc for Dockerfile.build.base
Oct 10, 2018
4af27d6
Merge pull request #27 from Microsoft/master
SparkSnail Oct 11, 2018
46a8350
update document for Dockerfile
Oct 11, 2018
4e3697f
Merge pull request #29 from Microsoft/master
SparkSnail Oct 12, 2018
4cd95aa
Merge pull request #30 from Microsoft/master
SparkSnail Oct 15, 2018
405ce45
Merge pull request #31 from Microsoft/master
SparkSnail Oct 15, 2018
c3949e6
Merge pull request #32 from Microsoft/master
SparkSnail Oct 16, 2018
22c78fd
Merge pull request #33 from Microsoft/master
SparkSnail Oct 16, 2018
a870817
update
Oct 16, 2018
b45268c
refactor port detect
Oct 16, 2018
59626ec
update
Oct 16, 2018
31ea28b
Merge pull request #34 from Microsoft/master
SparkSnail Oct 16, 2018
2ca84c5
refactor NNICTLDOC.md
Oct 17, 2018
ab02c93
add document for pai and nnictl
Oct 17, 2018
5ff7b45
add default value for port
Oct 17, 2018
c1e835d
Merge pull request #35 from Microsoft/master
SparkSnail Oct 18, 2018
5ae146d
add exception handling in trial_keeper.py
Oct 18, 2018
1dde461
fix port bug
Oct 18, 2018
fe6a188
Merge pull request #36 from Microsoft/master
SparkSnail Oct 18, 2018
f7a5228
Merge pull request #37 from Microsoft/master
SparkSnail Oct 19, 2018
9fdf6d4
fix resume
Oct 19, 2018
c1285f8
fix nnictl resume and fix nnictl stop
Oct 19, 2018
af0d081
fix document
Oct 19, 2018
7ce8fd8
update
Oct 19, 2018
b29aaed
refactor nnictl
Oct 19, 2018
683833b
update
Oct 19, 2018
6149bf9
update doc
Oct 22, 2018
73bef2f
update
Oct 22, 2018
5c397f6
update nnictl
Oct 23, 2018
2c68171
fix comment
Oct 23, 2018
d0659d6
fix conflict
Oct 23, 2018
e64abbf
Merge branch 'Microsoft-master'
Oct 23, 2018
a70f7e7
Merge pull request #39 from Microsoft/master
SparkSnail Oct 23, 2018
8d14ca9
revert dockerfile
Oct 23, 2018
aeb7c66
update
Oct 23, 2018
a256549
update
Oct 23, 2018
3fb0bca
update
Oct 23, 2018
e116af4
Merge pull request #40 from Microsoft/master
SparkSnail Oct 23, 2018
f4edebc
Merge pull request #41 from Microsoft/master
SparkSnail Oct 24, 2018
9413d77
Merge pull request #42 from Microsoft/master
SparkSnail Oct 25, 2018
c0663e8
Merge pull request #43 from Microsoft/master
SparkSnail Oct 25, 2018
357545b
Merge pull request #44 from Microsoft/master
SparkSnail Oct 26, 2018
e8dec33
Merge pull request #45 from Microsoft/master
SparkSnail Oct 27, 2018
46bf7b4
fix nnictl error hit
Oct 30, 2018
7fd0ac3
Merge pull request #46 from Microsoft/master
SparkSnail Oct 30, 2018
4cc7d92
Merge branch 'master' of https://github.com/SparkSnail/nni
Oct 30, 2018
55701a2
fix comments
Oct 30, 2018
25d1c22
Merge pull request #47 from Microsoft/master
SparkSnail Oct 30, 2018
b6b45d1
fix bash-completion
Oct 30, 2018
04c03a4
fix paramiko install
Oct 31, 2018
72f0e45
Merge pull request #48 from Microsoft/master
SparkSnail Oct 31, 2018
7c91af9
quick fix resume logic
Oct 31, 2018
26ef462
update
Oct 31, 2018
8237228
Merge pull request #49 from Microsoft/master
SparkSnail Oct 31, 2018
3404415
Merge pull request #52 from Microsoft/master
SparkSnail Nov 1, 2018
cbee322
fix nnictl in master
Nov 1, 2018
0d67eca
update
Nov 1, 2018
a982ed0
update
Nov 1, 2018
5d44f7b
update
Nov 1, 2018
88072dc
update
Nov 1, 2018
12a3e21
Merge pull request #53 from Microsoft/master
SparkSnail Nov 1, 2018
e4b2082
remove paramiko
Nov 2, 2018
53167b4
update
Nov 2, 2018
384a159
Merge pull request #57 from Microsoft/master
SparkSnail Nov 2, 2018
96ad3ed
refactor requirement.txt of sklearn
Nov 2, 2018
cb9d74e
Merge branch 'master' into master
SparkSnail Nov 2, 2018
8a026cc
update
Nov 2, 2018
369a303
Merge branch 'master' of https://github.com/SparkSnail/nni
Nov 2, 2018
7499de7
fix readme.md
Nov 2, 2018
78ee31c
update
Nov 2, 2018
e7245f8
update
Nov 2, 2018
775701a
update
Nov 2, 2018
1520a3e
fix conflict
Nov 5, 2018
0a3c103
Merge branch 'Microsoft-master'
Nov 5, 2018
4dbbb90
fix conflict
Nov 5, 2018
3eb8ea0
Merge branch 'Microsoft-master'
Nov 5, 2018
c34809d
add HowToContribute.md
Nov 5, 2018
cfca9f2
add port in experiment list
Nov 7, 2018
f31b27b
fix localTrainingService stop logic
Nov 7, 2018
84fe396
support pai port
Nov 7, 2018
b12ed90
update
Nov 7, 2018
0ebbb29
Merge pull request #61 from Microsoft/master
SparkSnail Nov 9, 2018
5123ab1
update
Nov 9, 2018
9771511
update
Nov 9, 2018
66cecf4
Merge pull request #62 from Microsoft/master
SparkSnail Nov 9, 2018
618860a
add sklearn version
Nov 9, 2018
bd4f431
add pytorch in dockerfile
Nov 12, 2018
24c7a8c
fix conflict
Nov 12, 2018
cac435e
fix conflict
Nov 12, 2018
c742f38
Merge branch 'Microsoft-master'
Nov 12, 2018
0a8e3ab
update
Nov 12, 2018
d4e41f8
update
Nov 12, 2018
1a6b7e6
fix conflict
Nov 12, 2018
9cbe353
Merge branch 'Microsoft-master'
Nov 12, 2018
88d33ea
Merge pull request #65 from Microsoft/master
SparkSnail Nov 12, 2018
36c6bc2
update
Nov 13, 2018
ac6beb5
Merge pull request #66 from Microsoft/master
SparkSnail Nov 13, 2018
9ffbe8d
fix Dockerfile
Nov 13, 2018
a39c8f3
fix conflict
Nov 13, 2018
c280ba5
Merge branch 'Microsoft-master'
Nov 13, 2018
898b469
Merge pull request #68 from Microsoft/master
SparkSnail Nov 14, 2018
050d485
Merge pull request #69 from Microsoft/master
SparkSnail Nov 14, 2018
2bcca69
fix nnictl stop
Nov 14, 2018
1fc4369
update
Nov 16, 2018
0f6b466
update
Nov 16, 2018
5106381
update
Nov 16, 2018
9c8ffb5
Merge pull request #70 from Microsoft/master
SparkSnail Nov 19, 2018
f170be6
fix nnictl classArgs
Nov 19, 2018
3238fa3
Kubeflow TrainingService support, v1 (#373)
yds05 Nov 19, 2018
0ee46f5
Fix nni stop (#368)
SparkSnail Nov 16, 2018
12db0cb
Add more tooltips in default metric graph (#370)
lvybriage Nov 19, 2018
8657d53
Update README.md (#371)
yds05 Nov 19, 2018
23a75a4
Kubeflow training service: Check nfs configuration and throw error if…
yds05 Nov 19, 2018
6878c4c
Merge pull request #71 from Microsoft/master
SparkSnail Nov 20, 2018
a8fc806
Merge branch 'kubeflow' of https://github.com/Microsoft/nni into kube…
Nov 20, 2018
34936df
Merge pull request #72 from Microsoft/master
SparkSnail Nov 20, 2018
515ce09
Merge branch 'master' of https://github.com/SparkSnail/nni into kubeflow
Nov 20, 2018
65f64d8
add azure storage for aks
Nov 20, 2018
b6ee4ab
update
Nov 20, 2018
710bd8f
update
Nov 20, 2018
b019754
update
Nov 21, 2018
fb70b7e
Merge pull request #73 from Microsoft/master
SparkSnail Nov 22, 2018
1b84d81
Merge branch 'master' of https://github.com/SparkSnail/nni into kubeflow
Nov 22, 2018
8103ff0
Merge pull request #74 from Microsoft/master
SparkSnail Nov 23, 2018
5b9278c
update
Nov 23, 2018
5805c2a
update
Nov 23, 2018
7f13e2b
Merge pull request #75 from Microsoft/master
SparkSnail Nov 23, 2018
ca6bc19
remove pip install from azure-pipeline.yml
Nov 23, 2018
b59eef4
Merge pull request #76 from Microsoft/master
SparkSnail Nov 23, 2018
f8f76f0
fix conflict
Nov 23, 2018
c316f57
fix conflict
Nov 23, 2018
d877522
Merge pull request #77 from Microsoft/master
SparkSnail Nov 23, 2018
f822751
fix conflict
Nov 23, 2018
9ad9568
Merge pull request #78 from Microsoft/master
SparkSnail Nov 26, 2018
892d0ca
Merge branch 'kubeflow' of https://github.com/SparkSnail/nni into pyt…
Nov 26, 2018
cb0e5f4
support pytorch dist
Nov 26, 2018
8ce338a
update
Nov 26, 2018
3dc19d7
update
Nov 26, 2018
e4de3eb
debug
Nov 26, 2018
2972664
debug
Nov 26, 2018
5a16ce6
debug
Nov 26, 2018
a37a888
debug
Nov 26, 2018
0caff4f
debug
Nov 26, 2018
81737d5
update
Nov 26, 2018
d13b138
refactor createDirectory func name
Nov 26, 2018
47aeed3
update
Nov 26, 2018
afc0a0f
update
Nov 27, 2018
4b431cf
add regex for rest validation
Nov 27, 2018
fd70ad3
update
Nov 27, 2018
2244979
update
Nov 27, 2018
2e05bce
refactor code by comments
Nov 27, 2018
85418b3
add type for azureStorageClient
Nov 27, 2018
760e317
break long lines
Nov 27, 2018
6ef56e6
update
Nov 27, 2018
c3de123
remove empty lines
Nov 27, 2018
fa0a011
update
Nov 27, 2018
1d1535b
fix conflict
Nov 27, 2018
1eacda9
update
Nov 27, 2018
a0371e9
Merge pull request #79 from Microsoft/master
SparkSnail Nov 28, 2018
765c278
fix conflict
Nov 28, 2018
40c8a9c
update
Nov 28, 2018
f90bf48
update launcher
Nov 28, 2018
8b862de
update
Nov 28, 2018
c22c0f7
update
Nov 28, 2018
e1ad56b
Merge pull request #80 from Microsoft/master
SparkSnail Nov 29, 2018
2782f09
Merge branch 'master' of https://github.com/SparkSnail/nni into pytor…
Nov 29, 2018
a463fe2
refactor nnictl
Nov 29, 2018
ab47184
set kubernetesServer as optional
Nov 29, 2018
2d9ddad
Merge pull request #81 from Microsoft/master
SparkSnail Nov 29, 2018
545ad8c
Merge pull request #82 from Microsoft/master
SparkSnail Nov 30, 2018
e39610c
Merge branch 'master' of https://github.com/SparkSnail/nni into pytor…
Nov 30, 2018
4494310
Merge pull request #83 from Microsoft/master
SparkSnail Dec 3, 2018
955f9f8
fix conflict
Dec 3, 2018
9d8c60a
refactor pytorch
Dec 3, 2018
64f547b
update token
Dec 3, 2018
ec08bc2
Merge pull request #84 from Microsoft/master
SparkSnail Dec 3, 2018
28f615d
Merge pull request #85 from Microsoft/master
SparkSnail Dec 4, 2018
9859baa
add annotation
Dec 4, 2018
249a74b
Merge pull request #87 from Microsoft/master
SparkSnail Dec 6, 2018
b5f4c51
fix conflict
Dec 6, 2018
939309e
fix error
Dec 6, 2018
676c03e
update
Dec 6, 2018
028acdb
update
Dec 6, 2018
1a6df9b
revert paiTrainingService
Dec 6, 2018
1ae112a
remove blank lines
Dec 6, 2018
d2aa86d
update
Dec 6, 2018
e6360eb
update
Dec 6, 2018
2d4c15d
update
Dec 7, 2018
cc94cb8
update
Dec 7, 2018
9b4ea5f
update
Dec 7, 2018
74d5c11
update
Dec 7, 2018
b85c31d
fix generateRunScript
Dec 7, 2018
571b989
update
Dec 7, 2018
a0125ca
update
Dec 7, 2018
ddb896d
update
Dec 7, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions src/nni_manager/rest_server/restValidationSchemas.ts
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,15 @@ export namespace ValidationSchemas {
memoryMB: joi.number().min(100),
gpuNum: joi.number().min(0).required(),
command: joi.string().min(1).required()
}),
master: joi.object({
replicas: joi.number().min(1).required(),
image: joi.string().min(1),
outputDir: joi.string(),
cpuNum: joi.number().min(1),
memoryMB: joi.number().min(100),
gpuNum: joi.number().min(0).required(),
command: joi.string().min(1).required()
})
}),
pai_config: joi.object({
Expand All @@ -68,6 +77,7 @@ export namespace ValidationSchemas {
}),
kubeflow_config: joi.object({
operator: joi.string().min(1).required(),
storage: joi.string().min(1),
nfs: joi.object({
server: joi.string().min(1).required(),
path: joi.string().min(1).required()
Expand Down
74 changes: 57 additions & 17 deletions src/nni_manager/training_service/kubeflow/kubeflowConfig.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,41 +23,63 @@ import { TrialConfig } from "../common/trialConfig";


/** Operator types supported by Kubeflow. */
export type KubeflowOperator = 'tf-operator' | 'pytorch-operator' | 'mxnet-operator' | 'caffe2-operator' | 'chainer-operator' | 'mpi-operator';
export type KubeflowOperatorPlural = 'tfjobs' | 'pytorchjobs' | 'mxjobs' | 'caffe2jobs' | 'chainerjobs' | 'mpijobs';
export type KubeflowOperator = 'tf-operator' | 'pytorch-operator' ;
export type KubeflowOperatorPlural = 'tfjobs' | 'pytorchjobs' ;
/** Job kind names; these are the value type of kubeflowOperatorJobKindMap. */
export type KubeflowOperatorJobKind = 'TFJob' | 'PyTorchJob';
/** Storage kinds accepted by the `storage` field of the cluster configuration. */
export type KubeflowStorageKind = 'nfs' | 'azureStorage';

/**
* Map from Kubeflow operator name (KubeflowOperator) to its plural name
* in K8S (KubeflowOperatorPlural).
*/
export const kubeflowOperatorMap : Map<KubeflowOperator, KubeflowOperatorPlural> = new Map<KubeflowOperator, KubeflowOperatorPlural>([
['tf-operator' , 'tfjobs'],
['pytorch-operator', 'pytorchjobs'],
['mxnet-operator', 'mxjobs'],
['caffe2-operator', 'caffe2jobs'],
['chainer-operator', 'chainerjobs'],
['mpi-operator', 'mpijobs']
['pytorch-operator', 'pytorchjobs']
]);

/**
 * Lookup table from Kubeflow operator name to its job kind name in K8S.
 * Entries are inserted in the order: tf-operator, pytorch-operator.
 */
export const kubeflowOperatorJobKindMap : Map<KubeflowOperator, KubeflowOperatorJobKind> =
    new Map<KubeflowOperator, KubeflowOperatorJobKind>()
        .set('tf-operator', 'TFJob')
        .set('pytorch-operator', 'PyTorchJob');

/**
* Kubeflow cluster configuration
*
*/
export class KubeflowClusterConfig {
export class KubeflowClusterConfigBase {
/** Name of Kubeflow operator, like tf-operator */
public readonly operator: KubeflowOperator;
public readonly nfs?: NFSConfig;
public readonly keyVault?: keyVaultConfig;
public readonly azureStorage?: AzureStorage;
public readonly storage?: KubeflowStorageKind;

/**
* Constructor
* @param operator Name of the Kubeflow operator, stored in the `operator` property
* @param storage Optional storage kind, stored in the `storage` property
*/
constructor(operator: KubeflowOperator, nfs?: NFSConfig, keyVault?: keyVaultConfig, azureStorage ?: AzureStorage) {
constructor(operator: KubeflowOperator, storage?: KubeflowStorageKind) {
this.operator = operator;
this.nfs = nfs;
this.storage = storage;
}
}

/**
 * Kubeflow cluster configuration that additionally carries an NFS configuration.
 */
export class KubeflowClusterConfigNFS extends KubeflowClusterConfigBase {
    /**
     * @param operator Name of the Kubeflow operator; forwarded to the base constructor
     * @param nfs NFS configuration; exposed as the public readonly `nfs` property
     * @param storage Optional storage kind; forwarded to the base constructor
     */
    constructor(operator: KubeflowOperator, public readonly nfs: NFSConfig, storage?: KubeflowStorageKind) {
        // The parameter property above assigns this.nfs right after super() returns.
        super(operator, storage);
    }
}

export class KubeflowClusterConfigAzure extends KubeflowClusterConfigBase{
public readonly keyVault: keyVaultConfig;
public readonly azureStorage: AzureStorage;

constructor(operator: KubeflowOperator, keyVault: keyVaultConfig, azureStorage: AzureStorage, storage?: KubeflowStorageKind) {
super(operator, storage)
this.keyVault = keyVault;
this.azureStorage = azureStorage;
}
Expand Down Expand Up @@ -142,15 +164,33 @@ export class KubeflowTrialConfigTemplate {
}
}

/**
* Trial configuration holding the `codeDir` value.
*/
export class KubeflowTrialConfig {
export class KubeflowTrialConfigBase {
/** Code directory value supplied to the constructor */
public readonly codeDir: string;

/**
* @param codeDir Value stored in the `codeDir` property
*/
constructor(codeDir: string) {
this.codeDir = codeDir;
}
}

/**
* Trial configuration with a required `worker` template and an optional `ps` template.
*/
export class KubeflowTrialConfigTensorflow extends KubeflowTrialConfigBase{
/** Optional `ps` template */
public readonly ps?: KubeflowTrialConfigTemplate;
/** Required `worker` template */
public readonly worker: KubeflowTrialConfigTemplate;

/**
* @param codeDir Value for the `codeDir` property
* @param worker Template stored in the `worker` property
* @param ps Optional template stored in the `ps` property
*/
constructor(codeDir: string, worker: KubeflowTrialConfigTemplate, ps?: KubeflowTrialConfigTemplate) {
this.codeDir = codeDir;
this.worker = worker;
constructor(codeDir: string, worker: KubeflowTrialConfigTemplate, ps?: KubeflowTrialConfigTemplate) {
super(codeDir);
this.ps = ps;
this.worker = worker;
}
}

/**
 * Trial configuration with a required `worker` template and an optional
 * `master` template.
 */
export class KubeflowTrialConfigPytorch extends KubeflowTrialConfigBase {
    /** Optional `master` template */
    public readonly master?: KubeflowTrialConfigTemplate;
    /** Required `worker` template */
    public readonly worker: KubeflowTrialConfigTemplate;

    /**
     * @param codeDir Forwarded to the base-class constructor
     * @param worker Template stored in the `worker` property
     * @param master Optional template stored in the `master` property
     */
    constructor(
        codeDir: string,
        worker: KubeflowTrialConfigTemplate,
        master?: KubeflowTrialConfigTemplate
    ) {
        super(codeDir);
        // Assign master before worker so property creation order is unchanged.
        this.master = master;
        this.worker = worker;
    }
}

Loading