Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

merge master #89

Merged
merged 2 commits into from
Dec 7, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/ExperimentConfig.md
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,10 @@ machineList:

__operator__ specify the kubeflow's operator to be used, nni support __tf-operator__ in current version.

* __storage__

__storage__ specify the storage type of kubeflow, including {__nfs__, __azureStorage__}. This field is optional, and the default value is __nfs__. If the config use azureStorage, this field must be completed.

* __nfs__

__server__ is the host of nfs server
Expand Down
2 changes: 2 additions & 0 deletions docs/KubeflowMode.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ trial:
image: {your_docker_image_for_tensorflow_worker}
kubeflowConfig:
operator: tf-operator
storage: nfs
nfs:
server: {your_nfs_server}
path: {your_nfs_server_exported_path}
Expand All @@ -71,6 +72,7 @@ If you use Azure Kubernetes Service, you should set `kubeflowConfig` in your co
```
kubeflowConfig:
operator: tf-operator
storage: azureStorage
keyVault:
vaultName: {your_vault_name}
name: {your_secert_name}
Expand Down
10 changes: 10 additions & 0 deletions src/nni_manager/rest_server/restValidationSchemas.ts
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,15 @@ export namespace ValidationSchemas {
memoryMB: joi.number().min(100),
gpuNum: joi.number().min(0).required(),
command: joi.string().min(1).required()
}),
master: joi.object({
replicas: joi.number().min(1).required(),
image: joi.string().min(1),
outputDir: joi.string(),
cpuNum: joi.number().min(1),
memoryMB: joi.number().min(100),
gpuNum: joi.number().min(0).required(),
command: joi.string().min(1).required()
})
}),
pai_config: joi.object({
Expand All @@ -68,6 +77,7 @@ export namespace ValidationSchemas {
}),
kubeflow_config: joi.object({
operator: joi.string().min(1).required(),
storage: joi.string().min(1),
nfs: joi.object({
server: joi.string().min(1).required(),
path: joi.string().min(1).required()
Expand Down
74 changes: 57 additions & 17 deletions src/nni_manager/training_service/kubeflow/kubeflowConfig.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,41 +23,63 @@ import { TrialConfig } from "../common/trialConfig";


/** operator types that kubeflow supported */
export type KubeflowOperator = 'tf-operator' | 'pytorch-operator' | 'mxnet-operator' | 'caffe2-operator' | 'chainer-operator' | 'mpi-operator';
export type KubeflowOperatorPlural = 'tfjobs' | 'pytorchjobs' | 'mxjobs' | 'caffe2jobs' | 'chainerjobs' | 'mpijobs';
export type KubeflowOperator = 'tf-operator' | 'pytorch-operator' ;
export type KubeflowOperatorPlural = 'tfjobs' | 'pytorchjobs' ;
export type KubeflowOperatorJobKind = 'TFJob' | 'PyTorchJob';
export type KubeflowStorageKind = 'nfs' | 'azureStorage';

/**
* map from Kubeflow operator name to its plural name in K8S
*/
export const kubeflowOperatorMap : Map<KubeflowOperator, KubeflowOperatorPlural> = new Map<KubeflowOperator, KubeflowOperatorPlural>([
['tf-operator' , 'tfjobs'],
['pytorch-operator', 'pytorchjobs'],
['mxnet-operator', 'mxjobs'],
['caffe2-operator', 'caffe2jobs'],
['chainer-operator', 'chainerjobs'],
['mpi-operator', 'mpijobs']
['pytorch-operator', 'pytorchjobs']
]);

/**
* map from Kubeflow operator name to its job kind name in K8S
*/
export const kubeflowOperatorJobKindMap : Map<KubeflowOperator, KubeflowOperatorJobKind> = new Map<KubeflowOperator, KubeflowOperatorJobKind>([
['tf-operator' , 'TFJob'],
['pytorch-operator', 'PyTorchJob']
]);

/**
* Kuberflow cluster configuration
*
*/
export class KubeflowClusterConfig {
export class KubeflowClusterConfigBase {
/** Name of Kubeflow operator, like tf-operator */
public readonly operator: KubeflowOperator;
public readonly nfs?: NFSConfig;
public readonly keyVault?: keyVaultConfig;
public readonly azureStorage?: AzureStorage;
public readonly storage?: KubeflowStorageKind;

/**
* Constructor
* @param userName User name of Kubeflow Cluster
* @param passWord password of Kubeflow Cluster
* @param host Host IP of Kubeflow Cluster
*/
constructor(operator: KubeflowOperator, nfs?: NFSConfig, keyVault?: keyVaultConfig, azureStorage ?: AzureStorage) {
constructor(operator: KubeflowOperator, storage?: KubeflowStorageKind) {
this.operator = operator;
this.nfs = nfs;
this.storage = storage;
}
}

export class KubeflowClusterConfigNFS extends KubeflowClusterConfigBase{
public readonly nfs: NFSConfig;

constructor(operator: KubeflowOperator, nfs: NFSConfig, storage?: KubeflowStorageKind) {
super(operator, storage)
this.nfs = nfs;
}
}

export class KubeflowClusterConfigAzure extends KubeflowClusterConfigBase{
public readonly keyVault: keyVaultConfig;
public readonly azureStorage: AzureStorage;

constructor(operator: KubeflowOperator, keyVault: keyVaultConfig, azureStorage: AzureStorage, storage?: KubeflowStorageKind) {
super(operator, storage)
this.keyVault = keyVault;
this.azureStorage = azureStorage;
}
Expand Down Expand Up @@ -142,15 +164,33 @@ export class KubeflowTrialConfigTemplate {
}
}

export class KubeflowTrialConfig {
export class KubeflowTrialConfigBase {
public readonly codeDir: string;

constructor(codeDir: string) {
this.codeDir = codeDir;
}
}

export class KubeflowTrialConfigTensorflow extends KubeflowTrialConfigBase{
public readonly ps?: KubeflowTrialConfigTemplate;
public readonly worker: KubeflowTrialConfigTemplate;

constructor(codeDir: string, worker: KubeflowTrialConfigTemplate, ps?: KubeflowTrialConfigTemplate) {
this.codeDir = codeDir;
this.worker = worker;
constructor(codeDir: string, worker: KubeflowTrialConfigTemplate, ps?: KubeflowTrialConfigTemplate) {
super(codeDir);
this.ps = ps;
this.worker = worker;
}
}

export class KubeflowTrialConfigPytorch extends KubeflowTrialConfigBase{
public readonly master?: KubeflowTrialConfigTemplate;
public readonly worker: KubeflowTrialConfigTemplate;

constructor(codeDir: string, worker: KubeflowTrialConfigTemplate, master?: KubeflowTrialConfigTemplate) {
super(codeDir);
this.master = master;
this.worker = worker;
}
}

Loading