Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Support kubeflow reuse mode #3919

Merged
merged 48 commits into from
Jul 26, 2021
Merged
Show file tree
Hide file tree
Changes from 47 commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
dcd2ffd
Merge pull request #251 from microsoft/master
SparkSnail May 29, 2020
3b8b6fb
Merge pull request #252 from microsoft/master
SparkSnail Jun 7, 2020
916e444
Merge pull request #253 from microsoft/master
SparkSnail Jun 15, 2020
caeffb8
Merge pull request #254 from microsoft/master
SparkSnail Jun 17, 2020
57c300e
Merge pull request #255 from microsoft/master
SparkSnail Jun 28, 2020
65660e6
Merge pull request #257 from microsoft/master
SparkSnail Jun 30, 2020
9376d6a
Merge pull request #258 from microsoft/master
SparkSnail Jul 1, 2020
5fef3cf
Merge pull request #259 from microsoft/master
SparkSnail Jul 3, 2020
5544ae8
Merge pull request #261 from microsoft/master
SparkSnail Jul 10, 2020
f9fdfee
Merge pull request #262 from microsoft/master
SparkSnail Jul 16, 2020
aa64fe6
Merge pull request #263 from microsoft/master
SparkSnail Jul 27, 2020
c6a5f8c
Merge pull request #264 from microsoft/master
SparkSnail Jul 31, 2020
68abe2f
Merge pull request #265 from microsoft/master
SparkSnail Aug 4, 2020
14e9619
Merge pull request #266 from microsoft/master
SparkSnail Aug 13, 2020
f69e206
Merge pull request #267 from microsoft/master
SparkSnail Aug 13, 2020
12ef0aa
Merge pull request #270 from microsoft/master
SparkSnail Sep 10, 2020
ddcf229
Merge pull request #271 from microsoft/master
SparkSnail Sep 15, 2020
c4f6e66
Merge pull request #272 from microsoft/master
SparkSnail Sep 21, 2020
88f8c1b
Merge pull request #273 from microsoft/master
SparkSnail Sep 22, 2020
7eb15f8
Merge pull request #274 from microsoft/master
SparkSnail Oct 27, 2020
f73367f
Merge pull request #275 from microsoft/master
SparkSnail Nov 16, 2020
765bc33
Merge pull request #276 from microsoft/master
SparkSnail Nov 29, 2020
cff51cc
Merge pull request #277 from microsoft/master
SparkSnail Dec 2, 2020
4232fea
Merge pull request #278 from microsoft/master
SparkSnail Dec 8, 2020
cb9efcc
Merge pull request #279 from microsoft/master
SparkSnail Dec 11, 2020
ee71f16
Merge pull request #280 from microsoft/master
SparkSnail Dec 14, 2020
c3921ed
Merge pull request #281 from microsoft/master
SparkSnail Dec 24, 2020
561f1ad
Merge pull request #284 from microsoft/master
SparkSnail Jan 22, 2021
daf028a
Merge pull request #285 from microsoft/master
SparkSnail Feb 5, 2021
9a8a4a3
Merge pull request #286 from microsoft/master
SparkSnail Feb 8, 2021
22a38dd
Merge pull request #287 from microsoft/master
SparkSnail Feb 23, 2021
645e1a6
Merge pull request #288 from microsoft/master
SparkSnail Feb 24, 2021
f41c25d
Merge pull request #289 from microsoft/master
SparkSnail Feb 25, 2021
9fb5ff9
Merge pull request #290 from microsoft/master
SparkSnail Mar 4, 2021
e3fab14
Merge pull request #291 from microsoft/master
SparkSnail Mar 23, 2021
ad26f40
Merge pull request #292 from microsoft/master
SparkSnail Apr 12, 2021
ad78613
Merge pull request #294 from microsoft/master
SparkSnail Apr 30, 2021
5453841
Merge pull request #295 from microsoft/master
SparkSnail May 7, 2021
09f977e
Merge pull request #296 from microsoft/master
SparkSnail May 19, 2021
fdb2d77
set version check as warning
SparkSnail May 19, 2021
5190f5a
Merge pull request #297 from microsoft/master
SparkSnail May 24, 2021
805e773
Merge pull request #298 from microsoft/master
SparkSnail May 26, 2021
f9dbdb4
Merge pull request #299 from microsoft/master
SparkSnail Jun 23, 2021
437b020
Merge pull request #300 from microsoft/master
SparkSnail Jul 6, 2021
899d231
add doc
SparkSnail Jul 8, 2021
3605174
Merge pull request #301 from microsoft/master
SparkSnail Jul 23, 2021
9e31b42
fix conflict
SparkSnail Jul 23, 2021
b5a435f
remove unused file
SparkSnail Jul 26, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions docs/en_US/TrainingService/KubeflowMode.rst
Original file line number Diff line number Diff line change
Expand Up @@ -253,3 +253,41 @@ version check
NNI has supported the version check feature since version 0.6; `refer <PaiMode.rst>`__ to the PAI mode document for details.

If you encounter any problems when using NNI in Kubeflow mode, please create an issue on the `NNI Github repo <https://github.com/Microsoft/nni>`__.


Kubeflow reuse mode
----------------------
NNI supports a reuse mode for trial jobs. In reuse mode, NNI submits a long-running trial runner process that occupies the container, and starts each trial job as a subprocess of that trial runner process. This means Kubernetes does not need to schedule a new container for every trial; it simply reuses the existing container.
Currently, Kubeflow reuse mode only supports the V2 config.
Here is an example:

.. code-block:: yaml

searchSpaceFile: search_space.json
trialCommand: python3 mnist.py
trialGpuNumber: 0
trialConcurrency: 4
maxTrialNumber: 20
tuner:
name: TPE
classArgs:
optimize_mode: maximize
trainingService:
reuseMode: true
platform: kubeflow
worker:
command: python3 mnist.py
code_directory: .
dockerImage: msranni/nni
cpuNumber: 1
gpuNumber: 0
memorySize: 8192
replicas: 1
operator: tf-operator
storage:
storageType: azureStorage
azureAccount: {your_account}
azureShare: {your_share}
keyVaultName: {your_vault_name}
keyVaultKey: {your_vault_key}
apiVersion: v1
34 changes: 19 additions & 15 deletions nni/experiment/config/kubeflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,21 @@
from .common import TrainingServiceConfig
from . import util

__all__ = ['KubeflowConfig', 'KubeflowRoleConfig', 'KubeflowNfsConfig', 'KubeflowAzureStorageConfig']
__all__ = ['KubeflowConfig', 'KubeflowRoleConfig', 'KubeflowStorageConfig', 'KubeflowNfsConfig', 'KubeflowAzureStorageConfig']


@dataclass(init=False)
class _KubeflowStorageConfig(ConfigBase):
storage: str
class KubeflowStorageConfig(ConfigBase):
storage_type: str
server: Optional[str] = None
path: Optional[str] = None
azure_account: Optional[str] = None
azure_share: Optional[str] = None
key_vault: Optional[str] = None
key_vault_secret: Optional[str] = None
key_vault_name: Optional[str] = None
key_vault_key: Optional[str] = None

@dataclass(init=False)
class KubeflowNfsConfig(_KubeflowStorageConfig):
class KubeflowNfsConfig(KubeflowStorageConfig):
Copy link
Contributor

@liuzhe-lz liuzhe-lz Jul 19, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't reusable ES use storage service?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Only the OpenPAI environment service supports the storage service.

storage: str = 'nfs'
server: str
path: str
Expand All @@ -32,37 +32,41 @@ class KubeflowAzureStorageConfig(ConfigBase):
storage: str = 'azureStorage'
azure_account: str
azure_share: str
key_vault: str
key_vault_secret: str
key_vault_name: str
key_vault_key: str


@dataclass(init=False)
class KubeflowRoleConfig(ConfigBase):
replicas: int
command: str
gpu_number: int
gpu_number: Optional[int] = 0
cpu_number: int
memory_size: str
docker_image: str = 'msranni/nni:latest'
code_directory: str


@dataclass(init=False)
class KubeflowConfig(TrainingServiceConfig):
platform: str = 'kubeflow'
operator: str
api_version: str
storage: _KubeflowStorageConfig
worker: KubeflowRoleConfig
parameter_server: Optional[KubeflowRoleConfig] = None
storage: KubeflowStorageConfig
worker: Optional[KubeflowRoleConfig] = None
ps: Optional[KubeflowRoleConfig] = None
master: Optional[KubeflowRoleConfig] = None
reuse_mode: Optional[bool] = True #set reuse mode as true for v2 config

def __init__(self, **kwargs):
kwargs = util.case_insensitive(kwargs)
kwargs['storage'] = util.load_config(_KubeflowStorageConfig, kwargs.get('storage'))
kwargs['storage'] = util.load_config(KubeflowStorageConfig, kwargs.get('storage'))
kwargs['worker'] = util.load_config(KubeflowRoleConfig, kwargs.get('worker'))
kwargs['parameterserver'] = util.load_config(KubeflowRoleConfig, kwargs.get('parameterserver'))
kwargs['ps'] = util.load_config(KubeflowRoleConfig, kwargs.get('ps'))
kwargs['master'] = util.load_config(KubeflowRoleConfig, kwargs.get('master'))
super().__init__(**kwargs)

_validation_rules = {
'platform': lambda value: (value == 'kubeflow', 'cannot be modified'),
'operator': lambda value: value in ['tf-operator', 'pytorch-operator']
}
}
30 changes: 18 additions & 12 deletions ts/nni_manager/common/experimentConfig.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@

import * as assert from 'assert';

import { KubeflowOperator, OperatorApiVersion } from '../training_service/kubernetes/kubeflow/kubeflowConfig'
import { KubernetesStorageKind } from '../training_service/kubernetes/kubernetesConfig';

export interface TrainingServiceConfig {
platform: string;
}
Expand Down Expand Up @@ -68,35 +71,38 @@ export interface AmlConfig extends TrainingServiceConfig {
maxTrialNumberPerGpu: number;
}

/* Kubeflow */

// FIXME: merge with shared storage config
export interface KubeflowStorageConfig {
storage: string;
storageType: string;
maxTrialNumberPerGpu?: number;
server?: string;
path?: string;
azureAccount?: string;
azureShare?: string;
keyVault?: string;
keyVaultSecret?: string;
keyVaultName?: string;
keyVaultKey?: string;
}

export interface KubeflowRoleConfig {
replicas: number;
codeDirectory: string;
command: string;
gpuNumber: number;
cpuNumber: number;
memorySize: string;
memorySize: number;
dockerImage: string;
privateRegistryAuthPath?: string;
}

export interface KubeflowConfig extends TrainingServiceConfig {
platform: 'kubeflow';
operator: string;
apiVersion: string;
ps?: KubeflowRoleConfig;
master?: KubeflowRoleConfig;
worker?: KubeflowRoleConfig;
maxTrialNumberPerGpu: number;
operator: KubeflowOperator;
apiVersion: OperatorApiVersion;
storage: KubeflowStorageConfig;
worker: KubeflowRoleConfig;
parameterServer?: KubeflowRoleConfig;
reuseMode: boolean;
}

/* FrameworkController */
Expand Down Expand Up @@ -221,4 +227,4 @@ export function flattenConfig<T>(config: ExperimentConfig, platform: string): T
Object.assign(flattened, config.trainingService);
}
return <T>flattened;
}
}
3 changes: 0 additions & 3 deletions ts/nni_manager/core/nnimanager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -453,9 +453,6 @@ class NNIManager implements Manager {
if (platform === 'local') {
const module_ = await import('../training_service/local/localTrainingService');
return new module_.LocalTrainingService(config);
} else if (platform === 'kubeflow') {
const module_ = await import('../training_service/kubernetes/kubeflow/kubeflowTrainingService');
return new module_.KubeflowTrainingService();
} else if (platform === 'frameworkcontroller') {
const module_ = await import('../training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService');
return new module_.FrameworkControllerTrainingService();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import { AMLEnvironmentService } from './amlEnvironmentService';
import { OpenPaiEnvironmentService } from './openPaiEnvironmentService';
import { LocalEnvironmentService } from './localEnvironmentService';
import { RemoteEnvironmentService } from './remoteEnvironmentService';
import { KubeflowEnvironmentService } from './kubernetes/kubeflowEnvironmentService';
import { EnvironmentService } from '../environment';
import { ExperimentConfig } from '../../../common/experimentConfig';
import { ExperimentStartupInfo } from '../../../common/experimentStartupInfo';
Expand All @@ -20,6 +21,8 @@ export async function createEnvironmentService(name: string, config: ExperimentC
return new AMLEnvironmentService(config, info);
case 'openpai':
return new OpenPaiEnvironmentService(config, info);
case 'kubeflow':
return new KubeflowEnvironmentService(config, info);
}

const esConfig = await getCustomEnvironmentServiceConfig(name);
Expand All @@ -29,4 +32,4 @@ export async function createEnvironmentService(name: string, config: ExperimentC
const esModule = importModule(esConfig.nodeModulePath);
const esClass = esModule[esConfig.nodeClassName] as any;
return new esClass(config, info);
}
}
Loading