Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Commit

Permalink
[DLC]: pai-dlc api update & log folder update (#4909)
Browse files Browse the repository at this point in the history
  • Loading branch information
weidankong authored Jun 13, 2022
1 parent 2815fb1 commit 7afe8a7
Show file tree
Hide file tree
Showing 7 changed files with 60 additions and 41 deletions.
1 change: 1 addition & 0 deletions docs/source/experiment/training_service/paidlc.rst
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ Use ``examples/trials/mnist-pytorch`` as an example. The NNI config YAML file's
podCount: 1
ecsSpec: ecs.c6.large
region: cn-hangzhou
workspaceId: ${your_workspace_id}
accessKeyId: ${your_ak_id}
accessKeySecret: ${your_ak_key}
nasDataSourceId: ${your_nas_data_source_id} # NAS datasource ID, e.g., datat56by9n1xt0a
Expand Down
1 change: 1 addition & 0 deletions examples/trials/mnist-pytorch/config_dlc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ trainingService:
podCount: 1
ecsSpec: ecs.c6.large
region: cn-hangzhou
workspaceId: ${your_workspace_id}
accessKeyId: ${your_ak_id}
accessKeySecret: ${your_ak_key}
nasDataSourceId: ${your_nas_data_source_id} # NAS datasource ID, e.g., datat56by9n1xt0a
Expand Down
1 change: 1 addition & 0 deletions nni/experiment/config/training_services/dlc.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ class DlcConfig(TrainingServiceConfig):
pod_count: int
ecs_spec: str # e.g.,'ecs.c6.large'
region: str
workspace_id: str
nas_data_source_id: str
oss_data_source_id: Optional[str] = None
access_key_id: str
Expand Down
1 change: 1 addition & 0 deletions ts/nni_manager/common/experimentConfig.ts
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ export interface DlcConfig extends TrainingServiceConfig {
podCount: number;
ecsSpec: string;
region: string;
workspaceId: string;
nasDataSourceId: string;
ossDataSourceId?: string;
accessKeyId: string;
Expand Down
89 changes: 49 additions & 40 deletions ts/nni_manager/config/dlc/dlcUtil.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
# Licensed under the MIT license.


import logging
import os
import pathlib
import sys
import traceback
from argparse import ArgumentParser
Expand All @@ -19,6 +21,7 @@
parser.add_argument('--pod_count', type=int, default=1, help='pod count')
parser.add_argument('--ecs_spec', help='ecs spec')
parser.add_argument('--region', help='region')
parser.add_argument('--workspace_id', help='workspace id for your project')
parser.add_argument('--nas_data_source_id', help='nas data_source_id of DLC dataset configuration')
parser.add_argument('--oss_data_source_id', help='oss data_source_id of DLC dataset configuration')
parser.add_argument('--access_key_id', help='access_key_id')
Expand All @@ -28,49 +31,56 @@
parser.add_argument('--log_dir', help='exception log dir')
args = parser.parse_args()

# init client
client = Client(
Config(
access_key_id=args.access_key_id,
access_key_secret=args.access_key_secret,
region_id=args.region,
endpoint=f'pai-dlc.{args.region}.aliyuncs.com'
)
)
pathlib.Path(args.log_dir).mkdir(parents=True, exist_ok=True)
logging.basicConfig(filename=os.path.join(args.log_dir, 'dlc_exception.log'),
format='%(asctime)s %(message)s',
level=logging.INFO)

nas_1 = DataSourceItem(
data_source_type='nas',
data_source_id=args.nas_data_source_id,
)
# DLC submit
try:

oss = None
if args.oss_data_source_id:
oss = DataSourceItem(
data_source_type='oss',
data_source_id=args.oss_data_source_id,
# init client
client = Client(
Config(
access_key_id=args.access_key_id,
access_key_secret=args.access_key_secret,
region_id=args.region,
endpoint=f'pai-dlc.{args.region}.aliyuncs.com'
)
)

# job spec
spec = JobSpec(
type=args.type,
image=args.image,
pod_count=args.pod_count,
ecs_spec=args.ecs_spec,
)
nas_1 = DataSourceItem(
data_source_type='nas',
data_source_id=args.nas_data_source_id,
)

data_sources = [nas_1]
if oss:
data_sources = [nas_1, oss]
req = CreateJobRequest(
display_name=args.experiment_name,
job_type=args.job_type,
job_specs=[spec],
data_sources=data_sources,
user_command=args.user_command
)
oss = None
if args.oss_data_source_id:
oss = DataSourceItem(
data_source_type='oss',
data_source_id=args.oss_data_source_id,
)

# job spec
spec = JobSpec(
type=args.type,
image=args.image,
pod_count=args.pod_count,
ecs_spec=args.ecs_spec,
)

data_sources = [nas_1]
if oss:
data_sources = [nas_1, oss]
req = CreateJobRequest(
display_name=args.experiment_name,
job_type=args.job_type,
job_specs=[spec],
data_sources=data_sources,
user_command=args.user_command,
workspace_id=args.workspace_id,
)

# DLC submit
try:
response = client.create_job(req)
job_id = response.body.job_id
print('job id: ' + job_id)
Expand All @@ -86,6 +96,5 @@
client.stop_job(job_id)
exit(0)
except Exception as e:
with open(os.path.join(args.log_dir, 'dlc_exception.log'), 'w') as f:
f.write('DLC submit Exception: \n')
traceback.print_exc(file=f)
logging.error('DLC submit Exception: \n')
logging.error(e, exc_info=1)
4 changes: 4 additions & 0 deletions ts/nni_manager/training_service/reusable/dlc/dlcClient.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ export class DlcClient {
public podCount: number;
public ecsSpec: string;
public region: string;
public workspaceId: string;
// e.g., data1e6vg1tu0zi7. To generate it, go to the 'Dataset Config' page of DLC,
// create a NAS data source, and copy its 'DataSet ConfigurationID'.
public nasDataSourceId: string;
Expand All @@ -35,6 +36,7 @@ export class DlcClient {
environmentId: string,
ecsSpec: string,
region: string,
workspaceId: string,
nasDataSourceId: string,
accessKeyId: string,
accessKeySecret: string,
Expand All @@ -50,6 +52,7 @@ export class DlcClient {
this.ecsSpec = ecsSpec;
this.image = image;
this.region = region;
this.workspaceId = workspaceId;
this.nasDataSourceId = nasDataSourceId;
if (ossDataSourceId !== undefined) {
this.ossDataSourceId = ossDataSourceId;
Expand Down Expand Up @@ -77,6 +80,7 @@ export class DlcClient {
'--pod_count', String(this.podCount),
'--ecs_spec', this.ecsSpec,
'--region', this.region,
'--workspace_id', this.workspaceId,
'--nas_data_source_id', this.nasDataSourceId,
'--oss_data_source_id', this.ossDataSourceId,
'--access_key_id', this.accessKeyId,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import { FileCommandChannel } from '../channels/fileCommandChannel';
import { MountedStorageService } from '../storages/mountedStorageService';
import { Scope } from 'typescript-ioc';
import { StorageService } from '../storageService';
import { getLogDir } from 'common/utils';

/**
* Collect DLC job info from the DLC cluster and update the DLC job status locally
Expand Down Expand Up @@ -112,11 +113,12 @@ export class DlcEnvironmentService extends EnvironmentService {
environment.id,
this.config.ecsSpec,
this.config.region,
this.config.workspaceId,
this.config.nasDataSourceId,
this.config.accessKeyId,
this.config.accessKeySecret,
environment.command,
dlcEnvironment.workingFolder,
path.join(getLogDir(), `envs/${environment.id}`),
this.config.ossDataSourceId,
);

Expand Down

0 comments on commit 7afe8a7

Please sign in to comment.