-
Notifications
You must be signed in to change notification settings - Fork 1.8k
fix gpu script permission issue #1707
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -42,10 +42,10 @@ import { | |
getVersion, uniqueString, unixPathJoin | ||
} from '../../common/utils'; | ||
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData'; | ||
import { GPU_INFO_COLLECTOR_FORMAT_LINUX, GPUSummary } from '../common/gpuData'; | ||
import { GPUSummary } from '../common/gpuData'; | ||
import { TrialConfig } from '../common/trialConfig'; | ||
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; | ||
import { execCopydir, execMkdir, execRemove, validateCodeDir } from '../common/util'; | ||
import { execCopydir, execMkdir, execRemove, validateCodeDir, getGpuMetricsCollectorBashScriptContent } from '../common/util'; | ||
import { GPUScheduler } from './gpuScheduler'; | ||
import { | ||
HOST_JOB_SHELL_FORMAT, RemoteCommandResult, REMOTEMACHINE_TRIAL_COMMAND_FORMAT, RemoteMachineMeta, | ||
|
@@ -334,8 +334,6 @@ class RemoteMachineTrainingService implements TrainingService { | |
break; | ||
case TrialConfigMetadataKey.MACHINE_LIST: | ||
await this.setupConnections(value); | ||
//remove local temp files | ||
await execRemove(this.getLocalGpuMetricCollectorDir()); | ||
break; | ||
case TrialConfigMetadataKey.TRIAL_CONFIG: | ||
const remoteMachineTrailConfig: TrialConfig = <TrialConfig>JSON.parse(value); | ||
|
@@ -428,34 +426,6 @@ class RemoteMachineTrainingService implements TrainingService { | |
return Promise.resolve(); | ||
} | ||
|
||
/** | ||
* Generate gpu metric collector directory to store temp gpu metric collector script files | ||
*/ | ||
private getLocalGpuMetricCollectorDir(): string { | ||
const userName: string = path.basename(os.homedir()); //get current user name of os | ||
|
||
return path.join(os.tmpdir(), userName, 'nni', 'scripts'); | ||
} | ||
|
||
/** | ||
* Generate gpu metric collector shell script in local machine, | ||
* used to run in remote machine, and will be deleted after uploaded from local. | ||
*/ | ||
private async generateGpuMetricsCollectorScript(userName: string): Promise<void> { | ||
const gpuMetricCollectorScriptFolder : string = this.getLocalGpuMetricCollectorDir(); | ||
await execMkdir(path.join(gpuMetricCollectorScriptFolder, userName)); | ||
//generate gpu_metrics_collector.sh | ||
const gpuMetricsCollectorScriptPath: string = path.join(gpuMetricCollectorScriptFolder, userName, 'gpu_metrics_collector.sh'); | ||
// This directory is used to store gpu_metrics and pid created by script | ||
const remoteGPUScriptsDir: string = this.getRemoteScriptsPath(userName); | ||
const gpuMetricsCollectorScriptContent: string = String.Format( | ||
GPU_INFO_COLLECTOR_FORMAT_LINUX, | ||
remoteGPUScriptsDir, | ||
unixPathJoin(remoteGPUScriptsDir, 'pid') | ||
); | ||
await fs.promises.writeFile(gpuMetricsCollectorScriptPath, gpuMetricsCollectorScriptContent, { encoding: 'utf8' }); | ||
} | ||
|
||
private async setupConnections(machineList: string): Promise<void> { | ||
this.log.debug(`Connecting to remote machines: ${machineList}`); | ||
const deferred: Deferred<void> = new Deferred<void>(); | ||
|
@@ -480,23 +450,18 @@ class RemoteMachineTrainingService implements TrainingService { | |
private async initRemoteMachineOnConnected(rmMeta: RemoteMachineMeta, conn: Client): Promise<void> { | ||
// Create root working directory after ssh connection is ready | ||
// generate gpu script in local machine first, will copy to remote machine later | ||
liuzhe-lz marked this conversation as resolved.
Show resolved
Hide resolved
|
||
await this.generateGpuMetricsCollectorScript(rmMeta.username); | ||
const nniRootDir: string = unixPathJoin(getRemoteTmpDir(this.remoteOS), 'nni'); | ||
await SSHClientUtility.remoteExeCommand(`mkdir -p ${this.remoteExpRootDir}`, conn); | ||
|
||
// Copy NNI scripts to remote experiment working directory | ||
const localGpuScriptCollectorDir: string = this.getLocalGpuMetricCollectorDir(); | ||
// the directory to store temp scripts in remote machine | ||
const remoteGpuScriptCollectorDir: string = this.getRemoteScriptsPath(rmMeta.username); | ||
await SSHClientUtility.remoteExeCommand(`mkdir -p ${remoteGpuScriptCollectorDir}`, conn); | ||
await SSHClientUtility.remoteExeCommand(`(umask 0 ; mkdir -p ${remoteGpuScriptCollectorDir})`, conn); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. no need to add umask, because this [inline code lost in extraction] There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Oh, then we are in trouble. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. According to blame, this behaviour was introduced 10 months ago. I would be surprised if a bug could stay hidden for so long. |
||
await SSHClientUtility.remoteExeCommand(`chmod 777 ${nniRootDir} ${nniRootDir}/* ${nniRootDir}/scripts/*`, conn); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. what is [inline code lost in extraction]? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It is the directory NNI uses on the remote machine. Normally it is [inline code lost in extraction]. |
||
//copy gpu_metrics_collector.sh to remote | ||
await SSHClientUtility.copyFileToRemote(path.join(localGpuScriptCollectorDir, rmMeta.username, 'gpu_metrics_collector.sh'), | ||
unixPathJoin(remoteGpuScriptCollectorDir, 'gpu_metrics_collector.sh'), conn); | ||
|
||
//Begin to execute gpu_metrics_collection scripts | ||
// tslint:disable-next-line: no-floating-promises | ||
SSHClientUtility.remoteExeCommand(`bash ${unixPathJoin(remoteGpuScriptCollectorDir, 'gpu_metrics_collector.sh')}`, conn); | ||
const script = getGpuMetricsCollectorBashScriptContent(remoteGpuScriptCollectorDir); | ||
SSHClientUtility.remoteExeCommand(`bash -c '${script}'`, conn); | ||
|
||
const disposable: Rx.IDisposable = this.timer.subscribe( | ||
async (tick: number) => { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
add `umask 0` here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`this.gpuMetricCollectorScriptFolder` is `${os.tmpdir()}/nni/script`; I think it is better to add the username to it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`true` means `share=True`, so `umask` is 0 here. As a magic value, `true` is hard to understand, and I'll add some comments.

This folder is used to store the collected metrics, so it cannot have the user name in it as long as the collector script runs exclusively. Consider that user A starts a collector and it writes metrics to `/tmp/nni/A`; then user B will fail to start another collector, and she will not know where the metrics are located.