Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Add experiment info when launching from Python #3210

Merged
merged 3 commits into from
Dec 21, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion nni/experiment/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from . import launcher
from .pipe import Pipe
from . import rest
from ..tools.nnictl.command_utils import kill_command

nni.runtime.log.init_logger_experiment()
_logger = logging.getLogger('nni.experiment')
Expand Down Expand Up @@ -142,7 +143,7 @@ def stop(self) -> None:
atexit.unregister(self.stop)

if self._proc is not None:
self._proc.kill()
kill_command(self._proc.pid)
if self._pipe is not None:
self._pipe.close()
if self._dispatcher_thread is not None:
Expand Down
14 changes: 11 additions & 3 deletions nni/experiment/launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from . import management
from .pipe import Pipe
from . import rest
from ..tools.nnictl.config_utils import Experiments

_logger = logging.getLogger('nni.experiment')

Expand All @@ -34,13 +35,15 @@ def start_experiment(config: ExperimentConfig, port: int, debug: bool) -> Tuple[
try:
_logger.info('Creating experiment %s%s', colorama.Fore.CYAN, exp_id)
pipe = Pipe(exp_id)
proc = _start_rest_server(config, port, debug, exp_id, pipe.path)
start_time, proc = _start_rest_server(config, port, debug, exp_id, pipe.path)
_logger.info('Connecting IPC pipe...')
pipe_file = pipe.connect()
nni.runtime.protocol._in_file = pipe_file
nni.runtime.protocol._out_file = pipe_file
_logger.info('Statring web server...')
_check_rest_server(port)
_save_experiment_information(exp_id, port, start_time, config.training_service.platform,
config.experiment_name, proc.pid, config.experiment_working_directory)
_logger.info('Setting up...')
_init_experiment(config, port, debug)
return proc, pipe
Expand All @@ -64,7 +67,7 @@ def _ensure_port_idle(port: int, message: Optional[str] = None) -> None:
raise RuntimeError(f'Port {port} is not idle {message}')


def _start_rest_server(config: ExperimentConfig, port: int, debug: bool, experiment_id: str, pipe_path: str) -> Popen:
def _start_rest_server(config: ExperimentConfig, port: int, debug: bool, experiment_id: str, pipe_path: str) -> Tuple[int, Popen]:
ts = config.training_service.platform
if ts == 'openpai':
ts = 'pai'
Expand All @@ -85,7 +88,7 @@ def _start_rest_server(config: ExperimentConfig, port: int, debug: bool, experim
for arg_key, arg_value in args.items():
cmd.append('--' + arg_key)
cmd.append(str(arg_value))
return Popen(cmd, cwd=node_dir)
return int(time.time() * 1000), Popen(cmd, cwd=node_dir)


def _check_rest_server(port: int, retry: int = 3) -> None:
Expand All @@ -103,3 +106,8 @@ def _init_experiment(config: ExperimentConfig, port: int, debug: bool) -> None:
for cluster_metadata in convert.to_cluster_metadata(config):
rest.put(port, '/experiment/cluster-metadata', cluster_metadata)
rest.post(port, '/experiment', convert.to_rest_json(config))


def _save_experiment_information(
        experiment_id: str, port: int, start_time: int, platform: str, name: str, pid: int, logDir: str
) -> None:
    """Record a newly launched experiment through ``Experiments.add_experiment``.

    A fresh ``Experiments`` object is created on every call and all arguments
    are forwarded to it unchanged; ``pid`` and ``logDir`` are passed by keyword.

    NOTE(review): ``Experiments`` is imported from ``..tools.nnictl.config_utils``;
    presumably it is the experiment registry that ``nnictl`` reads -- confirm
    there what is persisted and where.
    """
    Experiments().add_experiment(
        experiment_id,
        port,
        start_time,
        platform,
        name,
        pid=pid,
        logDir=logDir,
    )
4 changes: 2 additions & 2 deletions ts/nni_manager/common/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -434,8 +434,8 @@ function withLockSync(func: Function, filePath: string, lockOpts: {[key: string]
const lockPath = path.join(path.dirname(filePath), path.basename(filePath) + '.lock.*');
const lockFileNames: string[] = glob.sync(lockPath);
const canLock: boolean = lockFileNames.map((fileName) => {
return fs.existsSync(fileName) && Date.now() - fs.statSync(fileName).mtimeMs > lockOpts.stale;
}).filter(isExpired=>isExpired === false).length === 0;
return fs.existsSync(fileName) && Date.now() - fs.statSync(fileName).mtimeMs < lockOpts.stale;
}).filter(unexpired=>unexpired === true).length === 0;
if (!canLock) {
throw new Error('File has been locked.');
}
Expand Down