Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Commit

Permalink
Add experiment info when using Python launch (#3210)
Browse files Browse the repository at this point in the history
* add experiment info in python launch

* fix type hint
  • Loading branch information
J-shang authored Dec 21, 2020
1 parent e020814 commit 5f0ac59
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 6 deletions.
3 changes: 2 additions & 1 deletion nni/experiment/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from . import launcher
from .pipe import Pipe
from . import rest
from ..tools.nnictl.command_utils import kill_command

nni.runtime.log.init_logger_experiment()
_logger = logging.getLogger('nni.experiment')
Expand Down Expand Up @@ -142,7 +143,7 @@ def stop(self) -> None:
atexit.unregister(self.stop)

if self._proc is not None:
self._proc.kill()
kill_command(self._proc.pid)
if self._pipe is not None:
self._pipe.close()
if self._dispatcher_thread is not None:
Expand Down
14 changes: 11 additions & 3 deletions nni/experiment/launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from . import management
from .pipe import Pipe
from . import rest
from ..tools.nnictl.config_utils import Experiments

_logger = logging.getLogger('nni.experiment')

Expand All @@ -34,13 +35,15 @@ def start_experiment(config: ExperimentConfig, port: int, debug: bool) -> Tuple[
try:
_logger.info('Creating experiment %s%s', colorama.Fore.CYAN, exp_id)
pipe = Pipe(exp_id)
proc = _start_rest_server(config, port, debug, exp_id, pipe.path)
start_time, proc = _start_rest_server(config, port, debug, exp_id, pipe.path)
_logger.info('Connecting IPC pipe...')
pipe_file = pipe.connect()
nni.runtime.protocol._in_file = pipe_file
nni.runtime.protocol._out_file = pipe_file
_logger.info('Statring web server...')
_check_rest_server(port)
_save_experiment_information(exp_id, port, start_time, config.training_service.platform,
config.experiment_name, proc.pid, config.experiment_working_directory)
_logger.info('Setting up...')
_init_experiment(config, port, debug)
return proc, pipe
Expand All @@ -64,7 +67,7 @@ def _ensure_port_idle(port: int, message: Optional[str] = None) -> None:
raise RuntimeError(f'Port {port} is not idle {message}')


def _start_rest_server(config: ExperimentConfig, port: int, debug: bool, experiment_id: str, pipe_path: str) -> Popen:
def _start_rest_server(config: ExperimentConfig, port: int, debug: bool, experiment_id: str, pipe_path: str) -> Tuple[int, Popen]:
ts = config.training_service.platform
if ts == 'openpai':
ts = 'pai'
Expand All @@ -85,7 +88,7 @@ def _start_rest_server(config: ExperimentConfig, port: int, debug: bool, experim
for arg_key, arg_value in args.items():
cmd.append('--' + arg_key)
cmd.append(str(arg_value))
return Popen(cmd, cwd=node_dir)
return int(time.time() * 1000), Popen(cmd, cwd=node_dir)


def _check_rest_server(port: int, retry: int = 3) -> None:
Expand All @@ -103,3 +106,8 @@ def _init_experiment(config: ExperimentConfig, port: int, debug: bool) -> None:
for cluster_metadata in convert.to_cluster_metadata(config):
rest.put(port, '/experiment/cluster-metadata', cluster_metadata)
rest.post(port, '/experiment', convert.to_rest_json(config))


def _save_experiment_information(
        experiment_id: str,
        port: int,
        start_time: int,
        platform: str,
        name: str,
        pid: int,
        logDir: str) -> None:
    """Record a Python-launched experiment through ``Experiments.add_experiment``.

    Builds a fresh ``Experiments`` object (imported from
    ``..tools.nnictl.config_utils``) and forwards every argument to its
    ``add_experiment`` method; nothing is returned.

    Parameters
    ----------
    experiment_id
        Identifier of the experiment being registered.
    port
        Port the experiment's REST server was started on.
    start_time
        Launch timestamp; the caller passes ``int(time.time() * 1000)``,
        i.e. milliseconds since the epoch.
    platform
        Training service platform name taken from the experiment config.
    name
        Experiment name taken from the experiment config.
    pid
        Process id of the REST server process.
    logDir
        The caller passes the experiment working directory from the config.
    """
    # NOTE(review): presumably this persists the entry to the record that
    # nnictl reads -- confirm against Experiments.add_experiment.
    Experiments().add_experiment(
        experiment_id, port, start_time, platform, name,
        pid=pid, logDir=logDir,
    )
4 changes: 2 additions & 2 deletions ts/nni_manager/common/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -434,8 +434,8 @@ function withLockSync(func: Function, filePath: string, lockOpts: {[key: string]
const lockPath = path.join(path.dirname(filePath), path.basename(filePath) + '.lock.*');
const lockFileNames: string[] = glob.sync(lockPath);
const canLock: boolean = lockFileNames.map((fileName) => {
return fs.existsSync(fileName) && Date.now() - fs.statSync(fileName).mtimeMs > lockOpts.stale;
}).filter(isExpired=>isExpired === false).length === 0;
return fs.existsSync(fileName) && Date.now() - fs.statSync(fileName).mtimeMs < lockOpts.stale;
}).filter(unexpired=>unexpired === true).length === 0;
if (!canLock) {
throw new Error('File has been locked.');
}
Expand Down

0 comments on commit 5f0ac59

Please sign in to comment.