diff --git a/Makefile b/Makefile index b26f2c95df..959f6cfc94 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # Setting variables SHELL := /bin/bash -PIP_INSTALL := python3 -m pip install --no-cache-dir +PIP_INSTALL := python3 -m pip install PIP_UNINSTALL := python3 -m pip uninstall ## Colorful output @@ -19,22 +19,25 @@ else endif ## Install directories -ROOT_FOLDER ?= $(shell python3 -c 'import site; from pathlib import Path; print(Path(site.getsitepackages()[0]).parents[2])') -IS_SYS_PYTHON ?= $(shell [[ $(ROOT_FOLDER) == /usr* || $(ROOT_FOLDER) == /Library* ]] && echo TRUE || echo FALSE) - -ifeq ($(shell id -u), 0) # is root - _ROOT := 1 - BASH_COMP_PREFIX ?= /usr/share/bash-completion/completions -else # is normal user - ifeq (TRUE, $(IS_SYS_PYTHON)) - ROOT_FOLDER := $(shell python3 -c 'import site; from pathlib import Path; print(Path(site.getusersitepackages()).parents[2])') - endif - ifndef VIRTUAL_ENV - ifeq (, $(shell echo $$PATH | grep 'conda')) + +## For apt-get or pip installed virtualenv +ifdef VIRTUAL_ENV + ROOT_FOLDER ?= $(VIRTUAL_ENV) + BASH_COMP_PREFIX ?= ${HOME}/.bash_completion.d +else + ROOT_FOLDER ?= $(shell python3 -c 'import site; from pathlib import Path; print(Path(site.getsitepackages()[0]).parents[2])') + IS_SYS_PYTHON ?= $(shell [[ $(ROOT_FOLDER) == /usr* || $(ROOT_FOLDER) == /Library* ]] && echo TRUE || echo FALSE) + + ifeq ($(shell id -u), 0) # is root + _ROOT := 1 + BASH_COMP_PREFIX ?= /usr/share/bash-completion/completions + else # is normal user + ifeq (TRUE, $(IS_SYS_PYTHON)) + ROOT_FOLDER := $(shell python3 -c 'import site; from pathlib import Path; print(Path(site.getusersitepackages()).parents[2])') PIP_MODE ?= --user endif + BASH_COMP_PREFIX ?= ${HOME}/.bash_completion.d endif - BASH_COMP_PREFIX ?= ${HOME}/.bash_completion.d endif BASH_COMP_SCRIPT := $(BASH_COMP_PREFIX)/nnictl diff --git a/README.md b/README.md index 4033bdcf60..63c7090f87 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ ----------- -[![MIT 
licensed](https://img.shields.io/badge/license-MIT-yellow.svg)](https://github.com/Microsoft/nni/blob/master/LICENSE) +[![MIT licensed](https://img.shields.io/badge/license-MIT-brightgreen.svg)](https://github.com/Microsoft/nni/blob/master/LICENSE) [![Build Status](https://msrasrg.visualstudio.com/NNIOpenSource/_apis/build/status/Microsoft.nni)](https://msrasrg.visualstudio.com/NNIOpenSource/_build/latest?definitionId=6) [![Issues](https://img.shields.io/github/issues-raw/Microsoft/nni.svg)](https://github.com/Microsoft/nni/issues?q=is%3Aissue+is%3Aopen) [![Bugs](https://img.shields.io/github/issues/Microsoft/nni/bug.svg)](https://github.com/Microsoft/nni/issues?q=is%3Aissue+is%3Aopen+label%3Abug) diff --git a/docs/ExperimentConfig.md b/docs/ExperimentConfig.md index 8d98f413a9..1b9bdc6220 100644 --- a/docs/ExperimentConfig.md +++ b/docs/ExperimentConfig.md @@ -177,8 +177,18 @@ machineList: __nniManagerIp__ set the IP address of the machine on which nni manager process runs. This field is optional, and if it's not set, eth0 device IP will be used instead. Note: run ifconfig on NNI manager's machine to check if eth0 device exists. If not, we recommend to set nnimanagerIp explicitly. - - + +* __logDir__ + * Description + + __logDir__ configures the directory to store logs and data of the experiment. The default value is `<user home directory>/nni/experiments` + +* __logLevel__ + * Description + + __logLevel__ sets log level for the experiment, available log levels are: `trace, debug, info, warning, error, fatal`. The default value is `info`. 
+ + * __tuner__ * Description diff --git a/docs/NNICTLDOC.md b/docs/NNICTLDOC.md index 5016ecef7d..ac2495d3d8 100644 --- a/docs/NNICTLDOC.md +++ b/docs/NNICTLDOC.md @@ -43,6 +43,7 @@ nnictl --version | ------ | ------ | ------ |------ | | --config, -c| True| |yaml configure file of the experiment| | --port, -p | False| |the port of restful server| + | --debug, -d | False| |Set log level to debug| * __nnictl resume__ @@ -62,6 +63,7 @@ nnictl --version | ------ | ------ | ------ |------ | | id| False| |The id of the experiment you want to resume| | --port, -p| False| |Rest port of the experiment you want to resume| + | --debug, -d | False| |Set log level to debug| * __nnictl stop__ * Description diff --git a/src/nni_manager/common/experimentStartupInfo.ts b/src/nni_manager/common/experimentStartupInfo.ts index 7e8c9e0307..c0c0c7a2a7 100644 --- a/src/nni_manager/common/experimentStartupInfo.ts +++ b/src/nni_manager/common/experimentStartupInfo.ts @@ -20,6 +20,8 @@ 'use strict'; import * as assert from 'assert'; +import * as os from 'os'; +import * as path from 'path'; import * as component from '../common/component'; @component.Singleton @@ -29,8 +31,10 @@ class ExperimentStartupInfo { private basePort: number = -1; private initialized: boolean = false; private initTrialSequenceID: number = 0; + private logDir: string = ''; + private logLevel: string = ''; - public setStartupInfo(newExperiment: boolean, experimentId: string, basePort: number): void { + public setStartupInfo(newExperiment: boolean, experimentId: string, basePort: number, logDir?: string, logLevel?: string): void { assert(!this.initialized); assert(experimentId.trim().length > 0); @@ -38,6 +42,16 @@ class ExperimentStartupInfo { this.experimentId = experimentId; this.basePort = basePort; this.initialized = true; + + if (logDir !== undefined && logDir.length > 0) { + this.logDir = path.join(logDir, getExperimentId()); + } else { + this.logDir = path.join(os.homedir(), 'nni', 'experiments', 
getExperimentId()); + } + + if (logLevel !== undefined && logLevel.length > 1) { + this.logLevel = logLevel; + } } public getExperimentId(): string { @@ -58,6 +72,18 @@ class ExperimentStartupInfo { return this.newExperiment; } + public getLogDir(): string { + assert(this.initialized); + + return this.logDir; + } + + public getLogLevel(): string { + assert(this.initialized); + + return this.logLevel; + } + public setInitTrialSequenceId(initSequenceId: number): void { assert(this.initialized); this.initTrialSequenceID = initSequenceId; @@ -90,9 +116,15 @@ function getInitTrialSequenceId(): number { return component.get(ExperimentStartupInfo).getInitTrialSequenceId(); } -function setExperimentStartupInfo(newExperiment: boolean, experimentId: string, basePort: number): void { - component.get(ExperimentStartupInfo).setStartupInfo(newExperiment, experimentId, basePort); +function getExperimentStartupInfo(): ExperimentStartupInfo { + return component.get(ExperimentStartupInfo); +} + +function setExperimentStartupInfo( + newExperiment: boolean, experimentId: string, basePort: number, logDir?: string, logLevel?: string): void { + component.get(ExperimentStartupInfo) + .setStartupInfo(newExperiment, experimentId, basePort, logDir, logLevel); } -export { ExperimentStartupInfo, getBasePort, getExperimentId, isNewExperiment, +export { ExperimentStartupInfo, getBasePort, getExperimentId, isNewExperiment, getExperimentStartupInfo, setExperimentStartupInfo, setInitTrialSequenceId, getInitTrialSequenceId }; diff --git a/src/nni_manager/common/log.ts b/src/nni_manager/common/log.ts index 5120145f21..0a2bfb483a 100644 --- a/src/nni_manager/common/log.ts +++ b/src/nni_manager/common/log.ts @@ -26,13 +26,18 @@ import { Writable } from 'stream'; import { WritableStreamBuffer } from 'stream-buffers'; import { format } from 'util'; import * as component from '../common/component'; +import { getExperimentStartupInfo } from './experimentStartupInfo'; import { getLogDir } from './utils'; 
-const CRITICAL: number = 1; +const FATAL: number = 1; const ERROR: number = 2; const WARNING: number = 3; const INFO: number = 4; const DEBUG: number = 5; +const TRACE: number = 6; + +const logLevelNameMap: Map<string, number> = new Map([['fatal', FATAL], + ['error', ERROR], ['warning', WARNING], ['info', INFO], ['debug', DEBUG], ['trace', TRACE]]); /* level name -> numeric severity; a larger number is more verbose */ class BufferSerialEmitter { private buffer: Buffer; @@ -83,12 +88,25 @@ class Logger { autoClose: true }); this.bufferSerialEmitter = new BufferSerialEmitter(this.writable); + + const logLevelName: string = getExperimentStartupInfo() + .getLogLevel(); + const logLevel: number | undefined = logLevelNameMap.get(logLevelName); + if (logLevel !== undefined) { + this.level = logLevel; + } } public close() { this.writable.destroy(); } + public trace(...param: any[]): void { + if (this.level >= TRACE) { + this.log('TRACE', param); + } + } + public debug(...param: any[]): void { if (this.level >= DEBUG) { this.log('DEBUG', param); @@ -113,8 +131,8 @@ class Logger { } } - public critical(...param: any[]): void { - this.log('CRITICAL', param); + public fatal(...param: any[]): void { + this.log('FATAL', param); } private log(level: string, param: any[]): void { diff --git a/src/nni_manager/common/utils.ts b/src/nni_manager/common/utils.ts index 3d4681c841..e5d8cee781 100644 --- a/src/nni_manager/common/utils.ts +++ b/src/nni_manager/common/utils.ts @@ -30,13 +30,14 @@ import { Container } from 'typescript-ioc'; import * as util from 'util'; import { Database, DataStore } from './datastore'; -import { ExperimentStartupInfo, getExperimentId, setExperimentStartupInfo } from './experimentStartupInfo'; +import { ExperimentStartupInfo, getExperimentId, getExperimentStartupInfo, setExperimentStartupInfo } from './experimentStartupInfo'; import { Manager } from './manager'; import { HyperParameters, TrainingService, TrialJobStatus } from './trainingService'; import { getLogger } from './log'; function getExperimentRootDir(): string { - return 
path.join(os.homedir(), 'nni', 'experiments', getExperimentId()); + return getExperimentStartupInfo() + .getLogDir(); } function getLogDir(): string{ diff --git a/src/nni_manager/core/nnimanager.ts b/src/nni_manager/core/nnimanager.ts index 1c11011b2d..b4f8559859 100644 --- a/src/nni_manager/core/nnimanager.ts +++ b/src/nni_manager/core/nnimanager.ts @@ -35,7 +35,7 @@ import { import { TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, TrialJobStatus } from '../common/trainingService'; -import { delay, getCheckpointDir, getLogDir, getMsgDispatcherCommand, mkDirP } from '../common/utils'; +import { delay, getCheckpointDir, getExperimentRootDir, getLogDir, getMsgDispatcherCommand, mkDirP } from '../common/utils'; import { ADD_CUSTOMIZED_TRIAL_JOB, INITIALIZE, INITIALIZED, KILL_TRIAL_JOB, NEW_TRIAL_JOB, NO_MORE_TRIAL_JOBS, PING, REPORT_METRIC_DATA, REQUEST_TRIAL_JOBS, SEND_TRIAL_JOB_PARAMETER, TERMINATE, TRIAL_END, UPDATE_SEARCH_SPACE @@ -670,7 +670,7 @@ class NNIManager implements Manager { id: getExperimentId(), revision: 0, execDuration: 0, - logDir: getLogDir(), + logDir: getExperimentRootDir(), maxSequenceId: 0, params: { authorName: '', diff --git a/src/nni_manager/main.ts b/src/nni_manager/main.ts index 414693db57..31e0255859 100644 --- a/src/nni_manager/main.ts +++ b/src/nni_manager/main.ts @@ -22,6 +22,7 @@ import { Container, Scope } from 'typescript-ioc'; import * as component from './common/component'; +import * as fs from 'fs'; import { Database, DataStore } from './common/datastore'; import { setExperimentStartupInfo } from './common/experimentStartupInfo'; import { getLogger, Logger } from './common/log'; @@ -40,10 +41,10 @@ import { PAITrainingService } from './training_service/pai/paiTrainingService'; import { KubeflowTrainingService } from './training_service/kubernetes/kubeflow/kubeflowTrainingService'; import { FrameworkControllerTrainingService } from 
'./training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService'; -function initStartupInfo(startExpMode: string, resumeExperimentId: string, basePort: number) { +function initStartupInfo(startExpMode: string, resumeExperimentId: string, basePort: number, logDirectory: string, experimentLogLevel: string) { const createNew: boolean = (startExpMode === 'new'); const expId: string = createNew ? uniqueString(8) : resumeExperimentId; - setExperimentStartupInfo(createNew, expId, basePort); + setExperimentStartupInfo(createNew, expId, basePort, logDirectory, experimentLogLevel); } async function initContainer(platformMode: string): Promise { @@ -102,7 +103,19 @@ if (startMode === 'resume' && experimentId.trim().length < 1) { process.exit(1); } -initStartupInfo(startMode, experimentId, port); +const logDir: string = parseArg(['--log_dir', '-ld']); +if (logDir.length > 0) { + if (!fs.existsSync(logDir)) { + console.log(`FATAL: log_dir ${logDir} does not exist`); + } +} + +const logLevel: string = parseArg(['--log_level', '-ll']); +if (logLevel.length > 0 && !['trace', 'debug', 'info', 'warning', 'error', 'fatal'].includes(logLevel)) { /* accepted names mirror logLevelNameMap in common/log.ts */ + console.log(`FATAL: invalid log_level: ${logLevel}`); +} + +initStartupInfo(startMode, experimentId, port, logDir, logLevel); mkDirP(getLogDir()).then(async () => { const log: Logger = getLogger(); diff --git a/src/nni_manager/rest_server/restHandler.ts b/src/nni_manager/rest_server/restHandler.ts index f8f03039d8..5f722841bd 100644 --- a/src/nni_manager/rest_server/restHandler.ts +++ b/src/nni_manager/rest_server/restHandler.ts @@ -105,7 +105,7 @@ class NNIRestHandler { // If it's a fatal error, exit process if(isFatal) { - this.log.critical(err); + this.log.fatal(err); process.exit(1); } diff --git a/src/webui/src/components/SlideBar.tsx b/src/webui/src/components/SlideBar.tsx index 58f9b1ee90..6a57bb9e24 100644 --- a/src/webui/src/components/SlideBar.tsx +++ b/src/webui/src/components/SlideBar.tsx @@ -221,12 +221,12 @@ 
class SlideBar extends React.Component<{}, SliderState> { Download - + NNI github issue - FeedBack + Feedback Version: {version} diff --git a/src/webui/src/components/overview/BasicInfo.tsx b/src/webui/src/components/overview/BasicInfo.tsx index 53644dd6bc..ff5f773ff5 100644 --- a/src/webui/src/components/overview/BasicInfo.tsx +++ b/src/webui/src/components/overview/BasicInfo.tsx @@ -43,7 +43,7 @@ class BasicInfo extends React.Component { -

LogPath

+

Log Directory

{trialProfile.logDir} diff --git a/tools/nni_cmd/config_schema.py b/tools/nni_cmd/config_schema.py index eff323ab74..29bcf1aa2e 100644 --- a/tools/nni_cmd/config_schema.py +++ b/tools/nni_cmd/config_schema.py @@ -33,6 +33,8 @@ Optional('multiPhase'): bool, Optional('multiThread'): bool, Optional('nniManagerIp'): str, +Optional('logDir'): os.path.isdir, +Optional('logLevel'): Or('trace', 'debug', 'info', 'warning', 'error', 'fatal'), 'useAnnotation': bool, Optional('advisor'): Or({ 'builtinAdvisorName': Or('Hyperband'), diff --git a/tools/nni_cmd/launcher.py b/tools/nni_cmd/launcher.py index 81c4936f07..871f5e868b 100644 --- a/tools/nni_cmd/launcher.py +++ b/tools/nni_cmd/launcher.py @@ -58,8 +58,47 @@ def print_log_content(config_file_name): stderr_content = check_output(stderr_cmds) print(stderr_content.decode('utf-8')) +def get_nni_installation_path(): + ''' Find nni lib from the following locations in order + Return nni root directory if it exists + ''' + def try_installation_path_sequentially(*sitepackages): + '''Try different installation path sequentially until nni is found. 
+ Return None if nothing is found + ''' + def _generate_installation_path(sitepackages_path): + python_dir = str(Path(sitepackages_path).parents[2]) + entry_file = os.path.join(python_dir, 'nni', 'main.js') + if os.path.isfile(entry_file): + return python_dir + return None + + for sitepackage in sitepackages: + python_dir = _generate_installation_path(sitepackage) + if python_dir: + return python_dir + return None + + if os.getenv('VIRTUAL_ENV'): + # if 'virtualenv' package is used, `site` has not attr getsitepackages, so we will instead use VIRTUAL_ENV + # Note that conda venv will not have VIRTUAL_ENV + python_dir = os.getenv('VIRTUAL_ENV') + else: + python_sitepackage = site.getsitepackages()[0] + # If system-wide python is used, we will give priority to using `local sitepackage`--"usersitepackages()" given that nni exists there + if python_sitepackage.startswith('/usr') or python_sitepackage.startswith('/Library'): + python_dir = try_installation_path_sequentially(site.getusersitepackages(), site.getsitepackages()[0]) + else: + python_dir = try_installation_path_sequentially(site.getsitepackages()[0], site.getusersitepackages()) + + if python_dir: + entry_file = os.path.join(python_dir, 'nni', 'main.js') + if os.path.isfile(entry_file): + return os.path.join(python_dir, 'nni') + print_error('Fail to find nni under python library') + exit(1) -def start_rest_server(port, platform, mode, config_file_name, experiment_id=None): +def start_rest_server(port, platform, mode, config_file_name, experiment_id=None, log_dir=None, log_level=None): '''Run nni manager process''' nni_config = Config(config_file_name) if detect_port(port): @@ -74,27 +113,15 @@ def start_rest_server(port, platform, mode, config_file_name, experiment_id=None exit(1) print_normal('Starting restful server...') - # Find nni lib from the following locations in order - sys_wide_python = True - python_sitepackage = site.getsitepackages()[0] - # If system-wide python is used, we will give priority to 
using user-sitepackage given that nni exists there - if python_sitepackage.startswith('/usr') or python_sitepackage.startswith('/Library'): - local_python_dir = str(Path(site.getusersitepackages()).parents[2]) - entry_file = os.path.join(local_python_dir, 'nni', 'main.js') - entry_dir = os.path.join(local_python_dir, 'nni') - else: - # If this python is not system-wide python, we will use its site-package directly - sys_wide_python = False - - if not sys_wide_python or not os.path.isfile(entry_file): - python_dir = str(Path(python_sitepackage).parents[2]) - entry_file = os.path.join(python_dir, 'nni', 'main.js') - entry_dir = os.path.join(python_dir, 'nni') - # Nothing is found - if not os.path.isfile(entry_file): - raise Exception('Fail to find nni under both "%s" and "%s"' % (local_python_dir, python_dir)) + + entry_dir = get_nni_installation_path() + entry_file = os.path.join(entry_dir, 'main.js') cmds = ['node', entry_file, '--port', str(port), '--mode', platform, '--start_mode', mode] + if log_dir is not None: + cmds += ['--log_dir', log_dir] + if log_level is not None: + cmds += ['--log_level', log_level] if mode == 'resume': cmds += ['--experiment_id', experiment_id] stdout_full_path, stderr_full_path = get_log_path(config_file_name) @@ -294,9 +321,12 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen except ModuleNotFoundError as e: print_error('The tuner %s should be installed through nnictl'%(tuner_name)) exit(1) - + log_dir = experiment_config['logDir'] if experiment_config.get('logDir') else None + log_level = experiment_config['logLevel'] if experiment_config.get('logLevel') else None + if log_level not in ['trace', 'debug'] and args.debug: + log_level = 'debug' # start rest server - rest_process, start_time = start_rest_server(args.port, experiment_config['trainingServicePlatform'], mode, config_file_name, experiment_id) + rest_process, start_time = start_rest_server(args.port, 
experiment_config['trainingServicePlatform'], mode, config_file_name, experiment_id, log_dir, log_level) nni_config.set_config('restServerPid', rest_process.pid) # Deal with annotation if experiment_config.get('useAnnotation'): @@ -310,8 +340,8 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen experiment_config['searchSpace'] = json.dumps(search_space) assert search_space, ERROR_INFO % 'Generated search space is empty' elif experiment_config.get('searchSpacePath'): - search_space = get_json_content(experiment_config.get('searchSpacePath')) - experiment_config['searchSpace'] = json.dumps(search_space) + search_space = get_json_content(experiment_config.get('searchSpacePath')) + experiment_config['searchSpace'] = json.dumps(search_space) else: experiment_config['searchSpace'] = json.dumps('') diff --git a/tools/nni_cmd/nnictl.py b/tools/nni_cmd/nnictl.py index f0825f4411..e6453ab60c 100644 --- a/tools/nni_cmd/nnictl.py +++ b/tools/nni_cmd/nnictl.py @@ -51,12 +51,14 @@ def parse_args(): parser_start = subparsers.add_parser('create', help='create a new experiment') parser_start.add_argument('--config', '-c', required=True, dest='config', help='the path of yaml config file') parser_start.add_argument('--port', '-p', default=DEFAULT_REST_PORT, dest='port', help='the port of restful server') + parser_start.add_argument('--debug', '-d', action='store_true', help=' set log level to debug') parser_start.set_defaults(func=create_experiment) # parse resume command parser_resume = subparsers.add_parser('resume', help='resume a new experiment') parser_resume.add_argument('id', nargs='?', help='The id of the experiment you want to resume') parser_resume.add_argument('--port', '-p', default=DEFAULT_REST_PORT, dest='port', help='the port of restful server') + parser_resume.add_argument('--debug', '-d', action='store_true', help=' set log level to debug') parser_resume.set_defaults(func=resume_experiment) # parse update command diff --git 
a/tools/nni_trial_tool/log_utils.py b/tools/nni_trial_tool/log_utils.py index b1e6e66bf5..d4b1cf60f7 100644 --- a/tools/nni_trial_tool/log_utils.py +++ b/tools/nni_trial_tool/log_utils.py @@ -38,11 +38,12 @@ @unique class LogType(Enum): + Trace = 'TRACE' Debug = 'DEBUG' Info = 'INFO' Warning = 'WARNING' Error = 'ERROR' - Critical = 'CRITICAL' + Fatal = 'FATAL' @unique class StdOutputType(Enum):