This repository has been archived by the owner on Sep 18, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1.8k
experiment management backend #3081
Merged
Merged
Changes from 37 commits
Commits
Show all changes
39 commits
Select commit
Hold shift + click to select a range
0a595a9
step 1 nnictl generate experimentId & merge folder
cb7485b
step 2.1 modify .experiment structure
f4ffbee
step 2.2 add lock for .experiment rw in nnictl
4fb7c33
step 2.2 add filelock dependence
d29d85b
step 2.2 remove uniqueString from main.js
e39de46
fix test bug
02952af
fix test bug
594619a
step 3.1 add experiment manager
1c4aabd
step 3.2 add getExperimentsInfo
5c8f59a
fix eslint
ea0b553
add a simple file lock to support stale
863fc1d
step 3.3 add test
eeedf3d
divide abs experiment manager from manager
b83d9aa
experiment manager refactor
fbe4d7c
support .experiment sync update status
a2fbc55
nnictl no longer uses rest api to update status or endtime
8ec133f
nnictl no longer uses rest api to update status or endtime
df01921
fix eslint
a100f10
support .experiment sync update endtime
41b6eac
fix test
23bb387
fix settimeout bug
13d6b07
fix test
fbdb128
adjust experiment endTime
350fe92
separate simple file lock class
8c63832
Merge branch 'master' into experiment-backend
J-shang e30c584
modify name
761786e
add 'id' in .experiment
8da1700
Merge branch 'master' into experiment-backend
J-shang 3174249
update rest api format
30ce94e
fix eslint
7b1853e
fix issue in comments
b1ada7a
fix rest api format
ccd906a
add indent in json in experiments manager
1950597
fix unittest
155c132
fix unittest
df491d4
refactor file lock
4133249
fix eslint
4dd700a
remove '__enter__' in filelock
fa75dc3
filelock support never expire
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,11 +5,14 @@ | |
import sys | ||
import json | ||
import tempfile | ||
import time | ||
import socket | ||
import string | ||
import random | ||
import ruamel.yaml as yaml | ||
import psutil | ||
import filelock | ||
import glob | ||
from colorama import Fore | ||
|
||
from .constants import ERROR_INFO, NORMAL_INFO, WARNING_INFO | ||
|
@@ -95,3 +98,43 @@ def generate_folder_name(): | |
temp_dir = generate_folder_name() | ||
os.makedirs(temp_dir) | ||
return temp_dir | ||
|
||
class SimplePreemptiveLock(filelock.SoftFileLock): | ||
'''this is a lock support check lock expiration, if you do not need check expiration, you can use SoftFileLock''' | ||
def __init__(self, lock_file, check_interval=-1): | ||
super(__class__, self).__init__(lock_file, check_interval) | ||
self._lock_file_name = '{}.{}'.format(self._lock_file, os.getpid()) | ||
|
||
def __enter__(self): | ||
while True: | ||
try: | ||
self.acquire() | ||
return self | ||
except TimeoutError: | ||
print_warning('fail lock file, auto try again!') | ||
|
||
def _acquire(self): | ||
open_mode = os.O_WRONLY | os.O_CREAT | os.O_EXCL | os.O_TRUNC | ||
try: | ||
lock_file_names = glob.glob(self._lock_file + '.*') | ||
for file_name in lock_file_names: | ||
if os.path.exists(file_name) and time.time() - os.stat(file_name).st_mtime < self._timeout: | ||
raise TimeoutError() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. return |
||
fd = os.open(self._lock_file_name, open_mode) | ||
except (IOError, OSError, TimeoutError): | ||
pass | ||
else: | ||
self._lock_file_fd = fd | ||
return None | ||
|
||
def _release(self): | ||
os.close(self._lock_file_fd) | ||
self._lock_file_fd = None | ||
try: | ||
os.remove(self._lock_file_name) | ||
except OSError: | ||
pass | ||
return None | ||
|
||
def get_file_lock(path: string, check_interval=-1): | ||
return SimplePreemptiveLock(path + '.lock', check_interval=-1) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,8 +4,10 @@ | |
import os | ||
import json | ||
import shutil | ||
import time | ||
from .constants import NNICTL_HOME_DIR | ||
from .command_utils import print_error | ||
from .common_utils import get_file_lock | ||
|
||
class Config: | ||
'''a util class to load and save config''' | ||
|
@@ -34,7 +36,7 @@ def write_file(self): | |
if self.config: | ||
try: | ||
with open(self.config_file, 'w') as file: | ||
json.dump(self.config, file) | ||
json.dump(self.config, file, indent=4) | ||
except IOError as error: | ||
print('Error:', error) | ||
return | ||
|
@@ -54,39 +56,53 @@ class Experiments: | |
def __init__(self, home_dir=NNICTL_HOME_DIR): | ||
os.makedirs(home_dir, exist_ok=True) | ||
self.experiment_file = os.path.join(home_dir, '.experiment') | ||
self.experiments = self.read_file() | ||
self.lock = get_file_lock(self.experiment_file, check_interval=2) | ||
with self.lock: | ||
self.experiments = self.read_file() | ||
|
||
def add_experiment(self, expId, port, startTime, file_name, platform, experiment_name, endTime='N/A', status='INITIALIZED'): | ||
'''set {key:value} paris to self.experiment''' | ||
self.experiments[expId] = {} | ||
self.experiments[expId]['port'] = port | ||
self.experiments[expId]['startTime'] = startTime | ||
self.experiments[expId]['endTime'] = endTime | ||
self.experiments[expId]['status'] = status | ||
self.experiments[expId]['fileName'] = file_name | ||
self.experiments[expId]['platform'] = platform | ||
self.experiments[expId]['experimentName'] = experiment_name | ||
self.write_file() | ||
def add_experiment(self, expId, port, startTime, platform, experiment_name, endTime='N/A', status='INITIALIZED', | ||
tag=[], pid=None, webuiUrl=[], logDir=[]): | ||
'''set {key:value} pairs to self.experiment''' | ||
with self.lock: | ||
self.experiments = self.read_file() | ||
self.experiments[expId] = {} | ||
self.experiments[expId]['id'] = expId | ||
self.experiments[expId]['port'] = port | ||
self.experiments[expId]['startTime'] = startTime | ||
self.experiments[expId]['endTime'] = endTime | ||
self.experiments[expId]['status'] = status | ||
self.experiments[expId]['platform'] = platform | ||
self.experiments[expId]['experimentName'] = experiment_name | ||
self.experiments[expId]['tag'] = tag | ||
self.experiments[expId]['pid'] = pid | ||
self.experiments[expId]['webuiUrl'] = webuiUrl | ||
self.experiments[expId]['logDir'] = logDir | ||
self.write_file() | ||
|
||
def update_experiment(self, expId, key, value): | ||
'''Update experiment''' | ||
if expId not in self.experiments: | ||
return False | ||
self.experiments[expId][key] = value | ||
self.write_file() | ||
return True | ||
with self.lock: | ||
self.experiments = self.read_file() | ||
if expId not in self.experiments: | ||
return False | ||
self.experiments[expId][key] = value | ||
self.write_file() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Suggest adding an indent when dumping the JSON file to make the file more readable. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Indeed, I will add an indent. |
||
return True | ||
|
||
def remove_experiment(self, expId): | ||
'''remove an experiment by id''' | ||
if expId in self.experiments: | ||
fileName = self.experiments.pop(expId).get('fileName') | ||
if fileName: | ||
logPath = os.path.join(NNICTL_HOME_DIR, fileName) | ||
try: | ||
shutil.rmtree(logPath) | ||
except FileNotFoundError: | ||
print_error('{0} does not exist.'.format(logPath)) | ||
self.write_file() | ||
with self.lock: | ||
self.experiments = self.read_file() | ||
if expId in self.experiments: | ||
self.experiments.pop(expId) | ||
fileName = expId | ||
if fileName: | ||
logPath = os.path.join(NNICTL_HOME_DIR, fileName) | ||
try: | ||
shutil.rmtree(logPath) | ||
except FileNotFoundError: | ||
print_error('{0} does not exist.'.format(logPath)) | ||
self.write_file() | ||
|
||
def get_all_experiments(self): | ||
'''return all of experiments''' | ||
|
@@ -96,7 +112,7 @@ def write_file(self): | |
'''save config to local file''' | ||
try: | ||
with open(self.experiment_file, 'w') as file: | ||
json.dump(self.experiments, file) | ||
json.dump(self.experiments, file, indent=4) | ||
except IOError as error: | ||
print('Error:', error) | ||
return '' | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The base class already has a loop; we can set `timeout=-1` for the base class, and then we do not need this `__enter__`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We can rename `check_interval` back to `stale` with this new design.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Indeed, we do not need `__enter__` any more; I will remove it.