-
Notifications
You must be signed in to change notification settings - Fork 1.8k
experiment management backend #3081
Changes from 35 commits
0a595a9
cb7485b
f4ffbee
4fb7c33
d29d85b
e39de46
02952af
594619a
1c4aabd
5c8f59a
ea0b553
863fc1d
eeedf3d
b83d9aa
fbe4d7c
a2fbc55
8ec133f
df01921
a100f10
41b6eac
23bb387
13d6b07
fbdb128
350fe92
8c63832
e30c584
761786e
8da1700
3174249
30ce94e
7b1853e
b1ada7a
ccd906a
1950597
155c132
df491d4
4133249
4dd700a
fa75dc3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,11 +5,13 @@ | |
import sys | ||
import json | ||
import tempfile | ||
import time | ||
import socket | ||
import string | ||
import random | ||
import ruamel.yaml as yaml | ||
import psutil | ||
import filelock | ||
from colorama import Fore | ||
|
||
from .constants import ERROR_INFO, NORMAL_INFO, WARNING_INFO | ||
|
@@ -95,3 +97,24 @@ def generate_folder_name(): | |
temp_dir = generate_folder_name() | ||
os.makedirs(temp_dir) | ||
return temp_dir | ||
|
||
class SimpleFileLock(filelock.SoftFileLock): | ||
'''this is a lock support check lock expiration, if you do not need check expiration, you can use SoftFileLock''' | ||
def __init__(self, lock_file, timeout=-1, stale=-1): | ||
super(__class__, self).__init__(lock_file, timeout) | ||
self._stale = stale | ||
|
||
def __enter__(self): | ||
count = 0 | ||
while True: | ||
try: | ||
if os.path.isfile(self._lock_file) and time.time() - os.stat(self._lock_file).st_mtime > self._stale: | ||
os.remove(self._lock_file) | ||
self.acquire() | ||
return self | ||
except Exception: | ||
print_warning('[{}] fail lock file, auto try again!'.format(count)) | ||
count += 1 | ||
|
||
def get_file_lock(path: string, timeout=-1, stale=-1): | ||
return SimpleFileLock(path + '.lock', timeout=timeout, stale=stale) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what is the purpose to set stale = 10 seconds? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Setting |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,8 +4,10 @@ | |
import os | ||
import json | ||
import shutil | ||
import time | ||
from .constants import NNICTL_HOME_DIR | ||
from .command_utils import print_error | ||
from .common_utils import get_file_lock | ||
|
||
class Config: | ||
'''a util class to load and save config''' | ||
|
@@ -34,7 +36,7 @@ def write_file(self): | |
if self.config: | ||
try: | ||
with open(self.config_file, 'w') as file: | ||
json.dump(self.config, file) | ||
json.dump(self.config, file, indent=4) | ||
except IOError as error: | ||
print('Error:', error) | ||
return | ||
|
@@ -54,39 +56,53 @@ class Experiments: | |
def __init__(self, home_dir=NNICTL_HOME_DIR): | ||
os.makedirs(home_dir, exist_ok=True) | ||
self.experiment_file = os.path.join(home_dir, '.experiment') | ||
self.experiments = self.read_file() | ||
self.lock = get_file_lock(self.experiment_file, timeout=1, stale=2) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what is the problem that we try to solve with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the name There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We will check if the lock file is modified over two seconds to determine whether the current lock need to be forced released. This because we assume that all operations on the The
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Indeed, There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We could solve this problem with special handling the timeout exception, no need to introduce
|
||
with self.lock: | ||
self.experiments = self.read_file() | ||
|
||
def add_experiment(self, expId, port, startTime, file_name, platform, experiment_name, endTime='N/A', status='INITIALIZED'): | ||
'''set {key:value} paris to self.experiment''' | ||
self.experiments[expId] = {} | ||
self.experiments[expId]['port'] = port | ||
self.experiments[expId]['startTime'] = startTime | ||
self.experiments[expId]['endTime'] = endTime | ||
self.experiments[expId]['status'] = status | ||
self.experiments[expId]['fileName'] = file_name | ||
self.experiments[expId]['platform'] = platform | ||
self.experiments[expId]['experimentName'] = experiment_name | ||
self.write_file() | ||
def add_experiment(self, expId, port, startTime, platform, experiment_name, endTime='N/A', status='INITIALIZED', | ||
tag=[], pid=None, webuiUrl=[], logDir=[]): | ||
'''set {key:value} pairs to self.experiment''' | ||
with self.lock: | ||
self.experiments = self.read_file() | ||
self.experiments[expId] = {} | ||
self.experiments[expId]['id'] = expId | ||
self.experiments[expId]['port'] = port | ||
self.experiments[expId]['startTime'] = startTime | ||
self.experiments[expId]['endTime'] = endTime | ||
self.experiments[expId]['status'] = status | ||
self.experiments[expId]['platform'] = platform | ||
self.experiments[expId]['experimentName'] = experiment_name | ||
self.experiments[expId]['tag'] = tag | ||
self.experiments[expId]['pid'] = pid | ||
self.experiments[expId]['webuiUrl'] = webuiUrl | ||
self.experiments[expId]['logDir'] = logDir | ||
self.write_file() | ||
|
||
def update_experiment(self, expId, key, value): | ||
'''Update experiment''' | ||
if expId not in self.experiments: | ||
return False | ||
self.experiments[expId][key] = value | ||
self.write_file() | ||
return True | ||
with self.lock: | ||
self.experiments = self.read_file() | ||
if expId not in self.experiments: | ||
return False | ||
self.experiments[expId][key] = value | ||
self.write_file() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. suggest to add indent to dump the json file to make the file more readable. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Indeed, I will add indent. |
||
return True | ||
|
||
def remove_experiment(self, expId): | ||
'''remove an experiment by id''' | ||
if expId in self.experiments: | ||
fileName = self.experiments.pop(expId).get('fileName') | ||
if fileName: | ||
logPath = os.path.join(NNICTL_HOME_DIR, fileName) | ||
try: | ||
shutil.rmtree(logPath) | ||
except FileNotFoundError: | ||
print_error('{0} does not exist.'.format(logPath)) | ||
self.write_file() | ||
with self.lock: | ||
self.experiments = self.read_file() | ||
if expId in self.experiments: | ||
self.experiments.pop(expId) | ||
fileName = expId | ||
if fileName: | ||
logPath = os.path.join(NNICTL_HOME_DIR, fileName) | ||
try: | ||
shutil.rmtree(logPath) | ||
except FileNotFoundError: | ||
print_error('{0} does not exist.'.format(logPath)) | ||
self.write_file() | ||
|
||
def get_all_experiments(self): | ||
'''return all of experiments''' | ||
|
@@ -96,7 +112,7 @@ def write_file(self): | |
'''save config to local file''' | ||
try: | ||
with open(self.experiment_file, 'w') as file: | ||
json.dump(self.experiments, file) | ||
json.dump(self.experiments, file, indent=4) | ||
except IOError as error: | ||
print('Error:', error) | ||
return '' | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
base class already has a loop, we can set
timeout=-1
for base class, then we do not need this__enter__
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
we can rename the
check_interval
back tostale
with this new design.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Indeed, we do not need
__enter__
any more, I will remove it.