-
Notifications
You must be signed in to change notification settings - Fork 1.8k
experiment management backend #3081
Changes from 26 commits
0a595a9
cb7485b
f4ffbee
4fb7c33
d29d85b
e39de46
02952af
594619a
1c4aabd
5c8f59a
ea0b553
863fc1d
eeedf3d
b83d9aa
fbe4d7c
a2fbc55
8ec133f
df01921
a100f10
41b6eac
23bb387
13d6b07
fbdb128
350fe92
8c63832
e30c584
761786e
8da1700
3174249
30ce94e
7b1853e
b1ada7a
ccd906a
1950597
155c132
df491d4
4133249
4dd700a
fa75dc3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,11 +5,13 @@ | |
import sys | ||
import json | ||
import tempfile | ||
import time | ||
import socket | ||
import string | ||
import random | ||
import ruamel.yaml as yaml | ||
import psutil | ||
import filelock | ||
from colorama import Fore | ||
|
||
from .constants import ERROR_INFO, NORMAL_INFO, WARNING_INFO | ||
|
@@ -95,3 +97,23 @@ def generate_folder_name(): | |
temp_dir = generate_folder_name() | ||
os.makedirs(temp_dir) | ||
return temp_dir | ||
|
||
class SimpleFileLock(filelock.SoftFileLock): | ||
def __init__(self, lock_file, timeout=-1, stale=10): | ||
super(__class__, self).__init__(lock_file, timeout) | ||
self._stale = stale | ||
|
||
def __enter__(self): | ||
count = 0 | ||
while True: | ||
try: | ||
if os.path.isfile(self._lock_file) and time.time() - os.stat(self._lock_file).st_mtime > self._stale: | ||
os.remove(self._lock_file) | ||
self.acquire() | ||
return self | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. base class already has a loop, we can set There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we can rename the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Indeed, we do not need |
||
except Exception: | ||
print_warning('[{}] fail lock file, auto try again!'.format(count)) | ||
count += 1 | ||
|
||
def get_file_lock(path: string, timeout=-1, stale=10): | ||
return SimpleFileLock(path + '.lock', timeout=timeout, stale=stale) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what is the purpose to set stale = 10 seconds? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Setting |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,8 +4,10 @@ | |
import os | ||
import json | ||
import shutil | ||
import time | ||
from .constants import NNICTL_HOME_DIR | ||
from .command_utils import print_error | ||
from .common_utils import get_file_lock | ||
|
||
class Config: | ||
'''a util class to load and save config''' | ||
|
@@ -54,39 +56,52 @@ class Experiments: | |
def __init__(self, home_dir=NNICTL_HOME_DIR): | ||
os.makedirs(home_dir, exist_ok=True) | ||
self.experiment_file = os.path.join(home_dir, '.experiment') | ||
self.experiments = self.read_file() | ||
self.lock = get_file_lock(self.experiment_file, timeout=1, stale=2) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what is the problem that we try to solve with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the name There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We will check if the lock file is modified over two seconds to determine whether the current lock need to be forced released. This because we assume that all operations on the The
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Indeed, There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We could solve this problem with special handling the timeout exception, no need to introduce
|
||
with self.lock: | ||
self.experiments = self.read_file() | ||
|
||
def add_experiment(self, expId, port, startTime, file_name, platform, experiment_name, endTime='N/A', status='INITIALIZED'): | ||
def add_experiment(self, expId, port, startTime, platform, experiment_name, endTime='N/A', status='INITIALIZED', | ||
tag=[], pid=None, webuiUrl=[], logDir=[]): | ||
'''set {key:value} paris to self.experiment''' | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. typo: |
||
self.experiments[expId] = {} | ||
self.experiments[expId]['port'] = port | ||
self.experiments[expId]['startTime'] = startTime | ||
self.experiments[expId]['endTime'] = endTime | ||
self.experiments[expId]['status'] = status | ||
self.experiments[expId]['fileName'] = file_name | ||
self.experiments[expId]['platform'] = platform | ||
self.experiments[expId]['experimentName'] = experiment_name | ||
self.write_file() | ||
with self.lock: | ||
self.experiments = self.read_file() | ||
self.experiments[expId] = {} | ||
self.experiments[expId]['port'] = port | ||
self.experiments[expId]['startTime'] = startTime | ||
self.experiments[expId]['endTime'] = endTime | ||
self.experiments[expId]['status'] = status | ||
self.experiments[expId]['platform'] = platform | ||
self.experiments[expId]['experimentName'] = experiment_name | ||
self.experiments[expId]['tag'] = tag | ||
self.experiments[expId]['pid'] = pid | ||
self.experiments[expId]['webuiUrl'] = webuiUrl | ||
self.experiments[expId]['logDir'] = logDir | ||
self.write_file() | ||
|
||
def update_experiment(self, expId, key, value): | ||
'''Update experiment''' | ||
if expId not in self.experiments: | ||
return False | ||
self.experiments[expId][key] = value | ||
self.write_file() | ||
return True | ||
with self.lock: | ||
self.experiments = self.read_file() | ||
if expId not in self.experiments: | ||
return False | ||
self.experiments[expId][key] = value | ||
self.write_file() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. suggest to add indent to dump the json file to make the file more readable. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Indeed, I will add indent. |
||
return True | ||
|
||
def remove_experiment(self, expId): | ||
'''remove an experiment by id''' | ||
if expId in self.experiments: | ||
fileName = self.experiments.pop(expId).get('fileName') | ||
if fileName: | ||
logPath = os.path.join(NNICTL_HOME_DIR, fileName) | ||
try: | ||
shutil.rmtree(logPath) | ||
except FileNotFoundError: | ||
print_error('{0} does not exist.'.format(logPath)) | ||
self.write_file() | ||
with self.lock: | ||
self.experiments = self.read_file() | ||
if expId in self.experiments: | ||
self.experiments.pop(expId) | ||
fileName = expId | ||
if fileName: | ||
logPath = os.path.join(NNICTL_HOME_DIR, fileName) | ||
try: | ||
shutil.rmtree(logPath) | ||
except FileNotFoundError: | ||
print_error('{0} does not exist.'.format(logPath)) | ||
self.write_file() | ||
|
||
def get_all_experiments(self): | ||
'''return all of experiments''' | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
better to add a doc string for this class