-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathgpu_lock.py
executable file
·154 lines (138 loc) · 4.68 KB
/
gpu_lock.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#!/usr/bin/python
"""
A simple discretionary locking system for /dev/nvidia devices.
Iain Murray, November 2009, January 2010.
"""
import os
import os.path
_dev_prefix = '/dev/nvidia'
# Get IDs of NVIDIA boards. Should do this through a CUDA call, but this is
# a quick and dirty way that works for now:
def board_ids():
    """Return the integer board ids 0..N-1 for the N device nodes found."""
    import glob
    # The ids are not parsed from the file names: the device nodes matching
    # the prefix followed by a digit are counted, and the ids are 0..count-1.
    device_nodes = glob.glob(_dev_prefix + '[0-9]*')
    device_count = len(device_nodes)
    return range(device_count)
def _lock_file(id):
    """Build the path of the lock file for the board with integer id."""
    # Prefer /dev/shm: on Linux machines it is a RAM disk, so it is definitely
    # cleared on reboot.  /tmp is the fallback; it is cleared on reboot on
    # many systems, but it doesn't have to be.
    lock_dir = '/dev/shm' if os.path.exists('/dev/shm') else '/tmp'
    return lock_dir + ('/gpu_lock_%d' % id)
def owner_of_lock(id):
    """Username that has locked the device id. (Empty string if no lock).

    If the lock file exists but its owning uid has no entry in the password
    database, the numeric uid is returned as a string, so that a held lock is
    never reported as "no lock".
    """
    import pwd
    try:
        # lstat rather than stat: the owner of the lock file itself is wanted,
        # not the owner of whatever a symlink lock points at.
        owner_uid = os.lstat(_lock_file(id)).st_uid
    except OSError:
        # No lock file (or it cannot be inspected): treat as unlocked.
        return ""
    try:
        return pwd.getpwuid(owner_uid).pw_name
    except KeyError:
        # The lock exists, but the uid is unknown to the password database.
        return str(owner_uid)
def _obtain_lock(id):
    """Attempts to lock id, returning success as True/False.

    The lock is a symlink at the lock-file path for id.  Any OSError from
    creating it (typically: the lock already exists) is reported as False;
    other exceptions are programming errors and are allowed to propagate.
    """
    try:
        # On POSIX systems symlink creation is atomic, so this should be a
        # robust locking operation:
        os.symlink('/dev/null', _lock_file(id))
    except OSError:
        return False
    return True
def _launch_reaper(id, pid):
    """Start a process that will free a lock when process pid terminates.

    Runs the 'run_on_me_or_pid_quit' program located next to this file,
    passing it pid followed by the command line "<this file> --free <id>".
    The child's stdout is sent to /dev/null.  Does not wait for the child.
    Raises whatever Popen raises if the program cannot be started.
    """
    from subprocess import Popen
    me = __file__
    # Point at the source file rather than a compiled .pyc/.pyo.
    if me.endswith(('.pyc', '.pyo')):
        me = me[:-1]
    myloc = os.path.dirname(me) or os.getcwd()
    reaper_cmd = os.path.join(myloc, 'run_on_me_or_pid_quit')
    # The child gets its own copy of the descriptor, so this process's handle
    # can be closed as soon as Popen returns instead of being leaked.
    with open('/dev/null', 'w') as devnull:
        Popen([reaper_cmd, str(pid), me, '--free', str(id)],
              stdout=devnull)
def obtain_lock_id(pid = None):
    """
    Finds a free id, locks it and returns integer id, or -1 if none free.
    A process is spawned that will free the lock automatically when the
    process pid (by default the current python process) terminates.

    If the reaper process cannot be started, the lock is released again and
    -1 is returned.  KeyboardInterrupt/SystemExit arriving at that point also
    release the lock, but are re-raised rather than swallowed.
    """
    lock_id = obtain_lock_id_to_hog()
    if lock_id < 0:
        return -1
    if pid is None:
        pid = os.getpid()
    try:
        _launch_reaper(lock_id, pid)
    except Exception:
        # Without a reaper nobody would ever free this lock: give it back.
        free_lock(lock_id)
        return -1
    except BaseException:
        # Interrupt/exit request: still give the lock back, then let it through.
        free_lock(lock_id)
        raise
    return lock_id
def obtain_lock_id_to_hog():
    """
    Locks the first board id that is free and returns it, or -1 if none free.
    * The caller is responsible for freeing the lock manually *
    """
    # Lazy generator: ids are tried in order and locking stops at the first
    # success, so no more than one lock is ever taken.
    locked = (board for board in board_ids() if _obtain_lock(board))
    return next(locked, -1)
def free_lock(id):
    """Attempts to free lock id, returning success as True/False.

    Any OSError while renaming or removing the lock file (for example the
    lock does not exist, or belongs to someone else) is reported as False;
    other exceptions are programming errors and are allowed to propagate.
    """
    filename = _lock_file(id)
    discarded = filename + '.redundant'
    try:
        # On POSIX systems os.rename is an atomic operation, so this is the
        # safe way to delete a lock:
        os.rename(filename, discarded)
        os.remove(discarded)
    except OSError:
        return False
    return True
# If run as a program:
#   --id [pid]    lock a free board and print its id (-1 if none); the lock is
#                 freed when pid (default: the parent process) terminates
#   --id-to-hog   lock a free board and print its id; never freed automatically
#   --free <id>   free the lock on board <id>
#   (otherwise)   print usage instructions and the current owner of each board
# Every print call takes a single argument, so the output is the same whether
# print is a statement (Python 2) or a function (Python 3).
if __name__ == "__main__":
    import sys
    me = sys.argv[0]
    # Report
    if '--id' in sys.argv:
        if len(sys.argv) > 2:
            try:
                pid = int(sys.argv[2])
                # Explicit check instead of assert: asserts are stripped when
                # python runs with -O, which would let a bogus pid through.
                if not os.path.exists('/proc/%d' % pid):
                    raise ValueError('no such process: %d' % pid)
            except ValueError:
                print('Usage: %s --id [pid_to_wait_on]' % me)
                print('The optional process id must exist if specified.')
                print('Otherwise the id of the parent process is used.')
                sys.exit(1)
        else:
            pid = os.getppid()
        print(obtain_lock_id(pid))
    elif '--id-to-hog' in sys.argv:
        print(obtain_lock_id_to_hog())
    elif '--free' in sys.argv:
        try:
            lock_id = int(sys.argv[2])
        except (IndexError, ValueError):
            # Missing or non-integer id argument.
            print('Usage: %s --free <id>' % me)
            sys.exit(1)
        if free_lock(lock_id):
            print("Lock freed")
        else:
            owner = owner_of_lock(lock_id)
            if owner:
                print("Failed to free lock id=%d owned by %s" % (lock_id, owner))
            else:
                print("Failed to free lock, but it wasn't actually set?")
    else:
        print('\n Usage instructions:\n')
        print(' To obtain and lock an id: %s --id' % me)
        print(' The lock is automatically freed when the parent terminates')
        print('')
        print(" To get an id that won't be freed: %s --id-to-hog" % me)
        print(" You *must* manually free these ids: %s --free <id>\n" % me)
        print(' More info: http://www.cs.toronto.edu/~murray/code/gpu_monitoring/\n')
        div = ' ' + "-"*60
        print('\n' + div)
        print(" NVIDIA board users:")
        print(div)
        for board in board_ids():
            print(" Board %d: %s" % (board, owner_of_lock(board)))
        print(div + '\n')