-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgetgpu
executable file
·141 lines (118 loc) · 3.37 KB
/
getgpu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#!/usr/bin/env python
import argparse
import contextlib
import errno
import fcntl
import os
import random
import signal
import subprocess
import stat
import sys
import time
import pynvml
# Directory that holds one marker file per GPU index; it is also the object
# that claim() locks with flock while inspecting or updating those files.
DIR_NAME = '/tmp/getgpu'
# PID of this getgpu process, used to tag every line written by message().
PID = os.getpid()
def ignore(*args):
    """Accept any positional arguments and do nothing.

    Installed by main() as the SIGINT handler, so it must tolerate whatever
    arguments the signal machinery passes in.
    """
    return None
def message(s):
    """Write the string *s* to stderr as one line tagged with this process's PID.

    The emitted line has the form ``getgpu [<pid>]: <s>`` followed by a
    newline.
    """
    prefix = 'getgpu [{}]: '.format(PID)
    line = prefix + s + '\n'
    sys.stderr.write(line)
def prepare_dir():
    """Make sure the shared state directory exists and is writable.

    Tries to create DIR_NAME with mode 0o777. An "already exists" error is
    tolerated; any other OSError from mkdir propagates to the caller.

    Returns:
        True if DIR_NAME is writable by this process, False otherwise. On
        failure a diagnostic is written via message(), including the mkdir
        error text if mkdir reported that the directory already existed.

    Raises:
        OSError: if mkdir fails for any reason other than EEXIST.
    """
    # Keep the error text under a separate name: in Python 3 the name bound
    # by ``except ... as e`` is unbound when the except clause ends, so
    # reading ``e`` afterwards raises UnboundLocalError.
    mkdir_error = None
    try:
        os.mkdir(DIR_NAME, 0o777)
    except OSError as e:
        if e.errno != errno.EEXIST:
            # Bare raise keeps the original traceback intact.
            raise
        mkdir_error = str(e)

    if not os.access(DIR_NAME, os.W_OK):
        message('cannot write to {}'.format(DIR_NAME))
        if mkdir_error is not None:
            message(mkdir_error)
        return False
    return True
def claim(device, startup_time):
    """Try to reserve *device* for this process.

    Each device has a marker file ``DIR_NAME/<device>`` whose mtime is the
    time until which the device is considered reserved. While holding an
    exclusive flock on DIR_NAME, the marker is inspected; if the current time
    has reached its mtime (or the marker does not exist yet), the marker is
    created if needed and its mtime is pushed to ``now + startup_time``.

    Args:
        device: Device index; converted with str() to form the marker name.
        startup_time: Number of seconds the reservation should last.

    Returns:
        True if the device was claimed by this call, False if an earlier
        reservation is still in effect.

    Raises:
        OSError: if the directory cannot be opened or locked, or if stat
            (other than ENOENT), open or utime on the marker file fails.
    """
    dir_fd = os.open(DIR_NAME, os.O_RDONLY)
    try:
        fcntl.flock(dir_fd, fcntl.LOCK_EX)
        path = os.path.join(DIR_NAME, str(device))
        try:
            mtime = os.stat(path).st_mtime
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise
            # No marker yet: nobody has ever reserved this device.
            mtime = 0

        # We can claim this GPU only if the current time is past the mtime.
        now = time.time()
        if now < mtime:
            return False

        # Make sure the marker exists, then close the descriptor right away:
        # it is only needed for creation, and this function is called
        # repeatedly from a polling loop.
        fd = os.open(path, os.O_RDONLY | os.O_CREAT, 0o777)
        os.close(fd)
        # Since we claim this GPU, update mtime to now + startup_time.
        new_time = int(now + startup_time)
        os.utime(path, (new_time, new_time))
        return True
    finally:
        # Closing the only descriptor for this open of the directory also
        # releases the flock, on success and on error paths alike.
        os.close(dir_fd)
def main():
    """Claim an idle GPU and run the requested program on it.

    Parses the command line, then polls until the ``--wait`` deadline for a
    device that NVML reports as having no compute processes and that claim()
    grants to this process. On success CUDA_VISIBLE_DEVICES is set to that
    device, a no-op SIGINT handler is installed in this process, the program
    is run, and this process exits with the program's return code. If no
    device can be claimed in time, or the state directory is unusable, the
    process exits with status -1.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-w', '--wait',
        type=int,
        default=25,
        help='If no GPUs are available, wait this much time before giving up.')
    cli.add_argument(
        '-s', '--startup-time',
        type=int,
        default=30,
        help='Maximum amount of seconds allowed for a process to begin using '
             'the provisioned GPU. If nothing has begun actually using the GPU '
             'after this many seconds, then a different instance of getgpu might '
             'claim it for itself.')
    cli.add_argument('--shell', action='store_true')
    cli.add_argument('program', nargs='+')
    args = cli.parse_args()

    # Clear the umask while the shared directory and marker files may be
    # created (both are requested with mode 0o777); it is restored before
    # the user's program is launched.
    saved_umask = os.umask(0)
    random.seed(os.getpid())
    pynvml.nvmlInit()
    if not prepare_dir():
        sys.exit(-1)

    now = time.time()
    deadline = now + args.wait
    while now <= deadline:
        # Devices with no compute processes according to NVML.
        free_devices = [
            index
            for index in range(pynvml.nvmlDeviceGetCount())
            if not pynvml.nvmlDeviceGetComputeRunningProcesses(
                pynvml.nvmlDeviceGetHandleByIndex(index))
        ]

        # Try the idle devices in random order; another getgpu instance may
        # already hold a reservation on some of them.
        random.shuffle(free_devices)
        for device in free_devices:
            if not claim(device, args.startup_time):
                continue
            message('Assigned GPU {}.'.format(device))
            os.umask(saved_umask)
            os.environ['CUDA_VISIBLE_DEVICES'] = format(device)
            signal.signal(signal.SIGINT, ignore)
            if args.shell:
                status = subprocess.call(' '.join(args.program), shell=True)
            else:
                status = subprocess.call(args.program)
            sys.exit(status)

        # Nothing was claimed this round: give up once the deadline has
        # passed, otherwise sleep for at most one second (and never longer
        # than the wait budget or the time remaining) before polling again.
        now = time.time()
        if now > deadline:
            break
        message('waiting for GPU...')
        time.sleep(min(args.wait, deadline - now, 1))

    message('failed to obtain a GPU in {} seconds'.format(args.wait))
    sys.exit(-1)
# Run the command-line entry point only when executed as a script, not when
# the file is imported as a module.
if __name__ == '__main__':
    main()