gradient_test_distdl.py
import gc

import torch
import torch.nn as nn

import distdl.nn as dnn
from dfno import BroadcastedAffineOperator
from dfno.utils import create_standard_partitions

from gradient_test import gradient_test
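# Assumed usage: DistDL is MPI-based and both partitions below span two
# workers, so this script is presumably launched with two ranks, e.g.
#   mpirun -n 2 python gradient_test_distdl.py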
P_world, P_x, P_0 = create_standard_partitions((1, 2))
_, P_y, _ = create_standard_partitions((2, 1))
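# P_x splits the two workers along the last (feature) dimension, P_y splits
# them along the first (batch) dimension, and P_0 is presumably the root
# partition, active on a single rank (it is only used to print once).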
# This network passes gradcheck
f = nn.Sequential(
    nn.Linear(16, 16, dtype=torch.float64),
    dnn.DistributedTranspose(P_x, P_y),
    dnn.DistributedTranspose(P_y, P_x),
    nn.Linear(16, 16, dtype=torch.float64)
)
# Local input shape on each worker; with P_x = (1, 2) the global shape is (2, 32).
input_shape = (2, 16)
# Run test
all_ok = True
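# Each result yielded by gradient_test carries an `active` flag and two
# convergence flags; these are assumed to be the first- and second-order
# convergence checks of a Taylor-series gradient test.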
for r in gradient_test(f, input_shape):
    if P_0.active:
        print(str(r))
    if r.active:
        all_ok = all_ok and r.converged[0] and r.converged[1]

P_x._comm.Barrier()

if all_ok:
    print(f'rank {P_x.rank} passed gradcheck 1')
else:
    print(f'rank {P_x.rank} failed gradcheck 1')

P_x._comm.Barrier()
# This network does not pass gradcheck. Why? The failure may stem from a
# mismatch between partition activity and gradient activity.
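# Note the structural difference from the network above: the second Linear
# layer (now 32x32, since the full feature dimension is local on P_y) is
# applied between the two transposes rather than after transposing back to P_x.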
f = nn.Sequential(
    nn.Linear(16, 16, dtype=torch.float64),
    dnn.DistributedTranspose(P_x, P_y),
    nn.Linear(32, 32, dtype=torch.float64),
    dnn.DistributedTranspose(P_y, P_x)
)
# local shape
input_shape = (2, 16)
# Run test
all_ok = True
for r in gradient_test(f, input_shape):
    if P_0.active:
        print(str(r))
    if r.active:
        all_ok = all_ok and r.converged[0] and r.converged[1]

P_x._comm.Barrier()

if all_ok:
    print(f'rank {P_x.rank} passed gradcheck 2')
else:
    print(f'rank {P_x.rank} failed gradcheck 2')