# utils.py (forked from chenhongge/SA_DQN)
import math, random
import time

import numpy as np
import torch
import torch.nn as nn
from torch import autograd
import torch.optim as optim
from IPython.display import clear_output
import matplotlib
matplotlib.use('Agg')  # select a non-interactive backend before importing pyplot
import matplotlib.pyplot as plt

USE_CUDA = torch.cuda.is_available()

# Per-dimension standard deviations of Acrobot observations; used by
# get_acrobot_eps below to scale a scalar perturbation budget per dimension.
ACROBOT_STD = [0.36641926, 0.65119815, 0.6835106, 0.67652863, 2.0165246, 3.0202584]

# Move Variables to the GPU when one is available.
Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda() if USE_CUDA else autograd.Variable(*args, **kwargs)

class Logger(object):
    """Print to stdout and optionally mirror every message to a log file."""
    def __init__(self, log_file=None):
        self.log_file = log_file

    def log(self, *args, **kwargs):
        print(*args, **kwargs)
        if self.log_file:
            print(*args, **kwargs, file=self.log_file)
            self.log_file.flush()
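
# Illustrative usage (a sketch; 'train.log' is an arbitrary example path):
#   logger = Logger(open('train.log', 'w'))
#   logger.log('frame', 1000, 'reward', 21.0)  # echoes to stdout and the file
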
class ActEpsilonScheduler(object):
    """Schedule the exploration epsilon used for action selection."""
    def __init__(self, epsilon_start=1.0, epsilon_final=0.01, epsilon_decay=30000,
                 method='linear', start_frame=0, decay_zero=None):
        self.epsilon_start = epsilon_start
        self.epsilon_final = epsilon_final
        self.epsilon_decay = epsilon_decay
        self.method = method
        self.start_frame = start_frame
        self.decay_zero = decay_zero

    def get(self, frame_idx):
        if frame_idx < self.start_frame:
            return self.epsilon_start
        if self.method == 'exponential':
            return self.epsilon_final + (self.epsilon_start - self.epsilon_final) * math.exp(-1. * (frame_idx - self.start_frame) / self.epsilon_decay)
        else:
            # linear decay from epsilon_start to epsilon_final over epsilon_decay frames
            if self.decay_zero is None or self.decay_zero <= self.start_frame + self.epsilon_decay or frame_idx <= self.start_frame + self.epsilon_decay:
                return max(self.epsilon_final, self.epsilon_start + (self.epsilon_final - self.epsilon_start) * (frame_idx - self.start_frame) * 1. / self.epsilon_decay)
            else:
                # second-stage linear decay from epsilon_final down to 0 at decay_zero
                return max(0, self.epsilon_final * (self.decay_zero - frame_idx) / (self.decay_zero - self.start_frame - self.epsilon_decay))
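
# Illustrative usage (a sketch; the frame indices are arbitrary):
#   sched = ActEpsilonScheduler(epsilon_start=1.0, epsilon_final=0.01,
#                               epsilon_decay=30000, method='linear')
#   sched.get(0)      # -> 1.0
#   sched.get(15000)  # -> 0.505, halfway through the linear decay
#   sched.get(50000)  # -> 0.01, clamped at epsilon_final
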
class BufferBetaScheduler(object):
    """Linearly anneal the prioritized-replay beta from beta_start to 1.0."""
    def __init__(self, beta_start=0.4, beta_frames=1000, start_frame=0):
        self.beta_start = beta_start
        self.beta_frames = beta_frames
        self.start_frame = start_frame

    def get(self, frame_idx):
        return max(self.beta_start, min(1.0, self.beta_start + (frame_idx - self.start_frame) * (1.0 - self.beta_start) / self.beta_frames))
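
# Illustrative usage (a sketch): beta anneals linearly toward 1.0, the usual
# schedule for prioritized-replay importance-sampling weights.
#   beta_sched = BufferBetaScheduler(beta_start=0.4, beta_frames=1000)
#   beta_sched.get(500)   # -> 0.7
#   beta_sched.get(2000)  # -> 1.0, clamped
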
class CudaTensorManager(object):
    def __init__(self, state_shape, batch_size, per, use_cuda=True, dtype=np.uint8):
        # Allocate pinned (page-locked) host memory once up front.
        # States are kept as uint8 when possible to cut host-to-device transfer time.
        self.dtype = dtype
        if dtype == np.uint8:
            self.pinned_next_state = torch.empty(batch_size, *state_shape, dtype=torch.uint8, pin_memory=True)
            self.pinned_state = torch.empty(batch_size, *state_shape, dtype=torch.uint8, pin_memory=True)
        else:
            self.pinned_next_state = torch.empty(batch_size, *state_shape, dtype=torch.float32, pin_memory=True)
            self.pinned_state = torch.empty(batch_size, *state_shape, dtype=torch.float32, pin_memory=True)
        self.pinned_reward = torch.empty(batch_size, dtype=torch.float32, pin_memory=True)
        self.pinned_done = torch.empty(batch_size, dtype=torch.float32, pin_memory=True)
        self.pinned_action = torch.empty(batch_size, dtype=torch.int64, pin_memory=True)
        self.per = per
        self.use_cuda = use_cuda
        if self.per:
            self.pinned_weights = torch.empty(batch_size, dtype=torch.float32, pin_memory=True)
        self.ncall = 0

    def get_cuda_tensors(self, state, next_state, action, reward, done, weights=None):
        """
        Stage a sampled batch of numpy arrays through pinned host memory and
        return the corresponding (optionally CUDA) tensors. An earlier, simpler
        implementation built the device tensors directly:

            state = torch.cuda.FloatTensor(state)
            next_state = torch.cuda.FloatTensor(next_state)
            action = torch.cuda.LongTensor(action)
            reward = torch.cuda.FloatTensor(reward)
            done = torch.cuda.FloatTensor(done)
            if self.per:
                weights = torch.cuda.FloatTensor(weights)
            return state, next_state, action, reward, done, weights
        """
        # Copy the numpy arrays into pinned memory.
        t = time.time()
        if self.dtype == np.uint8:
            self.pinned_next_state.copy_(torch.from_numpy(next_state.astype(np.uint8)))
            self.pinned_state.copy_(torch.from_numpy(state.astype(np.uint8)))
        else:
            self.pinned_next_state.copy_(torch.from_numpy(next_state.astype(self.dtype)))
            self.pinned_state.copy_(torch.from_numpy(state.astype(self.dtype)))
        self.pinned_reward.copy_(torch.from_numpy(reward))
        self.pinned_done.copy_(torch.from_numpy(done))
        self.pinned_action.copy_(torch.from_numpy(action))
        if self.per:
            self.pinned_weights.copy_(torch.from_numpy(weights))
        if self.use_cuda:
            # Use asynchronous transfers. The order matters: start with the
            # tensor that will be needed first.
            cuda_next_state = self.pinned_next_state.cuda(non_blocking=True)
            cuda_state = self.pinned_state.cuda(non_blocking=True)
            cuda_reward = self.pinned_reward.cuda(non_blocking=True)
            cuda_done = self.pinned_done.cuda(non_blocking=True)
            cuda_action = self.pinned_action.cuda(non_blocking=True)
            if self.per:
                cuda_weights = self.pinned_weights.cuda(non_blocking=True)
        else:
            cuda_next_state = self.pinned_next_state
            cuda_state = self.pinned_state
            cuda_reward = self.pinned_reward
            cuda_done = self.pinned_done
            cuda_action = self.pinned_action
            if self.per:
                cuda_weights = self.pinned_weights
        if self.per:
            return cuda_state, cuda_next_state, cuda_action, cuda_reward, cuda_done, cuda_weights
        else:
            return cuda_state, cuda_next_state, cuda_action, cuda_reward, cuda_done
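
# Illustrative usage (a sketch; the shapes and batch size are arbitrary, and
# state/next_state/action/reward/done would come from a replay-buffer sample
# as numpy arrays):
#   mgr = CudaTensorManager(state_shape=(4, 84, 84), batch_size=32,
#                           per=False, use_cuda=USE_CUDA, dtype=np.uint8)
#   s, ns, a, r, d = mgr.get_cuda_tensors(state, next_state, action, reward, done)
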
def update_target(current_model, target_model):
    """Copy the online network's parameters into the target network."""
    target_model.load_state_dict(current_model.state_dict())
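
# Illustrative usage (a sketch; target_update_interval is a hypothetical name):
#   if frame_idx % target_update_interval == 0:
#       update_target(current_model, target_model)
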
def plot(frame_idx, rewards, losses, prefix='.'):
    clear_output(True)
    plt.figure(figsize=(20, 5))
    plt.subplot(131)
    plt.title('frame %s. reward: %s' % (frame_idx, np.mean(rewards[-10:])))
    plt.plot(rewards)
    plt.subplot(132)
    plt.title('loss')
    plt.plot(losses)
    plt.savefig('{}/rewards_losses_so_far.pdf'.format(prefix))
    np.save('{}/frame_{}_losses.npy'.format(prefix, frame_idx), losses)
    np.save('{}/frame_{}_rewards.npy'.format(prefix, frame_idx), rewards)
    plt.close('all')

def test_plot(model_frame, frame_idx, rewards, prefix='.'):
    clear_output(True)
    plt.figure(figsize=(15, 5))
    plt.subplot(121)
    plt.title('frame %s. reward: %s' % (frame_idx, np.mean(rewards[-10:])))
    plt.plot(rewards)
    plt.savefig('{}/model_frame_{}_test_frame_{}.pdf'.format(prefix, model_frame, frame_idx))
    plt.close('all')

def torch_arctanh(x, eps=1e-6):
    # Shrink x slightly toward 0 so the log stays finite at x = +/-1.
    # Note: avoid `x *= (1 - eps)`, which would mutate the caller's tensor in place.
    x = x * (1 - eps)
    return torch.log((1 + x) / (1 - x)) * 0.5

def tanh_rescale(x, x_min=-1., x_max=1.):
    # Map all of R into the box [x_min, x_max] via tanh.
    return torch.tanh(x) * 0.5 * (x_max - x_min) + (x_max + x_min) * 0.5

def arctanh_rescale(y, x_min=-1., x_max=1.):
    # Inverse of tanh_rescale: map the box [x_min, x_max] back to R.
    return torch_arctanh((2 * y - x_max - x_min) / (x_max - x_min))
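
# Sanity-check sketch: for x strictly inside (x_min, x_max), arctanh_rescale
# inverts tanh_rescale up to the small eps shrink inside torch_arctanh, e.g.
#   x = torch.tensor([0.3])
#   tanh_rescale(arctanh_rescale(x))  # ~= tensor([0.3000])
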
def to_one_hot(y, num_classes):
    """
    Convert a single integer label y into a (1, num_classes) one-hot tensor.
    (The linked thread discusses batched labels, but this helper only handles
    one scalar label at a time.)
    Link: https://discuss.pytorch.org/t/convert-int-into-one-hot-format/507/24
    """
    # Batched variant, kept for reference:
    # y = y.detach().clone().view(-1, 1)
    # y_onehot = y.new_zeros((y.size()[0], num_classes)).scatter_(1, y, 1)
    y_onehot = torch.FloatTensor(1, num_classes)
    y_onehot.zero_()
    y_onehot.scatter_(1, torch.tensor([[y]]), 1)
    return Variable(y_onehot)
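
# Illustrative usage (a sketch; y is a plain Python int here):
#   to_one_hot(2, 4)  # -> a 1x4 tensor [[0., 0., 1., 0.]]
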
def get_acrobot_eps(eps):
    # Scale a scalar perturbation budget by the per-dimension std of Acrobot
    # observations, yielding a per-dimension epsilon vector.
    return eps * torch.Tensor(ACROBOT_STD)
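
if __name__ == '__main__':
    # Minimal smoke test (an illustrative sketch, not part of the original
    # module): exercise the schedulers and the small tensor helpers with
    # arbitrary example values.
    eps_sched = ActEpsilonScheduler(epsilon_start=1.0, epsilon_final=0.01, epsilon_decay=30000)
    beta_sched = BufferBetaScheduler(beta_start=0.4, beta_frames=1000)
    for f in (0, 15000, 60000):
        print('frame %6d: act-eps %.3f, buffer-beta %.3f' % (f, eps_sched.get(f), beta_sched.get(f)))
    print('one-hot of label 2 over 4 classes:', to_one_hot(2, 4))
    print('per-dimension Acrobot budget for eps=0.1:', get_acrobot_eps(0.1))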