# -*- coding: utf-8 -*-
"""DQN_Agent.py
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/12dYlWkf6l_rophZ5cfo6xVUcMmsM7N3H
"""
# Commented out IPython magic to ensure Python compatibility.
# DQN Model
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# %matplotlib inline
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader
import torch.optim as optim
import yfinance as yf
from collections import deque
import random
import math
from tqdm import tqdm

class DQN(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(state_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 8)
        self.fc4 = nn.Linear(8, action_dim)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        output = torch.softmax(self.fc4(x), dim=-1)
        return output
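
# A quick sanity check (illustrative, not part of the original notebook): assuming a
# hypothetical 10-feature state, the network maps a batch of states to one score per
# action.
if __name__ == "__main__":
    _net = DQN(state_dim=10, action_dim=3)
    print(_net(torch.zeros(4, 10)).shape)   # expected: torch.Size([4, 3])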

# Main Agent Work
class DQN_Agent:
    def __init__(self, state_dim, is_eval=False, model_name=""):
        self.model_type = "DQN"
        self.state_dim = state_dim
        self.action_dim = 3                # hold, sell, and buy
        self.memory = deque(maxlen=100)    # replay memory
        self.buffer_size = 60
        self.gamma = 0.95                  # discount factor for future rewards
        self.epsilon = 1.0                 # initial exploration rate
        self.epsilon_min = 0.01            # minimum exploration rate
        self.epsilon_decay = 0.995         # decay exploration as the agent gets better at trading
        self.is_eval = is_eval
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Load a saved model for evaluation (with proper device mapping);
        # otherwise create a fresh model for training.
        if is_eval:
            self.model = self.create_model().to(self.device)
            if self.device.type == 'cpu':
                self.model.load_state_dict(torch.load(f'{model_name}.pth', map_location=torch.device('cpu')))
            else:
                self.model.load_state_dict(torch.load(f'{model_name}.pth'))
            self.model.eval()
        else:
            self.model = self.create_model().to(self.device)

        self.optimizer = optim.Adam(self.model.parameters(), lr=0.01)
        self.loss_fn = nn.MSELoss()

    def create_model(self):
        return DQN(self.state_dim, self.action_dim)
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        # Epsilon-greedy action selection: explore with probability epsilon during training
        if not self.is_eval and np.random.rand() <= self.epsilon:
            return random.randrange(self.action_dim)
        state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(self.device)
        with torch.no_grad():
            options = self.model(state)
        return torch.argmax(options[0]).item()
    def experience_replay(self, batch_size):
        mini_batch = random.sample(self.memory, min(len(self.memory), batch_size))
        for state, action, reward, next_state, done in mini_batch:
            state = torch.tensor(state, dtype=torch.float32).to(self.device)
            next_state = torch.tensor(next_state, dtype=torch.float32).to(self.device)

            # Bellman target: immediate reward plus discounted best value of the next state
            target = reward
            if not done:
                target = reward + self.gamma * torch.max(self.model(next_state)).item()

            # Predict Q-values for the current state and overwrite only the taken action's value
            target_f = self.model(state).detach().cpu().numpy()
            target_f = target_f.squeeze()  # ensure a 1D array of action values
            target_f[action] = target
            target_f = torch.tensor(target_f, dtype=torch.float32).to(self.device)

            self.optimizer.zero_grad()
            loss = self.loss_fn(self.model(state), target_f)
            loss.backward()
            self.optimizer.step()

        # Decay the exploration rate after each replay pass
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        return loss.item()
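
# A minimal usage sketch (illustrative, not part of the original notebook): it wires the
# agent's act / remember / experience_replay methods into a simple training loop.
# The synthetic random-walk prices, the window-based state, the profit-style reward,
# and the assumed action mapping (0 = hold, 1 = sell, 2 = buy) are stand-ins for the
# sketch, not the original notebook's data pipeline.
if __name__ == "__main__":
    window = 10
    prices = np.cumsum(np.random.randn(500)) + 100.0    # stand-in for real price data

    def get_state(t):
        # price differences over the trailing window as a simple feature vector
        return np.diff(prices[t - window:t + 1]).astype(np.float32)

    agent = DQN_Agent(state_dim=window)
    batch_size = 32

    for episode in range(2):                             # keep the sketch short
        inventory = []
        for t in range(window, len(prices) - 1):
            state = get_state(t)
            action = agent.act(state)
            reward = 0.0
            if action == 2:                              # buy: hold one unit
                inventory.append(prices[t])
            elif action == 1 and inventory:              # sell: realize profit or loss
                reward = prices[t] - inventory.pop(0)
            next_state = get_state(t + 1)
            done = t == len(prices) - 2
            agent.remember(state, action, reward, next_state, done)
            if len(agent.memory) >= batch_size:
                agent.experience_replay(batch_size)
        print(f"episode {episode} finished, epsilon = {agent.epsilon:.3f}")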