# -*- coding: utf-8 -*-
"""DQN_Agent.py
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/12dYlWkf6l_rophZ5cfo6xVUcMmsM7N3H
"""
# Commented out IPython magic to ensure Python compatibility.
# DQN Model
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# %matplotlib inline
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader
import torch.optim as optim
import yfinance as yf
from collections import deque
import random
import math
from tqdm import tqdm

class DQN(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(state_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 8)
        self.fc4 = nn.Linear(8, action_dim)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        output = torch.softmax(self.fc4(x), dim=-1)
        return output
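
# A quick sanity check (illustrative, not part of the original notebook): assuming a
# hypothetical 10-feature state, the network maps a batch of states to one score per
# action.
if __name__ == "__main__":
    _net = DQN(state_dim=10, action_dim=3)
    print(_net(torch.zeros(4, 10)).shape)   # expected: torch.Size([4, 3])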

# Main Agent Work
class DQN_Agent:
    def __init__(self, state_dim, is_eval=False, model_name=""):
        self.model_type = "DQN"
        self.state_dim = state_dim
        self.action_dim = 3                # hold, sell, and buy
        self.memory = deque(maxlen=100)    # replay memory
        self.buffer_size = 60
        self.gamma = 0.95                  # discount factor for future rewards
        self.epsilon = 1.0                 # initial exploration rate
        self.epsilon_min = 0.01            # minimum exploration rate
        self.epsilon_decay = 0.995         # decay exploration as the agent gets better at trading
        self.is_eval = is_eval
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Load a saved model for evaluation (with proper device mapping);
        # otherwise create a fresh model for training.
        if is_eval:
            self.model = self.create_model().to(self.device)
            if self.device.type == 'cpu':
                self.model.load_state_dict(torch.load(f'{model_name}.pth', map_location=torch.device('cpu')))
            else:
                self.model.load_state_dict(torch.load(f'{model_name}.pth'))
            self.model.eval()
        else:
            self.model = self.create_model().to(self.device)

        self.optimizer = optim.Adam(self.model.parameters(), lr=0.01)
        self.loss_fn = nn.MSELoss()

    def create_model(self):
        return DQN(self.state_dim, self.action_dim)
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        # Epsilon-greedy action selection: explore with probability epsilon during training
        if not self.is_eval and np.random.rand() <= self.epsilon:
            return random.randrange(self.action_dim)
        state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(self.device)
        with torch.no_grad():
            options = self.model(state)
        return torch.argmax(options[0]).item()
    def experience_replay(self, batch_size):
        mini_batch = random.sample(self.memory, min(len(self.memory), batch_size))
        for state, action, reward, next_state, done in mini_batch:
            state = torch.tensor(state, dtype=torch.float32).to(self.device)
            next_state = torch.tensor(next_state, dtype=torch.float32).to(self.device)

            # Bellman target: immediate reward plus discounted best value of the next state
            target = reward
            if not done:
                target = reward + self.gamma * torch.max(self.model(next_state)).item()

            # Predict Q-values for the current state and overwrite only the taken action's value
            target_f = self.model(state).detach().cpu().numpy()
            target_f = target_f.squeeze()  # ensure a 1D array of action values
            target_f[action] = target
            target_f = torch.tensor(target_f, dtype=torch.float32).to(self.device)

            self.optimizer.zero_grad()
            loss = self.loss_fn(self.model(state), target_f)
            loss.backward()
            self.optimizer.step()

        # Decay the exploration rate after each replay pass
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        return loss.item()
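
# A minimal usage sketch (illustrative, not part of the original notebook): it wires the
# agent's act / remember / experience_replay methods into a simple training loop.
# The synthetic random-walk prices, the window-based state, the profit-style reward,
# and the assumed action mapping (0 = hold, 1 = sell, 2 = buy) are stand-ins for the
# sketch, not the original notebook's data pipeline.
if __name__ == "__main__":
    window = 10
    prices = np.cumsum(np.random.randn(500)) + 100.0    # stand-in for real price data

    def get_state(t):
        # price differences over the trailing window as a simple feature vector
        return np.diff(prices[t - window:t + 1]).astype(np.float32)

    agent = DQN_Agent(state_dim=window)
    batch_size = 32

    for episode in range(2):                             # keep the sketch short
        inventory = []
        for t in range(window, len(prices) - 1):
            state = get_state(t)
            action = agent.act(state)
            reward = 0.0
            if action == 2:                              # buy: hold one unit
                inventory.append(prices[t])
            elif action == 1 and inventory:              # sell: realize profit or loss
                reward = prices[t] - inventory.pop(0)
            next_state = get_state(t + 1)
            done = t == len(prices) - 2
            agent.remember(state, action, reward, next_state, done)
            if len(agent.memory) >= batch_size:
                agent.experience_replay(batch_size)
        print(f"episode {episode} finished, epsilon = {agent.epsilon:.3f}")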