import gym
import torch
import argparse
import numpy as np
import torch.optim as optim
from model import Actor, Critic
from utils import get_action
from collections import deque
from running_state import ZFilter
from hparams import HyperParams as hp
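
# Command-line options: which algorithm to train and which MuJoCo environment to use.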
parser = argparse.ArgumentParser()
parser.add_argument('--algorithm', type=str, default='PPO',
                    help='select one of the algorithms: PG, NPG, TRPO, PPO')
parser.add_argument('--env', type=str, default="Hopper-v2",
                    help='name of the MuJoCo environment')
parser.add_argument('--render', action='store_true', default=False,
                    help='render the environment every 50 episodes')
args = parser.parse_args()
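
# Import the matching train_model implementation for the chosen algorithm.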
if args.algorithm == "PG":
    from vanila_pg import train_model
elif args.algorithm == "NPG":
    from npg import train_model
elif args.algorithm == "TRPO":
    from trpo import train_model
elif args.algorithm == "PPO":
    from ppo import train_model
else:
    raise ValueError('--algorithm must be one of PG, NPG, TRPO, PPO')

if __name__ == "__main__":
    # You can choose other environments.
    # Possible environments: Ant-v2, HalfCheetah-v2, Hopper-v2, Humanoid-v2,
    # HumanoidStandup-v2, InvertedPendulum-v2, Reacher-v2, Swimmer-v2, Walker2d-v2
    env = gym.make(args.env)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    print('state size:', num_inputs)
    print('action size:', num_actions)
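
    # Build the policy (actor) and value (critic) networks and their optimizers.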
    actor = Actor(num_inputs, num_actions)
    critic = Critic(num_inputs)
    actor_optim = optim.Adam(actor.parameters(), lr=hp.actor_lr)
    critic_optim = optim.Adam(critic.parameters(), lr=hp.critic_lr,
                              weight_decay=hp.l2_rate)
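
    # Normalize observations with a running mean/std estimate (ZFilter);
    # clip=5 bounds the normalized values.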
    running_state = ZFilter((num_inputs,), clip=5)
    episodes = 0
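
    # Each iteration collects at least 2048 on-policy transitions, then runs one update.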
    for iteration in range(15000):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []
        while steps < 2048:
            episodes += 1
            state = env.reset()
            state = running_state(state)
            score = 0
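
            # Roll out one episode, capped at 10000 steps.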
            for _ in range(10000):
                if args.render and episodes % 50 == 0:
                    env.render()
                steps += 1
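
                # The actor outputs the mean and std of a Gaussian policy;
                # get_action samples an action from that distribution.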
                mu, std, _ = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)
                next_state = running_state(next_state)
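
                # mask is 0 on terminal steps so returns do not bootstrap
                # across episode boundaries during training.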
                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, reward, mask])

                score += reward
                state = next_state

                if done:
                    break
            scores.append(score)

        score_avg = np.mean(scores)
        print('{} episode score is {:.2f}'.format(episodes, score_avg))

        actor.train(), critic.train()
        train_model(actor, critic, memory, actor_optim, critic_optim)
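
# Example usage (assumes MuJoCo, gym, and the model/utils/hparams modules are available):
#   python main.py --algorithm PPO --env Hopper-v2 --render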