
Commit

Default num_envs to 1
noahfarr committed Aug 28, 2024
1 parent d348e9f commit 180e302
Showing 1 changed file with 65 additions and 24 deletions.
89 changes: 65 additions & 24 deletions cleanrl/td3_continuous_action.py
@@ -47,7 +47,7 @@ class Args:
"""total timesteps of the experiments"""
learning_rate: float = 3e-4
"""the learning rate of the optimizer"""
num_envs: int = 2
num_envs: int = 1
"""the number of parallel game environments"""
buffer_size: int = int(1e6)
"""the replay memory buffer size"""
@@ -88,7 +88,8 @@ class QNetwork(nn.Module):
def __init__(self, env):
super().__init__()
self.fc1 = nn.Linear(
np.array(env.single_observation_space.shape).prod() + np.prod(env.single_action_space.shape),
np.array(env.single_observation_space.shape).prod()
+ np.prod(env.single_action_space.shape),
256,
)
self.fc2 = nn.Linear(256, 256)
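The reformatted line above is the critic's first layer: its input width is the observation dimension plus the action dimension, because a Q-network scores (state, action) pairs. A hedged sketch of the whole class for orientation; the `forward` is not visible in this diff, so the concatenation step below is the standard pattern rather than a verbatim copy:

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


class QNetworkSketch(nn.Module):
    def __init__(self, env):
        super().__init__()
        obs_dim = int(np.array(env.single_observation_space.shape).prod())
        act_dim = int(np.prod(env.single_action_space.shape))
        self.fc1 = nn.Linear(obs_dim + act_dim, 256)  # the layer re-wrapped in the hunk above
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 1)

    def forward(self, x, a):
        x = torch.cat([x, a], 1)      # score the (state, action) pair jointly
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)            # one scalar Q(s, a) per batch element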
@@ -158,7 +159,8 @@ def forward(self, x):
writer = SummaryWriter(f"runs/{run_name}")
writer.add_text(
"hyperparameters",
"|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
"|param|value|\n|-|-|\n%s"
% ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
)

# TRY NOT TO MODIFY: seeding
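The hunk above only re-wraps the call that dumps every `Args` field into a Markdown table shown under TensorBoard's Text tab. A tiny illustration (hypothetical two-field args dict) of the string it builds:

args = {"num_envs": 1, "learning_rate": 3e-4}
table = "|param|value|\n|-|-|\n%s" % (
    "\n".join([f"|{key}|{value}|" for key, value in args.items()])
)
print(table)
# |param|value|
# |-|-|
# |num_envs|1|
# |learning_rate|0.0003|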
@@ -171,9 +173,14 @@ def forward(self, x):

# env setup
envs = gym.vector.SyncVectorEnv(
[make_env(args.env_id, args.seed + i, i, args.capture_video, run_name) for i in range(args.num_envs)]
[
make_env(args.env_id, args.seed + i, i, args.capture_video, run_name)
for i in range(args.num_envs)
]
)
assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported"
assert isinstance(
envs.single_action_space, gym.spaces.Box
), "only continuous action space is supported"

actor = Actor(envs).to(device)
qf1 = QNetwork(envs).to(device)
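`SyncVectorEnv` expects a list of zero-argument thunks, one per environment index, and the assertion that follows guards that the action space is continuous (`Box`), since TD3 emits real-valued actions. `make_env` itself is outside this diff; the sketch below is a simplified, hypothetical stand-in (it drops the video-recording branch), not the repository's exact helper:

import gymnasium as gym


def make_env_sketch(env_id, seed, idx, capture_video, run_name):
    def thunk():
        env = gym.make(env_id)
        env = gym.wrappers.RecordEpisodeStatistics(env)  # fills infos["final_info"][i]["episode"]
        env.action_space.seed(seed)
        return env

    return thunk


envs = gym.vector.SyncVectorEnv(
    [make_env_sketch("Hopper-v4", 1 + i, i, False, "demo") for i in range(1)]
)
assert isinstance(envs.single_action_space, gym.spaces.Box)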
@@ -184,7 +191,9 @@ def forward(self, x):
target_actor.load_state_dict(actor.state_dict())
qf1_target.load_state_dict(qf1.state_dict())
qf2_target.load_state_dict(qf2.state_dict())
q_optimizer = optim.Adam(list(qf1.parameters()) + list(qf2.parameters()), lr=args.learning_rate)
q_optimizer = optim.Adam(
list(qf1.parameters()) + list(qf2.parameters()), lr=args.learning_rate
)
actor_optimizer = optim.Adam(list(actor.parameters()), lr=args.learning_rate)

envs.single_observation_space.dtype = np.float32
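The re-wrapped `q_optimizer` above is a single Adam instance over the concatenated parameter lists of both critics, so one `step()` updates `qf1` and `qf2` together while the actor keeps its own optimizer. A minimal self-contained sketch of that pattern (toy linear critics, not the real networks):

import torch
import torch.nn as nn
import torch.optim as optim

qf1, qf2 = nn.Linear(4, 1), nn.Linear(4, 1)
q_optimizer = optim.Adam(list(qf1.parameters()) + list(qf2.parameters()), lr=3e-4)

x = torch.randn(8, 4)
loss = qf1(x).pow(2).mean() + qf2(x).pow(2).mean()  # combined critic loss, like qf_loss
q_optimizer.zero_grad()
loss.backward()
q_optimizer.step()  # both critics are updated by the same optimizer step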
@@ -203,12 +212,18 @@ def forward(self, x):
for global_step in range(args.total_timesteps):
# ALGO LOGIC: put action logic here
if global_step < args.learning_starts:
actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)])
actions = np.array(
[envs.single_action_space.sample() for _ in range(envs.num_envs)]
)
else:
with torch.no_grad():
actions = actor(torch.Tensor(obs).to(device))
actions += torch.normal(0, actor.action_scale * args.exploration_noise)
actions = actions.cpu().numpy().clip(envs.single_action_space.low, envs.single_action_space.high)
actions = (
actions.cpu()
.numpy()
.clip(envs.single_action_space.low, envs.single_action_space.high)
)

# TRY NOT TO MODIFY: execute the game and log data.
next_obs, rewards, terminations, truncations, infos = envs.step(actions)
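The action-selection hunk above does three things: before `learning_starts` it samples uniformly from the action space to seed the replay buffer, afterwards it queries the actor and adds Gaussian exploration noise scaled by the actor's `action_scale`, and finally it clips the result back into the valid bounds. A minimal sketch of the explore-then-clip step with made-up numbers (a stand-in tensor replaces the real actor output):

import numpy as np
import torch

action_low, action_high = np.array([-1.0]), np.array([1.0])
action_scale = torch.tensor((action_high - action_low) / 2.0, dtype=torch.float32)
exploration_noise = 0.1

actions = torch.tanh(torch.randn(1, 1))                        # stand-in for actor(obs)
actions += torch.normal(0.0, action_scale * exploration_noise)  # exploration noise
actions = actions.cpu().numpy().clip(action_low, action_high)   # keep actions in bounds
print(actions.shape)  # (num_envs, action_dim) = (1, 1) here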
@@ -217,9 +232,15 @@ def forward(self, x):
if "final_info" in infos:
for info in infos["final_info"]:
if info is not None:
print(f"global_step={global_step}, episodic_return={info['episode']['r']}")
writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step)
writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step)
print(
f"global_step={global_step}, episodic_return={info['episode']['r']}"
)
writer.add_scalar(
"charts/episodic_return", info["episode"]["r"], global_step
)
writer.add_scalar(
"charts/episodic_length", info["episode"]["l"], global_step
)
break

# TRY NOT TO MODIFY: save data to replay buffer; handle `final_observation`
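Two notes on the hunk above. The episodic stats come from `infos["final_info"]`, which the `RecordEpisodeStatistics` wrapper fills only for environments that just finished, and the `break` logs at most one episode per step. The buffer-insertion lines themselves are collapsed in this view; the sketch below shows the standard Gymnasium pattern such code usually follows (a sketch, not the literal hidden lines): truncated environments report their true terminal observation in `infos["final_observation"]`, and that is what belongs in the replay buffer instead of the auto-reset observation.

import numpy as np

# Stand-ins for the loop variables (hypothetical 1-env, 3-dim observation case).
next_obs = np.ones((1, 3), dtype=np.float32)                       # auto-reset observation
truncations = np.array([True])
infos = {"final_observation": [np.full(3, 2.0, dtype=np.float32)]}

real_next_obs = next_obs.copy()
for idx, trunc in enumerate(truncations):
    if trunc:
        real_next_obs[idx] = infos["final_observation"][idx]
print(real_next_obs)  # [[2. 2. 2.]] -- the true terminal observation is kept
# rb.add(obs, real_next_obs, actions, rewards, terminations, infos) would follow here.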
@@ -236,17 +257,21 @@ def forward(self, x):
if global_step > args.learning_starts:
data = rb.sample(args.batch_size)
with torch.no_grad():
clipped_noise = (torch.randn_like(data.actions, device=device) * args.policy_noise).clamp(
-args.noise_clip, args.noise_clip
) * target_actor.action_scale
clipped_noise = (
torch.randn_like(data.actions, device=device) * args.policy_noise
).clamp(-args.noise_clip, args.noise_clip) * target_actor.action_scale

next_state_actions = (target_actor(data.next_observations) + clipped_noise).clamp(
next_state_actions = (
target_actor(data.next_observations) + clipped_noise
).clamp(
envs.single_action_space.low[0], envs.single_action_space.high[0]
)
qf1_next_target = qf1_target(data.next_observations, next_state_actions)
qf2_next_target = qf2_target(data.next_observations, next_state_actions)
min_qf_next_target = torch.min(qf1_next_target, qf2_next_target)
next_q_value = data.rewards.flatten() + (1 - data.dones.flatten()) * args.gamma * (min_qf_next_target).view(-1)
next_q_value = data.rewards.flatten() + (
1 - data.dones.flatten()
) * args.gamma * (min_qf_next_target).view(-1)

qf1_a_values = qf1(data.observations, data.actions).view(-1)
qf2_a_values = qf2(data.observations, data.actions).view(-1)
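The hunk above re-wraps the two TD3-specific pieces of the target: target-policy smoothing (clipped Gaussian noise added to the target actor's action before it is clamped to the action bounds) and clipped double Q-learning (the minimum of the two target critics). A worked scalar example with made-up numbers, following the same formula y = r + gamma * (1 - d) * min(Q1', Q2'):

import torch

gamma, policy_noise, noise_clip = 0.99, 0.2, 0.5
reward, done = torch.tensor([1.0]), torch.tensor([0.0])

noise = (torch.randn(1) * policy_noise).clamp(-noise_clip, noise_clip)
next_action = (torch.tensor([0.3]) + noise).clamp(-1.0, 1.0)   # smoothed target action

qf1_next, qf2_next = torch.tensor([5.0]), torch.tensor([4.5])  # pretend target-critic outputs
min_q_next = torch.min(qf1_next, qf2_next)                     # clipped double Q -> 4.5
next_q_value = reward.flatten() + (1 - done.flatten()) * gamma * min_q_next.view(-1)
print(next_q_value)  # tensor([5.4550]) = 1.0 + 0.99 * 4.5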
@@ -266,16 +291,32 @@ def forward(self, x):
actor_optimizer.step()

# update the target network
for param, target_param in zip(actor.parameters(), target_actor.parameters()):
target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data)
for param, target_param in zip(qf1.parameters(), qf1_target.parameters()):
target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data)
for param, target_param in zip(qf2.parameters(), qf2_target.parameters()):
target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data)
for param, target_param in zip(
actor.parameters(), target_actor.parameters()
):
target_param.data.copy_(
args.tau * param.data + (1 - args.tau) * target_param.data
)
for param, target_param in zip(
qf1.parameters(), qf1_target.parameters()
):
target_param.data.copy_(
args.tau * param.data + (1 - args.tau) * target_param.data
)
for param, target_param in zip(
qf2.parameters(), qf2_target.parameters()
):
target_param.data.copy_(
args.tau * param.data + (1 - args.tau) * target_param.data
)

if global_step % 100 == 0:
writer.add_scalar("losses/qf1_values", qf1_a_values.mean().item(), global_step)
writer.add_scalar("losses/qf2_values", qf2_a_values.mean().item(), global_step)
writer.add_scalar(
"losses/qf1_values", qf1_a_values.mean().item(), global_step
)
writer.add_scalar(
"losses/qf2_values", qf2_a_values.mean().item(), global_step
)
writer.add_scalar("losses/qf1_loss", qf1_loss.item(), global_step)
writer.add_scalar("losses/qf2_loss", qf2_loss.item(), global_step)
writer.add_scalar("losses/qf_loss", qf_loss.item() / 2.0, global_step)
