Parallel-envs-friendly ppo_continuous_action.py #348

Draft · wants to merge 2 commits into base: master
36 changes: 18 additions & 18 deletions cleanrl/ppo_continuous_action.py
@@ -35,37 +35,37 @@ def parse_args():
         help="whether to capture videos of the agent performances (check out `videos` folder)")

     # Algorithm specific arguments
-    parser.add_argument("--env-id", type=str, default="HalfCheetah-v4",
+    parser.add_argument("--env-id", type=str, default="Ant-v4",
         help="the id of the environment")
-    parser.add_argument("--total-timesteps", type=int, default=1000000,
+    parser.add_argument("--total-timesteps", type=int, default=10000000,
         help="total timesteps of the experiments")
-    parser.add_argument("--learning-rate", type=float, default=3e-4,
+    parser.add_argument("--learning-rate", type=float, default=0.00295,
         help="the learning rate of the optimizer")
-    parser.add_argument("--num-envs", type=int, default=1,
+    parser.add_argument("--num-envs", type=int, default=64,
         help="the number of parallel game environments")
-    parser.add_argument("--num-steps", type=int, default=2048,
+    parser.add_argument("--num-steps", type=int, default=64,
         help="the number of steps to run in each environment per policy rollout")
     parser.add_argument("--anneal-lr", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
         help="Toggle learning rate annealing for policy and value networks")
     parser.add_argument("--gamma", type=float, default=0.99,
         help="the discount factor gamma")
     parser.add_argument("--gae-lambda", type=float, default=0.95,
         help="the lambda for the general advantage estimation")
-    parser.add_argument("--num-minibatches", type=int, default=32,
+    parser.add_argument("--num-minibatches", type=int, default=4,
         help="the number of mini-batches")
-    parser.add_argument("--update-epochs", type=int, default=10,
+    parser.add_argument("--update-epochs", type=int, default=2,
         help="the K epochs to update the policy")
     parser.add_argument("--norm-adv", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
         help="Toggles advantages normalization")
     parser.add_argument("--clip-coef", type=float, default=0.2,
         help="the surrogate clipping coefficient")
-    parser.add_argument("--clip-vloss", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
+    parser.add_argument("--clip-vloss", type=lambda x: bool(strtobool(x)), default=False, nargs="?", const=True,
         help="Toggles whether or not to use a clipped loss for the value function, as per the paper.")
     parser.add_argument("--ent-coef", type=float, default=0.0,
         help="coefficient of the entropy")
-    parser.add_argument("--vf-coef", type=float, default=0.5,
+    parser.add_argument("--vf-coef", type=float, default=1.3,
         help="coefficient of the value function")
-    parser.add_argument("--max-grad-norm", type=float, default=0.5,
+    parser.add_argument("--max-grad-norm", type=float, default=3.5,
         help="the maximum norm for the gradient clipping")
     parser.add_argument("--target-kl", type=float, default=None,
         help="the target KL divergence threshold")
@@ -76,7 +76,7 @@ def parse_args():
     return args


-def make_env(env_id, idx, capture_video, run_name, gamma):
+def make_env(env_id, idx, capture_video, run_name):
     def thunk():
         if capture_video:
             env = gym.make(env_id, render_mode="rgb_array")
@@ -87,11 +87,6 @@ def thunk():
         if capture_video:
             if idx == 0:
                 env = gym.wrappers.RecordVideo(env, f"videos/{run_name}")
-        env = gym.wrappers.ClipAction(env)
-        env = gym.wrappers.NormalizeObservation(env)
-        env = gym.wrappers.TransformObservation(env, lambda obs: np.clip(obs, -10, 10))
-        env = gym.wrappers.NormalizeReward(env, gamma=gamma)
-        env = gym.wrappers.TransformReward(env, lambda reward: np.clip(reward, -10, 10))
         return env

     return thunk
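For readability, here is roughly what `make_env` looks like after this hunk, pieced together from the visible context. The two flatten/statistics wrapper lines are an assumption about the collapsed lines this diff does not show, and the `gymnasium` import is likewise a guess at the branch's import style:

```python
# Sketch of make_env after this change (not copied from the branch verbatim).
import gymnasium as gym  # assumption: the file may instead do `import gym`


def make_env(env_id, idx, capture_video, run_name):
    def thunk():
        if capture_video:
            env = gym.make(env_id, render_mode="rgb_array")
        else:
            env = gym.make(env_id)
        env = gym.wrappers.FlattenObservation(env)       # assumed, hidden in the collapsed context
        env = gym.wrappers.RecordEpisodeStatistics(env)  # assumed, hidden in the collapsed context
        if capture_video:
            if idx == 0:
                env = gym.wrappers.RecordVideo(env, f"videos/{run_name}")
        # ClipAction / NormalizeObservation / NormalizeReward and the clipping
        # transforms were removed here; they now wrap the vector env instead
        # (see the last hunk below).
        return env

    return thunk
```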
@@ -165,9 +160,14 @@ def get_action_and_value(self, x, action=None):
     device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")

     # env setup
-    envs = gym.vector.SyncVectorEnv(
-        [make_env(args.env_id, i, args.capture_video, run_name, args.gamma) for i in range(args.num_envs)]
+    envs = gym.vector.AsyncVectorEnv(
+        [make_env(args.env_id, i, args.capture_video, run_name) for i in range(args.num_envs)]
     )
+    envs = gym.wrappers.ClipAction(envs)
+    envs = gym.wrappers.NormalizeObservation(envs)
+    envs = gym.wrappers.TransformObservation(envs, lambda obs: np.clip(obs, -10, 10))
+    envs = gym.wrappers.NormalizeReward(envs, gamma=args.gamma)
+    envs = gym.wrappers.TransformReward(envs, lambda reward: np.clip(reward, -10, 10))
     assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported"

     agent = Agent(envs).to(device)
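The other half of the change is swapping `SyncVectorEnv` for `AsyncVectorEnv` and normalizing at the vector level, so a single set of running observation/reward statistics is shared across all 64 sub-environments rather than kept per env (my reading of the intent; the PR carries no description here). A minimal, self-contained sketch of the constructor difference, separate from the PR and assuming a gym>=0.26-style API with MuJoCo's `Ant-v4` installed:

```python
# Illustration only, not part of the PR: the two vector-env constructors
# this diff switches between.
import gymnasium as gym  # assumption: the branch may import plain `gym` instead


def make_env(env_id):
    def thunk():
        return gym.make(env_id)
    return thunk


if __name__ == "__main__":  # AsyncVectorEnv spawns worker processes, so guard the entry point
    # Serial: sub-environments step one after another in the main process.
    sync_envs = gym.vector.SyncVectorEnv([make_env("Ant-v4") for _ in range(4)])
    # Parallel: each sub-environment steps in its own worker process.
    async_envs = gym.vector.AsyncVectorEnv([make_env("Ant-v4") for _ in range(4)])

    obs, _ = async_envs.reset(seed=0)
    print(obs.shape)  # (4, obs_dim): one row per parallel environment

    sync_envs.close()
    async_envs.close()
```

The async variant pays some inter-process overhead per step, which tends to be worth it once many MuJoCo environments are simulated in parallel, as with the 64-env default above.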