[WIP] Adding MultiAgent Utilities #323

Status: Open. Wants to merge 46 commits into base: master.

Commits (46)
1d49049
Single actor critic shared params
hades-rp2010 Sep 1, 2020
ef4a179
Shared layers for multi ACs
hades-rp2010 Sep 1, 2020
2ecd086
Merge branch 'master' of https://github.com/SforAiDl/genrl
hades-rp2010 Sep 1, 2020
53450a8
Fix lint errors (1)
hades-rp2010 Sep 1, 2020
274aff9
Fixed tests
hades-rp2010 Sep 1, 2020
38f95f0
Changes to dicstrings and classes
hades-rp2010 Sep 2, 2020
0927001
adding MultiAgentBuffer
AdityaKapoor74 Sep 3, 2020
daa8b2a
shared mlp
AdityaKapoor74 Sep 3, 2020
44db72e
adding changes
AdityaKapoor74 Sep 3, 2020
4ef8f48
new mlp for maddpg
AdityaKapoor74 Sep 3, 2020
d8cf1a9
adding environment loader
AdityaKapoor74 Sep 3, 2020
8d2cf06
Adding Actor and Critic classes
AdityaKapoor74 Sep 3, 2020
1365585
adding new functionalities
AdityaKapoor74 Sep 3, 2020
5067e42
minor changes
AdityaKapoor74 Sep 3, 2020
6f0563e
added return statement to mlp_
AdityaKapoor74 Sep 3, 2020
5061abe
rectifying
AdityaKapoor74 Sep 4, 2020
e6a378c
rectifying 2
AdityaKapoor74 Sep 4, 2020
915d19d
rectifying 3
AdityaKapoor74 Sep 4, 2020
b0b5025
adding test for mlp_concat
AdityaKapoor74 Sep 4, 2020
8cc732b
adding test for mlp_concat
AdityaKapoor74 Sep 4, 2020
b8f7f6a
fixing errors
AdityaKapoor74 Sep 4, 2020
e50e230
adding docstring
AdityaKapoor74 Sep 4, 2020
835819e
Renaming Multi -> Two and comments
hades-rp2010 Sep 4, 2020
793c045
changing names
AdityaKapoor74 Sep 5, 2020
2635fd5
changing names
AdityaKapoor74 Sep 5, 2020
cd87506
Merge branch 'master' of https://github.com/AdityaKapoor74/genrl into…
hades-rp2010 Sep 5, 2020
65b6520
Shared params for single ACs
hades-rp2010 Sep 5, 2020
3d01b85
Merge branch 'multiagentutils' into shared
hades-rp2010 Sep 5, 2020
a62c100
Merge pull request #1 from hades-rp2010/shared
AdityaKapoor74 Oct 4, 2020
841ff66
Merge branch 'master' into multiagentutils
AdityaKapoor74 Oct 4, 2020
2be8df5
rollout buffer for MA
AdityaKapoor74 Oct 4, 2020
ac9b5a8
Merge branch 'multiagentutils' of https://github.com/AdityaKapoor74/g…
AdityaKapoor74 Oct 4, 2020
10282f0
Update genrl/utils/utils.py
AdityaKapoor74 Oct 4, 2020
79b531b
Update genrl/agents/deep/ppo1/ppo1.py
AdityaKapoor74 Oct 4, 2020
a3885a0
Update genrl/core/actor_critic.py
AdityaKapoor74 Oct 4, 2020
e3dc677
Update genrl/core/actor_critic.py
AdityaKapoor74 Oct 4, 2020
4c2ad51
Update genrl/core/actor_critic.py
AdityaKapoor74 Oct 4, 2020
43554e4
Update genrl/core/actor_critic.py
AdityaKapoor74 Oct 4, 2020
6828e93
removing SharedAC class
AdityaKapoor74 Oct 4, 2020
eac920c
removing SharedAC class
AdityaKapoor74 Oct 4, 2020
194065f
rectify
AdityaKapoor74 Oct 4, 2020
c0198bc
rectify
AdityaKapoor74 Oct 4, 2020
fe40835
rectify
AdityaKapoor74 Oct 4, 2020
a50204a
rectifying
AdityaKapoor74 Oct 4, 2020
4a3cd74
removing unecessary code
AdityaKapoor74 Oct 4, 2020
602a7b5
removing unecessary code
AdityaKapoor74 Oct 4, 2020
160 changes: 159 additions & 1 deletion genrl/core/actor_critic.py
@@ -8,7 +8,7 @@
from genrl.core.base import BaseActorCritic
from genrl.core.policies import MlpPolicy
from genrl.core.values import MlpValue
from genrl.utils.utils import cnn
from genrl.utils.utils import cnn, shared_mlp


class MlpActorCritic(BaseActorCritic):
@@ -216,10 +216,168 @@ def get_value(self, inp: torch.Tensor) -> torch.Tensor:
return value


class SharedActorCritic(BaseActorCritic):
    """Actor Critic network with layers shared between the actor and the critic

    :param critic_prev: Sizes of the critic-only layers before the shared trunk
    :param actor_prev: Sizes of the actor-only layers before the shared trunk
    :param shared: Sizes of the layers shared between the actor and the critic
    :param critic_post: Sizes of the critic-only layers after the shared trunk
    :param actor_post: Sizes of the actor-only layers after the shared trunk
    :param weight_init: Weight initialisation scheme
    :param activation_func: Activation function for the layers
    """

    def __init__(
        self,
        critic_prev,
        actor_prev,
        shared,
        critic_post,
        actor_post,
        weight_init,
        activation_func,
    ):
        super(SharedActorCritic, self).__init__()

        self.critic, self.actor = shared_mlp(
            critic_prev,
            actor_prev,
            shared,
            critic_post,
            actor_post,
            weight_init,
            activation_func,
        )
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def forward(self, state_critic, state_action):
        # Routes the input through the critic if `state_critic` is given,
        # otherwise through the actor
        if state_critic is not None:
            return self.critic(state_critic)
        if state_action is not None:
            return self.actor(state_action)

    def get_action(self, state, one_hot=False, deterministic=False):
        """Returns an action for the given state"""
        logits = self.forward(None, state)
        if one_hot:
            # eps=0.0 returns the greedy one-hot action, eps=1.0 a random one-hot action
            eps = 0.0 if deterministic else 1.0
            return self.onehot_from_logits(logits, eps=eps)

        dist = F.softmax(logits, dim=-1)
        probs = Categorical(dist)
        if deterministic:
            index = torch.argmax(dist)
        else:
            index = probs.sample().cpu().detach().item()
        return index

    def onehot_from_logits(self, logits, eps=0.0):
        # get best (according to current policy) actions in one-hot form
        argmax_acs = (logits == logits.max(-1, keepdim=True)[0]).float()
        if eps == 0.0:
            return argmax_acs
        # get random actions in one-hot form
        rand_acs = torch.eye(logits.shape[1])[
            np.random.choice(range(logits.shape[1]), size=logits.shape[0])
        ]
        # choose between best and random actions using epsilon greedy
        return torch.stack(
            [
                argmax_acs[i] if r > eps else rand_acs[i]
                for i, r in enumerate(torch.rand(logits.shape[0]))
            ]
        )

    def get_value(self, state):
        """Returns the value of the given state"""
        value = self.forward(state, None)
        return value



class Actor(BaseActorCritic):
    """MLP Actor network

    :param state_dim: State dimensions of the environment
    :param action_dim: Action space dimensions of the environment
    :param policy_layers: Hidden layer sizes of the policy network
    :param discrete: True if the action space is discrete, else False
    """

    def __init__(
        self,
        state_dim: spaces.Space,
        action_dim: spaces.Space,
        policy_layers: Tuple = (32, 32),
        discrete: bool = True,
        **kwargs,
    ):
        super(Actor, self).__init__()

        self.actor = MlpPolicy(state_dim, action_dim, policy_layers, discrete, **kwargs)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def forward(self, policy):
        policy = self.actor(policy)
        return policy

    def get_action(self, state, one_hot=False, deterministic=False):
        """Returns an action for the given state"""
        logits = self.forward(state)
        if one_hot:
            # eps=0.0 returns the greedy one-hot action, eps=1.0 a random one-hot action
            eps = 0.0 if deterministic else 1.0
            return self.onehot_from_logits(logits, eps=eps)

        dist = F.softmax(logits, dim=-1)
        probs = Categorical(dist)
        if deterministic:
            index = torch.argmax(dist)
        else:
            index = probs.sample().cpu().detach().item()
        return index

    def onehot_from_logits(self, logits, eps=0.0):
        # get best (according to current policy) actions in one-hot form
        argmax_acs = (logits == logits.max(-1, keepdim=True)[0]).float()
        if eps == 0.0:
            return argmax_acs
        # get random actions in one-hot form
        rand_acs = torch.eye(logits.shape[1])[
            np.random.choice(range(logits.shape[1]), size=logits.shape[0])
        ]
        # choose between best and random actions using epsilon greedy
        return torch.stack(
            [
                argmax_acs[i] if r > eps else rand_acs[i]
                for i, r in enumerate(torch.rand(logits.shape[0]))
            ]
        )


class Critic(BaseActorCritic):
    """MLP Critic (value) network

    :param state_dim: State dimensions of the environment
    :param action_dim: Action space dimensions of the environment
    :param value_layers: Hidden layer sizes of the value network
    :param val_type: Type of value function to be used
    """

    def __init__(
        self,
        state_dim: spaces.Space,
        action_dim: spaces.Space,
        policy_layers: Tuple = (32, 32),
        value_layers: Tuple = (32, 32),
        val_type: str = "V",
        discrete: bool = True,
        **kwargs,
    ):
        super(Critic, self).__init__()

        self.critic = MlpValue(state_dim, action_dim, val_type, value_layers, **kwargs)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def forward(self, value):
        value = self.critic(value)
        return value

    def get_value(self, state):
        """Returns the value of the given state"""
        value = self.forward(state)
        return value


actor_critic_registry = {
    "mlp": MlpActorCritic,
    "cnn": CNNActorCritic,
    "mlp12": MlpSingleActorMultiCritic,
    "mlpshared": SharedActorCritic,
}


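For reviewers, here is a minimal usage sketch of the new SharedActorCritic (not part of the diff). It assumes shared_mlp accepts lists of layer sizes for the critic-only, actor-only, and shared segments and returns the (critic, actor) pair used above; the layer sizes, weight-init key, and activation name below are illustrative assumptions, not values taken from the PR.

```python
import torch

from genrl.core.actor_critic import SharedActorCritic

# Assumed layer-size format: actor and critic each have their own input layers,
# share a middle trunk, then split into separate output heads.
ac = SharedActorCritic(
    critic_prev=[8, 64],           # critic-only layers (global state -> 64)
    actor_prev=[4, 64],            # actor-only layers (local state -> 64)
    shared=[64, 64],               # trunk shared by actor and critic
    critic_post=[64, 1],           # critic head -> scalar value
    actor_post=[64, 2],            # actor head -> action logits
    weight_init="xavier_uniform",  # assumed string key understood by shared_mlp
    activation_func="relu",        # assumed string key understood by shared_mlp
)

local_state = torch.randn(4)   # actor input
global_state = torch.randn(8)  # critic input

action = ac.get_action(local_state, one_hot=False, deterministic=False)
value = ac.get_value(global_state)
```

The class is also exposed through the "mlpshared" key of actor_critic_registry above.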
87 changes: 87 additions & 0 deletions genrl/core/buffers.py
@@ -23,6 +23,13 @@ class PrioritizedReplayBufferSamples(NamedTuple):
indices: torch.Tensor
weights: torch.Tensor

class MultiAgentReplayBufferSamples(NamedTuple):
    states: torch.Tensor
    actions: torch.Tensor
    rewards: torch.Tensor
    next_states: torch.Tensor
    dones: torch.Tensor


class ReplayBuffer:
"""
@@ -181,3 +188,83 @@ def __len__(self) -> int:
@property
def pos(self):
return len(self.buffer)



class MultiAgentReplayBuffer:
    """
    Implements the basic Experience Replay Mechanism for MultiAgents by feeding in global states,
    global actions, global rewards, global next_states, global dones

    :param capacity: Size of the replay buffer
    :type capacity: int
    :param num_agents: Number of agents in the environment
    :type num_agents: int
    """

    def __init__(self, num_agents, capacity):
        self.capacity = capacity
        self.num_agents = num_agents
        self.buffer = deque(maxlen=capacity)

    def push(self, inp: Tuple) -> None:
        """
        Adds new experience to buffer

        :param inp: Tuple containing `state`, `action`, `reward`, `next_state` and `done`
        :type inp: tuple
        :returns: None
        """
        self.buffer.append(inp)

    def sample(self, batch_size):
        """
        Returns randomly sampled experiences from replay memory

        :param batch_size: Number of samples per batch
        :type batch_size: int
        :returns: Tuple composed of `indiv_obs_batch`, `indiv_action_batch`, `indiv_reward_batch`,
            `indiv_next_obs_batch`, `global_state_batch`, `global_actions_batch`,
            `global_next_state_batch` and `done_batch`
        """
        # Per-agent batches: [[transitions of agent 1], ..., [transitions of agent n]]
        indiv_obs_batch = [[] for _ in range(self.num_agents)]
        indiv_action_batch = [[] for _ in range(self.num_agents)]
        indiv_reward_batch = [[] for _ in range(self.num_agents)]
        indiv_next_obs_batch = [[] for _ in range(self.num_agents)]

        # Global batches, concatenated over agents
        global_state_batch = []
        global_next_state_batch = []
        global_actions_batch = []
        done_batch = []

        batch = random.sample(self.buffer, batch_size)

        for experience in batch:
            state, action, reward, next_state, done = experience

            for i in range(self.num_agents):
                indiv_obs_batch[i].append(state[i])
                indiv_action_batch[i].append(action[i])
                indiv_reward_batch[i].append(reward[i])
                indiv_next_obs_batch[i].append(next_state[i])

            global_state_batch.append(torch.cat(state))
            global_actions_batch.append(torch.cat(action))
            global_next_state_batch.append(torch.cat(next_state))
            done_batch.append(done)

        global_state_batch = torch.stack(global_state_batch)
        global_actions_batch = torch.stack(global_actions_batch)
        global_next_state_batch = torch.stack(global_next_state_batch)
        done_batch = torch.stack(done_batch)
        indiv_obs_batch = torch.stack([torch.stack(obs) for obs in indiv_obs_batch])
        indiv_action_batch = torch.stack([torch.stack(act) for act in indiv_action_batch])
        indiv_reward_batch = torch.stack([torch.FloatTensor(rew) for rew in indiv_reward_batch])
        indiv_next_obs_batch = torch.stack(
            [torch.stack(next_obs) for next_obs in indiv_next_obs_batch]
        )

        return (
            indiv_obs_batch,
            indiv_action_batch,
            indiv_reward_batch,
            indiv_next_obs_batch,
            global_state_batch,
            global_actions_batch,
            global_next_state_batch,
            done_batch,
        )

    def __len__(self):
        return len(self.buffer)
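A short usage sketch of MultiAgentReplayBuffer (not part of the diff), assuming each pushed transition is a tuple of per-agent lists of tensors, which is what sample() above unpacks; the agent count and dimensions are illustrative:

```python
import torch

from genrl.core.buffers import MultiAgentReplayBuffer

num_agents, obs_dim, act_dim = 2, 4, 2
buffer = MultiAgentReplayBuffer(num_agents=num_agents, capacity=1000)

# Each element of a transition is a per-agent list of tensors
for _ in range(32):
    state = [torch.randn(obs_dim) for _ in range(num_agents)]
    action = [torch.randn(act_dim) for _ in range(num_agents)]
    reward = [0.0 for _ in range(num_agents)]
    next_state = [torch.randn(obs_dim) for _ in range(num_agents)]
    done = torch.zeros(num_agents)
    buffer.push((state, action, reward, next_state, done))

(
    indiv_obs,
    indiv_actions,
    indiv_rewards,
    indiv_next_obs,
    global_states,
    global_actions,
    global_next_states,
    dones,
) = buffer.sample(batch_size=8)

print(indiv_obs.shape)      # (num_agents, batch_size, obs_dim)
print(global_states.shape)  # (batch_size, num_agents * obs_dim)
```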
104 changes: 104 additions & 0 deletions genrl/environments/gym_wrapper.py
@@ -106,3 +106,107 @@ def close(self) -> None:
Closes environment
"""
self.env.close()


class MultiGymWrapper(gym.Wrapper):
Member: What's the difference between this and a normal Wrapper? Can't you use a TorchWrapper instead of creating this?

Contributor (Author): This is just a template, I haven't added anything yet.

"""
Wrapper class for all MultiAgent Particle Environments

:param env: Gym environment name
:param n_envs: Number of environments. None if not vectorised
:param parallel: If vectorised, should environments be run through \
serially or parallelly
:type env: string
:type n_envs: None, int
:type parallel: boolean
"""

def __init__(self, env: gym.Env):
super(GymWrapper, self).__init__(env)
self.env = env

self.observation_space = self.env.observation_space
self.action_space = self.env.action_space

self.state = None
self.action = None
self.reward = None
self.done = False
self.info = {}

def __getattr__(self, name: str) -> Any:
"""
All other calls would go to base env
"""
env = super(GymWrapper, self).__getattribute__("env")
return getattr(env, name)

@property
def obs_shape(self):
if isinstance(self.env.observation_space, gym.spaces.Discrete):
obs_shape = (1,)
elif isinstance(self.env.observation_space, gym.spaces.Box):
obs_shape = self.env.observation_space.shape
return obs_shape

@property
def action_shape(self):
if isinstance(self.env.action_space, gym.spaces.Box):
action_shape = self.env.action_space.shape
elif isinstance(self.env.action_space, gym.spaces.Discrete):
action_shape = (1,)
return action_shape

def sample(self) -> np.ndarray:
"""
Shortcut method to directly sample from environment's action space

:returns: Random action from action space
:rtype: NumPy Array
"""
return self.env.action_space.sample()

def render(self, mode: str = "human") -> None:
"""
Renders all envs in a tiles format similar to baselines.

:param mode: Can either be 'human' or 'rgb_array'. \
Displays tiled images in 'human' and returns tiled images in 'rgb_array'
:type mode: string
"""
self.env.render(mode=mode)

def seed(self, seed: int = None) -> None:
"""
Set environment seed

:param seed: Value of seed
:type seed: int
"""
self.env.seed(seed)

def step(self, action: np.ndarray) -> np.ndarray:
"""
Steps the env through given action

:param action: Action taken by agent
:type action: NumPy array
:returns: Next observation, reward, game status and debugging info
"""
self.state, self.reward, self.done, self.info = self.env.step(action)
self.action = action
return self.state, self.reward, self.done, self.info

def reset(self) -> np.ndarray:
"""
Resets environment

:returns: Initial state
"""
return self.env.reset()

def close(self) -> None:
"""
Closes environment
"""
self.env.close()
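Since MultiGymWrapper is still a template, the intended usage is roughly as sketched below (not part of the diff). The make_env helper and its import path are hypothetical stand-ins for however a multi-agent particle environment is constructed; only the wrapper calls mirror the code above.

```python
from genrl.environments.gym_wrapper import MultiGymWrapper

# Hypothetical constructor for a multi-agent particle environment;
# not part of this PR or of genrl.
from make_env import make_env  # hypothetical import

env = MultiGymWrapper(make_env("simple_spread"))

states = env.reset()
for _ in range(10):
    # sample() proxies to the wrapped env's action_space.sample()
    actions = env.sample()
    states, rewards, dones, info = env.step(actions)
env.close()
```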