
Commit 93b3efc
Merge remote-tracking branch 'origin/v4-dev' into a2c-vec-act
kengz committed Apr 21, 2019
2 parents d823efe + 755429a commit 93b3efc
Showing 5 changed files with 40 additions and 34 deletions.
16 changes: 9 additions & 7 deletions slm_lab/env/__init__.py
@@ -52,28 +52,30 @@ def get_base_clock(self):
     @lab_api
     def reset(self):
         logger.debug3('EnvSpace.reset')
-        _reward_v, state_v, done_v = self.aeb_space.init_data_v(ENV_DATA_NAMES)
+        state_v, _reward_v, done_v = self.aeb_space.init_data_v(ENV_DATA_NAMES)
         for env in self.envs:
             _reward_e, state_e, done_e = env.space_reset()
             state_v[env.e, 0:len(state_e)] = state_e
             done_v[env.e, 0:len(done_e)] = done_e
-        _reward_space, state_space, done_space = self.aeb_space.add(ENV_DATA_NAMES, (_reward_v, state_v, done_v))
+        state_space, _reward_space, done_space = self.aeb_space.add(ENV_DATA_NAMES, (state_v, _reward_v, done_v))
         logger.debug3(f'\nstate_space: {state_space}')
         return _reward_space, state_space, done_space

     @lab_api
     def step(self, action_space):
-        reward_v, state_v, done_v = self.aeb_space.init_data_v(ENV_DATA_NAMES)
+        state_v, reward_v, done_v = self.aeb_space.init_data_v(ENV_DATA_NAMES)
+        info_v = []
         for env in self.envs:
             e = env.e
             action_e = action_space.get(e=e)
-            reward_e, state_e, done_e = env.space_step(action_e)
+            state_e, reward_e, done_e, info_e = env.space_step(action_e)
             reward_v[e, 0:len(reward_e)] = reward_e
             state_v[e, 0:len(state_e)] = state_e
             done_v[e, 0:len(done_e)] = done_e
-        reward_space, state_space, done_space = self.aeb_space.add(ENV_DATA_NAMES, (reward_v, state_v, done_v))
-        logger.debug3(f'\nreward_space: {reward_space}\nstate_space: {state_space}\ndone_space: {done_space}')
-        return reward_space, state_space, done_space
+            info_v.append(info_e)
+        state_space, reward_space, done_space = self.aeb_space.add(ENV_DATA_NAMES, (state_v, reward_v, done_v))
+        logger.debug3(f'\nstate_space: {state_space}\nreward_space: {reward_space}\ndone_space: {done_space}')
+        return state_space, reward_space, done_space, info_v

     @lab_api
     def close(self):
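The reordering above only works because the value tuples handed to init_data_v and add line up positionally with ENV_DATA_NAMES (now ['state', 'reward', 'done'], defined in base.py below). A minimal sketch of that positional invariant, with aeb_space.add reduced to a hypothetical dict-based stand-in for illustration only:

# Illustrative stand-in, not the lab's AEBSpace implementation.
ENV_DATA_NAMES = ['state', 'reward', 'done']

def add_stub(data_names, data_v):
    # names and values are paired by position, so the tuple order must match data_names
    return dict(zip(data_names, data_v))

state_v, reward_v, done_v = [[0.1, 0.2]], [0.0], [False]  # dummy per-env data
spaces = add_stub(ENV_DATA_NAMES, (state_v, reward_v, done_v))
assert list(spaces) == ENV_DATA_NAMES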
6 changes: 3 additions & 3 deletions slm_lab/env/base.py
@@ -5,7 +5,7 @@
 import numpy as np
 import time

-ENV_DATA_NAMES = ['reward', 'state', 'done']
+ENV_DATA_NAMES = ['state', 'reward', 'done']
 NUM_EVAL_EPI = 100  # set the number of episodes to eval a model ckpt
 logger = logger.get_logger(__name__)

@@ -162,7 +162,7 @@ def reset(self):
     @abstractmethod
     @lab_api
     def step(self, action):
-        '''Step method, return reward, state, done'''
+        '''Step method, return state, reward, done, info'''
         raise NotImplementedError

     @abstractmethod
@@ -192,5 +192,5 @@ def space_reset(self):

     @lab_api
     def space_step(self, action_e):
-        '''Space (multi-env) step method, return reward_e, state_e, done_e'''
+        '''Space (multi-env) step method, return state_e, reward_e, done_e, info_e'''
         raise NotImplementedError
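As the updated docstrings state, step() now follows the Gym-style ordering (state, reward, done, info). A minimal, illustrative rollout loop against that contract — StubEnv is a made-up placeholder, not one of the lab's env classes:

class StubEnv:
    '''Hypothetical env obeying the new (state, reward, done, info) step contract.'''

    def reset(self):
        self.t = 0
        return 0.0  # initial state

    def step(self, action):
        self.t += 1
        # Gym-style ordering: state first, then reward, done, and an info dict
        return float(self.t), 1.0, self.t >= 5, {}

env = StubEnv()
state, done, total_reward = env.reset(), False, 0.0
while not done:
    action = 0  # placeholder policy
    state, reward, done, info = env.step(action)
    total_reward += reward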
25 changes: 13 additions & 12 deletions slm_lab/env/openai.py
@@ -64,15 +64,14 @@ def reset(self):
     def step(self, action):
         if not self.is_discrete:  # guard for continuous
             action = np.array([action])
-        state, reward, done, _info = self.u_env.step(action)
+        state, reward, done, info = self.u_env.step(action)
         reward *= self.reward_scale
         if util.to_render():
             self.u_env.render()
-        # if self.max_t is not None:
-        #     done = done or self.clock.t > self.max_t
+        # done = done or self.clock.t > self.max_t
         self.done = done
-        logger.debug(f'Env {self.e} step reward: {reward}, state: {state}, done: {done}')
-        return reward, state, done
+        logger.debug(f'Env {self.e} step state: {state}, reward: {reward}, done: {done}')
+        return state, reward, done, info

     @lab_api
     def close(self):
@@ -90,7 +89,7 @@ def space_init(self, env_space):

     @lab_api
     def space_reset(self):
-        _reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
+        state_e, _reward_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
         for ab, body in util.ndenumerate_nonan(self.body_e):
             state = self.u_env.reset()
             state_e[ab] = state
@@ -104,18 +103,20 @@ def space_reset(self):
     def space_step(self, action_e):
         action = action_e[(0, 0)]  # single body
         if self.done:  # space envs run continually without a central reset signal
-            return self.space_reset()
+            _reward_e, state_e, done_e = self.space_reset()
+            return state_e, _reward_e, done_e, None
         if not self.is_discrete:
             action = np.array([action])
-        state, reward, done, _info = self.u_env.step(action)
+        state, reward, done, info = self.u_env.step(action)
         reward *= self.reward_scale
         if util.to_render():
             self.u_env.render()
         self.done = done = done or self.clock.t > self.max_t
-        reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
+        state_e, reward_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
         for ab, body in util.ndenumerate_nonan(self.body_e):
-            reward_e[ab] = reward
             state_e[ab] = state
+            reward_e[ab] = reward
             done_e[ab] = done
-        logger.debug(f'Env {self.e} step reward_e: {reward_e}, state_e: {state_e}, done_e: {done_e}')
-        return reward_e, state_e, done_e
+        info_e = info
+        logger.debug(f'Env {self.e} step state_e: {state_e}, reward_e: {reward_e}, done_e: {done_e}')
+        return state_e, reward_e, done_e, info_e
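One behavioral detail worth noting: when a space env is already done, space_step() short-circuits through space_reset() and returns None in the info slot. A small illustrative guard for downstream consumers — the helper name is made up, not part of the lab:

def unpack_space_step(result):
    # Normalize a None info (returned right after an internal auto-reset) to an empty dict.
    state_e, reward_e, done_e, info_e = result
    if info_e is None:
        info_e = {}
    return state_e, reward_e, done_e, info_e

# example with the auto-reset case
state_e, reward_e, done_e, info_e = unpack_space_step(([0.0], [0.0], [False], None))
assert info_e == {}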
20 changes: 11 additions & 9 deletions slm_lab/env/unity.py
@@ -141,12 +141,12 @@ def step(self, action):
         env_info_dict = self.u_env.step(action)
         a, b = 0, 0  # default singleton aeb
         env_info_a = self._get_env_info(env_info_dict, a)
-        reward = env_info_a.rewards[b] * self.reward_scale
         state = env_info_a.states[b]
+        reward = env_info_a.rewards[b] * self.reward_scale
         done = env_info_a.local_done[b]
         self.done = done = done or self.clock.t > self.max_t
-        logger.debug(f'Env {self.e} step reward: {reward}, state: {state}, done: {done}')
-        return reward, state, done
+        logger.debug(f'Env {self.e} step state: {state}, reward: {reward}, done: {done}')
+        return state, reward, done, env_info_a

     @lab_api
     def close(self):
@@ -167,7 +167,7 @@ def space_reset(self):
         self._check_u_brain_to_agent()
         self.done = False
         env_info_dict = self.u_env.reset(train_mode=(util.get_lab_mode() != 'dev'), config=self.env_spec.get('unity'))
-        _reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
+        state_e, _reward_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
         for (a, b), body in util.ndenumerate_nonan(self.body_e):
             env_info_a = self._get_env_info(env_info_dict, a)
             self._check_u_agent_to_body(env_info_a, a)
@@ -181,15 +181,17 @@ def space_step(self, action_e):
     def space_step(self, action_e):
         # TODO implement clock_speed: step only if self.clock.to_step()
         if self.done:
-            return self.space_reset()
+            _reward_e, state_e, done_e = self.space_reset()
+            return state_e, _reward_e, done_e, None
         action_e = util.nanflatten(action_e)
         env_info_dict = self.u_env.step(action_e)
-        reward_e, state_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
+        state_e, reward_e, done_e = self.env_space.aeb_space.init_data_s(ENV_DATA_NAMES, e=self.e)
         for (a, b), body in util.ndenumerate_nonan(self.body_e):
             env_info_a = self._get_env_info(env_info_dict, a)
-            reward_e[(a, b)] = env_info_a.rewards[b] * self.reward_scale
             state_e[(a, b)] = env_info_a.states[b]
+            reward_e[(a, b)] = env_info_a.rewards[b] * self.reward_scale
             done_e[(a, b)] = env_info_a.local_done[b]
+        info_e = env_info_dict
         self.done = (util.nonan_all(done_e) or self.clock.t > self.max_t)
-        logger.debug(f'Env {self.e} step reward_e: {reward_e}, state_e: {state_e}, done_e: {done_e}')
-        return reward_e, state_e, done_e
+        logger.debug(f'Env {self.e} step state_e: {state_e}, reward_e: {reward_e}, done_e: {done_e}')
+        return state_e, reward_e, done_e, info_e
7 changes: 4 additions & 3 deletions slm_lab/experiment/control.py
@@ -76,7 +76,8 @@ def run_eval_episode(self):
         while not done:
             self.eval_env.clock.tick('t')
             action = self.agent.act(state)
-            reward, state, done = self.eval_env.step(action)
+            next_state, reward, done, info = self.eval_env.step(action)
+            state = next_state
             total_reward += reward
         # exit eval context, restore variables simply by updating
         self.agent.algorithm.update()
@@ -109,7 +110,7 @@ def run_episode(self):
                 fps = 0 if wall_t == 0 else total_t / wall_t
                 print(f'total_t: {total_t}, fps: {fps}')
             vaction = np.asarray(vaction)
-            vreward, vnext_state, vdone = self.env.step(vaction)
+            vnext_state, vreward, vdone, info = self.env.step(vaction)
             if vdone[0]:
                 reward_history.append(total_reward)
                 avg_reward = np.mean(reward_history)
@@ -196,7 +197,7 @@ def run_all_episodes(self):
             self.try_ckpt(self.agent_space, self.env_space)
             all_done = self.aeb_space.tick()
             action_space = self.agent_space.act(state_space)
-            reward_space, next_state_space, done_space = self.env_space.step(action_space)
+            next_state_space, reward_space, done_space, info_v = self.env_space.step(action_space)
             self.agent_space.update(state_space, action_space, reward_space, next_state_space, done_space)
             state_space = next_state_space
             self.try_ckpt(self.agent_space, self.env_space)
