From 3e4f75e2f1260d09c15d9d2fae2b9497de5ecb68 Mon Sep 17 00:00:00 2001
From: Mr-Yellow
Date: Fri, 2 Sep 2016 15:02:13 +1000
Subject: [PATCH 1/7] Allow overwriting action by environment

---
 Master.lua              | 8 ++++++--
 async/NStepQAgent.lua   | 7 ++++++-
 async/OneStepQAgent.lua | 7 +++++--
 3 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/Master.lua b/Master.lua
index a8b07f5..c7bf9a8 100644
--- a/Master.lua
+++ b/Master.lua
@@ -77,7 +77,7 @@ function Master:train()
   -- Catch CTRL-C to save
   self:catchSigInt()
 
-  local reward, state, terminal = 0, self.env:start(), false
+  local reward, state, terminal = 0, taken, self.env:start(), false
 
   -- Set environment and agent to training mode
   self.env:training()
@@ -97,7 +97,11 @@ function Master:train()
     local action = self.agent:observe(reward, state, terminal) -- As results received, learn in training mode
     if not terminal then
       -- Act on environment (to cause transition)
-      reward, state, terminal = self.env:step(action)
+      reward, state, terminal, taken = self.env:step(action)
+      -- Update experience memory with actual action
+      if taken and taken ~= action then
+        self.agent.memory.actions[self.agent.memory.index] = taken
+      end
       -- Track score
       episodeScore = episodeScore + reward
     else
diff --git a/async/NStepQAgent.lua b/async/NStepQAgent.lua
index 520de07..23bfad4 100644
--- a/async/NStepQAgent.lua
+++ b/async/NStepQAgent.lua
@@ -32,6 +32,8 @@ function NStepQAgent:learn(steps, from)
 
   log.info('NStepQAgent starting | steps=%d | ε=%.2f -> %.2f', steps, self.epsilon, self.epsilonEnd)
   local reward, terminal, state = self:start()
+  local taken
+
   self.states:resize(self.batchSize, table.unpack(state:size():totable()))
   self.tic = torch.tic()
   repeat
@@ -44,7 +46,10 @@
     local action = self:eGreedy(state, self.policyNet_)
     self.actions[self.batchIdx] = action
 
-    reward, terminal, state = self:takeAction(action)
+    reward, terminal, state, taken = self:takeAction(action)
+    if taken and taken ~= action then
+      self.actions[self.batchIdx] = taken
+    end
     self.rewards[self.batchIdx] = reward
 
     self:progress(steps)
diff --git a/async/OneStepQAgent.lua b/async/OneStepQAgent.lua
index 4a75416..675cd6b 100644
--- a/async/OneStepQAgent.lua
+++ b/async/OneStepQAgent.lua
@@ -24,13 +24,16 @@ function OneStepQAgent:learn(steps, from)
   log.info('%s starting | steps=%d | ε=%.2f -> %.2f', self.agentName, steps, self.epsilon, self.epsilonEnd)
 
   local reward, terminal, state = self:start()
-  local action, state_
+  local action, state_, taken
 
   self.tic = torch.tic()
   for step1=1,steps do
     if not terminal then
       action = self:eGreedy(state, self.policyNet)
-      reward, terminal, state_ = self:takeAction(action)
+      reward, terminal, state_, taken = self:takeAction(action)
+      if taken and taken ~= action then
+        action = taken
+      end
     else
       reward, terminal, state_ = self:start()
     end

From 20308c89b9701d472fa3c3090726c398329828e0 Mon Sep 17 00:00:00 2001
From: Mr-Yellow
Date: Fri, 2 Sep 2016 15:07:04 +1000
Subject: [PATCH 2/7] Pass taken from environment through AsyncAgent

---
 async/AsyncAgent.lua | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/async/AsyncAgent.lua b/async/AsyncAgent.lua
index 39ddb71..6cf3bff 100644
--- a/async/AsyncAgent.lua
+++ b/async/AsyncAgent.lua
@@ -78,7 +78,7 @@ end
 
 
 function AsyncAgent:takeAction(action)
-  local reward, rawObservation, terminal = self.env:step(action - self.actionOffset)
+  local reward, rawObservation, terminal, taken = self.env:step(action - self.actionOffset)
   if self.rewardClip > 0 then
     reward = math.max(reward, -self.rewardClip)
     reward = math.min(reward, self.rewardClip)
@@ -91,7 +91,7 @@ function AsyncAgent:takeAction(action)
     self.stateBuffer:push(observation)
   end
 
-  return reward, terminal, self.stateBuffer:readAll()
+  return reward, terminal, self.stateBuffer:readAll(), taken
 end
 
 
From a27e66d226f25f775de9bb2bfbbd31bd71b32d06 Mon Sep 17 00:00:00 2001
From: Mr-Yellow
Date: Sun, 4 Sep 2016 07:40:44 +1000
Subject: [PATCH 3/7] Refactor to actionTaken and include actionOffset

---
 Master.lua              | 8 ++++----
 async/A3CAgent.lua      | 9 +++++++--
 async/AsyncAgent.lua    | 4 ++--
 async/NStepQAgent.lua   | 8 ++++----
 async/OneStepQAgent.lua | 8 ++++----
 5 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/Master.lua b/Master.lua
index c7bf9a8..b1f96eb 100644
--- a/Master.lua
+++ b/Master.lua
@@ -77,7 +77,7 @@ function Master:train()
   -- Catch CTRL-C to save
   self:catchSigInt()
 
-  local reward, state, terminal = 0, taken, self.env:start(), false
+  local reward, state, terminal, actionTaken = 0, self.env:start(), false, false
 
   -- Set environment and agent to training mode
   self.env:training()
@@ -97,10 +97,10 @@ function Master:train()
     local action = self.agent:observe(reward, state, terminal) -- As results received, learn in training mode
     if not terminal then
       -- Act on environment (to cause transition)
-      reward, state, terminal, taken = self.env:step(action)
+      reward, state, terminal, actionTaken = self.env:step(action)
       -- Update experience memory with actual action
-      if taken and taken ~= action then
-        self.agent.memory.actions[self.agent.memory.index] = taken
+      if actionTaken and actionTaken ~= action then
+        self.agent.memory.actions[self.agent.memory.index] = actionTaken
       end
       -- Track score
       episodeScore = episodeScore + reward
diff --git a/async/A3CAgent.lua b/async/A3CAgent.lua
index c507efe..48bb5b8 100644
--- a/async/A3CAgent.lua
+++ b/async/A3CAgent.lua
@@ -40,6 +40,8 @@ function A3CAgent:learn(steps, from)
 
   log.info('A3CAgent starting | steps=%d', steps)
   local reward, terminal, state = self:start()
+  local actionTaken
+
   self.states:resize(self.batchSize, table.unpack(state:size():totable()))
 
   self.tic = torch.tic()
@@ -55,7 +57,10 @@ function A3CAgent:learn(steps, from)
 
     self.actions[self.batchIdx] = action
 
-    reward, terminal, state = self:takeAction(action)
+    reward, terminal, state, actionTaken = self:takeAction(action)
+    if actionTaken and actionTaken + self.actionOffset ~= action then
+      action = actionTaken + self.actionOffset
+    end
     self.rewards[self.batchIdx] = reward
 
     self:progress(steps)
@@ -98,7 +103,7 @@ function A3CAgent:accumulateGradients(terminal, state)
     local gradEntropy = torch.log(probability) + 1
     -- Add to target to improve exploration (prevent convergence to suboptimal deterministic policy)
     self.policyTarget:add(self.beta, gradEntropy)
-    
+
     self.policyNet_:backward(self.states[i], self.targets)
   end
 end
diff --git a/async/AsyncAgent.lua b/async/AsyncAgent.lua
index 6cf3bff..abadb31 100644
--- a/async/AsyncAgent.lua
+++ b/async/AsyncAgent.lua
@@ -78,7 +78,7 @@ end
 
 
 function AsyncAgent:takeAction(action)
-  local reward, rawObservation, terminal, taken = self.env:step(action - self.actionOffset)
+  local reward, rawObservation, terminal, actionTaken = self.env:step(action - self.actionOffset)
   if self.rewardClip > 0 then
     reward = math.max(reward, -self.rewardClip)
     reward = math.min(reward, self.rewardClip)
@@ -91,7 +91,7 @@ function AsyncAgent:takeAction(action)
     self.stateBuffer:push(observation)
   end
 
-  return reward, terminal, self.stateBuffer:readAll(), taken
+  return reward, terminal, self.stateBuffer:readAll(), actionTaken
 end
 
 
diff --git a/async/NStepQAgent.lua b/async/NStepQAgent.lua
index 23bfad4..211d9f7 100644
--- a/async/NStepQAgent.lua
+++ b/async/NStepQAgent.lua
@@ -32,7 +32,7 @@ function NStepQAgent:learn(steps, from)
   log.info('NStepQAgent starting | steps=%d | ε=%.2f -> %.2f', steps, self.epsilon, self.epsilonEnd)
   local reward, terminal, state = self:start()
-  local taken
+  local actionTaken
 
   self.states:resize(self.batchSize, table.unpack(state:size():totable()))
   self.tic = torch.tic()
   repeat
@@ -46,9 +46,9 @@ function NStepQAgent:learn(steps, from)
     local action = self:eGreedy(state, self.policyNet_)
     self.actions[self.batchIdx] = action
 
-    reward, terminal, state, taken = self:takeAction(action)
-    if taken and taken ~= action then
-      self.actions[self.batchIdx] = taken
+    reward, terminal, state, actionTaken = self:takeAction(action)
+    if actionTaken and actionTaken + self.actionOffset ~= action then
+      action = actionTaken + self.actionOffset
     end
     self.rewards[self.batchIdx] = reward
 
diff --git a/async/OneStepQAgent.lua b/async/OneStepQAgent.lua
index 675cd6b..cdbace6 100644
--- a/async/OneStepQAgent.lua
+++ b/async/OneStepQAgent.lua
@@ -24,15 +24,15 @@ function OneStepQAgent:learn(steps, from)
   log.info('%s starting | steps=%d | ε=%.2f -> %.2f', self.agentName, steps, self.epsilon, self.epsilonEnd)
 
   local reward, terminal, state = self:start()
-  local action, state_, taken
+  local action, state_, actionTaken
 
   self.tic = torch.tic()
   for step1=1,steps do
     if not terminal then
       action = self:eGreedy(state, self.policyNet)
-      reward, terminal, state_, taken = self:takeAction(action)
-      if taken and taken ~= action then
-        action = taken
+      reward, terminal, state_, actionTaken = self:takeAction(action)
+      if actionTaken and actionTaken + self.actionOffset ~= action then
+        action = actionTaken + self.actionOffset
       end
     else
       reward, terminal, state_ = self:start()
From 741f03fa5b1ad2280160ef112a0b2235a3d6d1ac Mon Sep 17 00:00:00 2001
From: Mr-Yellow
Date: Sun, 4 Sep 2016 07:46:07 +1000
Subject: [PATCH 4/7] Document actionTaken in custom section

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index a1080a0..ec81365 100644
--- a/README.md
+++ b/README.md
@@ -54,6 +54,8 @@ You can use a custom environment (as the path to a Lua file/`rlenvs`-namespaced
 
 If the environment has separate behaviour during training and testing it should also implement `training` and `evaluate` methods - otherwise these will be added as empty methods during runtime. The environment can also implement a `getDisplay` method (with a mandatory `getDisplaySpec` method for determining screen size) which will be used for displaying the screen/computing saliency maps, where `getDisplay` must return a RGB (3D) tensor; this can also be utilised even if the state is not an image (although saliency can only be computed for states that are images). This **must** be implemented to have a visual display/computing saliency maps. The `-zoom` factor can be used to increase the size of small displays.
 
+Custom environments can also control the action selection process, specifying the actual action taken when it differs from that selected by the network. This allows the agent to learn from hand-crafted behaviours, human experts or pre-planned sequences. To achieve this, environments can optionally return `actionTaken` from the `step` method, i.e. `return reward, state, terminal[, actionTaken]`.
+
 You can also use a custom model (body) with `-modelBody`, which replaces the usual DQN convolutional layers with a custom Torch neural network (as the path to a Lua file/`models`-namespaced environment). The class must include a `createBody` method which returns the custom neural network. The model will receive a stack of the previous states (as determined by `-histLen`), and must reshape them manually if needed. The DQN "heads" will then be constructed as normal, with `-hiddenSize` used to change the size of the fully connected layer if needed.
 
 For an example on a GridWorld environment, run `./run.sh demo-grid` - the demo also works with `qlua` and experience replay agents. The custom environment and network can be found in the [examples](https://github.com/Kaixhin/Atari/tree/master/examples) folder.
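The README paragraph added in the patch above documents the convention, but the series itself contains no example environment, so here is a minimal, hypothetical sketch of a custom environment whose `step` reports the action it actually executed. Everything in it (the `ScriptedEnv` name, the every-tenth-step override rule, the dummy 1x4x4 observation) is an illustrative assumption, not code from this repository.

```lua
-- Hypothetical custom environment: step() returns reward, state, terminal,
-- plus an optional fourth value naming the action actually taken.
local torch = require 'torch'

local ScriptedEnv = {}
ScriptedEnv.__index = ScriptedEnv

function ScriptedEnv.new()
  return setmetatable({stepCount = 0}, ScriptedEnv)
end

function ScriptedEnv:start()
  self.stepCount = 0
  return torch.zeros(1, 4, 4) -- dummy observation
end

function ScriptedEnv:step(action)
  self.stepCount = self.stepCount + 1
  -- A scripted "expert" overrides the agent's choice on every 10th step
  local actionTaken = (self.stepCount % 10 == 0) and 1 or action
  local reward = 0
  local state = torch.zeros(1, 4, 4)
  local terminal = self.stepCount >= 100
  return reward, state, terminal, actionTaken -- fourth value is optional
end

return ScriptedEnv
```

When the fourth value is present and differs from the requested action, the changes in this series substitute it before the action is recorded, so the replay memory updated in `Master.lua` and the async agents' `actions` buffers end up holding what was actually executed.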
From 10d6de31262d1254076e08ad2c393f939f10ce3e Mon Sep 17 00:00:00 2001
From: Mr-Yellow
Date: Sun, 4 Sep 2016 09:52:15 +1000
Subject: [PATCH 5/7] Add and remove offset in takeAction only

---
 async/A3CAgent.lua      | 4 ++--
 async/AsyncAgent.lua    | 3 +++
 async/NStepQAgent.lua   | 4 ++--
 async/OneStepQAgent.lua | 4 ++--
 4 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/async/A3CAgent.lua b/async/A3CAgent.lua
index 48bb5b8..743c849 100644
--- a/async/A3CAgent.lua
+++ b/async/A3CAgent.lua
@@ -58,8 +58,8 @@ function A3CAgent:learn(steps, from)
     self.actions[self.batchIdx] = action
 
     reward, terminal, state, actionTaken = self:takeAction(action)
-    if actionTaken and actionTaken + self.actionOffset ~= action then
-      action = actionTaken + self.actionOffset
+    if actionTaken and actionTaken ~= action then
+      action = actionTaken
     end
     self.rewards[self.batchIdx] = reward
 
diff --git a/async/AsyncAgent.lua b/async/AsyncAgent.lua
index abadb31..5ff0d19 100644
--- a/async/AsyncAgent.lua
+++ b/async/AsyncAgent.lua
@@ -79,6 +79,9 @@ end
 
 function AsyncAgent:takeAction(action)
   local reward, rawObservation, terminal, actionTaken = self.env:step(action - self.actionOffset)
+  if actionTaken then
+    actionTaken = actionTaken + self.actionOffset
+  end
   if self.rewardClip > 0 then
     reward = math.max(reward, -self.rewardClip)
     reward = math.min(reward, self.rewardClip)
diff --git a/async/NStepQAgent.lua b/async/NStepQAgent.lua
index 211d9f7..06911d8 100644
--- a/async/NStepQAgent.lua
+++ b/async/NStepQAgent.lua
@@ -47,8 +47,8 @@ function NStepQAgent:learn(steps, from)
     self.actions[self.batchIdx] = action
 
     reward, terminal, state, actionTaken = self:takeAction(action)
-    if actionTaken and actionTaken + self.actionOffset ~= action then
-      action = actionTaken + self.actionOffset
+    if actionTaken and actionTaken ~= action then
+      action = actionTaken
     end
     self.rewards[self.batchIdx] = reward
 
diff --git a/async/OneStepQAgent.lua b/async/OneStepQAgent.lua
index cdbace6..7670f6a 100644
--- a/async/OneStepQAgent.lua
+++ b/async/OneStepQAgent.lua
@@ -31,8 +31,8 @@ function OneStepQAgent:learn(steps, from)
     if not terminal then
       action = self:eGreedy(state, self.policyNet)
       reward, terminal, state_, actionTaken = self:takeAction(action)
-      if actionTaken and actionTaken + self.actionOffset ~= action then
-        action = actionTaken + self.actionOffset
+      if actionTaken and actionTaken ~= action then
+        action = actionTaken
       end
     else
       reward, terminal, state_ = self:start()
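Patch 5 above keeps the index conversion in one place: `takeAction` subtracts `actionOffset` before calling `env:step`, and adds it back to any `actionTaken` the environment reports, so the agents compare and store actions in a single, agent-side index space. A small self-contained illustration of that round trip follows; the concrete offset of 1 is only an assumption (e.g. an environment whose actions start at 0 while the agent indexes from 1).

```lua
-- Illustration of the index round trip performed inside takeAction.
local actionOffset = 1   -- assumed: environment actions start at 0, agent's at 1

local action = 3                                  -- agent-side index chosen by the network
local envAction = action - actionOffset           -- 2: what env:step(...) receives
local envActionTaken = 0                          -- suppose the environment actually executed its action 0

local actionTaken = envActionTaken + actionOffset -- 1: converted back to agent-side indexing
if actionTaken and actionTaken ~= action then
  action = actionTaken                            -- the agent now records 1 instead of 3
end
print(action) --> 1
```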
From e188e92c47b61f189780b6011ce5c80c6279bb3a Mon Sep 17 00:00:00 2001
From: Mr-Yellow
Date: Sun, 4 Sep 2016 09:59:12 +1000
Subject: [PATCH 6/7] Reusing action variable and recording after takeAction

---
 Master.lua            | 3 ++-
 async/A3CAgent.lua    | 3 +--
 async/NStepQAgent.lua | 1 +
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/Master.lua b/Master.lua
index b1f96eb..74ccceb 100644
--- a/Master.lua
+++ b/Master.lua
@@ -100,7 +100,8 @@ function Master:train()
       reward, state, terminal, actionTaken = self.env:step(action)
       -- Update experience memory with actual action
       if actionTaken and actionTaken ~= action then
-        self.agent.memory.actions[self.agent.memory.index] = actionTaken
+        action = actionTaken
+        self.agent.memory.actions[self.agent.memory.index] = action
       end
       -- Track score
       episodeScore = episodeScore + reward
diff --git a/async/A3CAgent.lua b/async/A3CAgent.lua
index 743c849..ee269c8 100644
--- a/async/A3CAgent.lua
+++ b/async/A3CAgent.lua
@@ -55,12 +55,11 @@ function A3CAgent:learn(steps, from)
     local V, probability = table.unpack(self.policyNet_:forward(state))
     local action = torch.multinomial(probability, 1):squeeze()
 
-    self.actions[self.batchIdx] = action
-
     reward, terminal, state, actionTaken = self:takeAction(action)
     if actionTaken and actionTaken ~= action then
       action = actionTaken
     end
+    self.actions[self.batchIdx] = action
     self.rewards[self.batchIdx] = reward
 
     self:progress(steps)
diff --git a/async/NStepQAgent.lua b/async/NStepQAgent.lua
index 06911d8..353d4d3 100644
--- a/async/NStepQAgent.lua
+++ b/async/NStepQAgent.lua
@@ -50,6 +50,7 @@ function NStepQAgent:learn(steps, from)
     if actionTaken and actionTaken ~= action then
       action = actionTaken
     end
+    self.actions[self.batchIdx] = action
     self.rewards[self.batchIdx] = reward
 
     self:progress(steps)

From 17a80e04435677c700d752699ce262388d18dbff Mon Sep 17 00:00:00 2001
From: Mr-Yellow
Date: Sun, 4 Sep 2016 10:00:02 +1000
Subject: [PATCH 7/7] Only record action once

---
 async/NStepQAgent.lua | 1 -
 1 file changed, 1 deletion(-)

diff --git a/async/NStepQAgent.lua b/async/NStepQAgent.lua
index 353d4d3..c4a2290 100644
--- a/async/NStepQAgent.lua
+++ b/async/NStepQAgent.lua
@@ -44,7 +44,6 @@ function NStepQAgent:learn(steps, from)
     self.states[self.batchIdx]:copy(state)
 
     local action = self:eGreedy(state, self.policyNet_)
-    self.actions[self.batchIdx] = action
 
     reward, terminal, state, actionTaken = self:takeAction(action)
     if actionTaken and actionTaken ~= action then
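Taken together, the last two patches leave each agent recording the action exactly once, after `takeAction` has had the chance to substitute the environment's choice. A condensed, hypothetical illustration of that per-step pattern (method signatures simplified; not a drop-in replacement for the repository code):

```lua
-- Condensed sketch of the per-step pattern the series converges on:
-- choose, act, adopt the environment's actionTaken if it differs,
-- then record the action that was actually executed.
local function stepOnce(agent, state)
  local action = agent:eGreedy(state)             -- network's choice
  local reward, terminal, state_, actionTaken = agent:takeAction(action)
  if actionTaken and actionTaken ~= action then
    action = actionTaken                          -- environment overrode the choice
  end
  agent.actions[agent.batchIdx] = action          -- recorded once, after takeAction
  agent.rewards[agent.batchIdx] = reward
  return reward, terminal, state_
end
```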