diff --git a/README.md b/README.md
index f766676..0eb8e19 100644
--- a/README.md
+++ b/README.md
@@ -1,23 +1,23 @@
# rlenvs
-Reinforcement learning environments for Torch7, inspired by RL-Glue [[1]](#references). Supported environments:
-
-- rlenvs.Acrobot [[2]](#references)
-- rlenvs.Atari (Arcade Learning Environment)\* [[3]](#references)
-- rlenvs.Blackjack [[4]](#references)
-- rlenvs.CartPole [[5]](#references)
-- rlenvs.Catch [[6]](#references)
-- rlenvs.CliffWalking [[7]](#references)
-- rlenvs.DynaMaze [[8]](#references)
-- rlenvs.GridWorld [[9]](#references)
-- rlenvs.JacksCarRental [[7]](#references)
-- rlenvs.Minecraft (Project Malmö)\* [[10]](#references)
-- rlenvs.MountainCar [[11]](#references)
-- rlenvs.MultiArmedBandit [[12, 13]](#references)
-- rlenvs.RandomWalk [[14]](#references)
-- rlenvs.Taxi [[15]](#references)
-- rlenvs.WindyWorld [[7]](#references)
-- rlenvs.XOWorld [[16]](#references)
+Reinforcement learning environments for Torch7, inspired by [RL-Glue](http://glue.rl-community.org/wiki/Main_Page) [[1]](#references) and conforming to the [OpenAI Gym API](https://gym.openai.com/docs) [[2]](#references). Supported environments:
+
+- rlenvs.Acrobot [[3]](#references)
+- rlenvs.Atari (Arcade Learning Environment)\* [[4]](#references)
+- rlenvs.Blackjack [[5]](#references)
+- rlenvs.CartPole [[6]](#references)
+- rlenvs.Catch [[7]](#references)
+- rlenvs.CliffWalking [[8]](#references)
+- rlenvs.DynaMaze [[9]](#references)
+- rlenvs.GridWorld [[10]](#references)
+- rlenvs.JacksCarRental [[8]](#references)
+- rlenvs.Minecraft (Project Malmö)\* [[11]](#references)
+- rlenvs.MountainCar [[12]](#references)
+- rlenvs.MultiArmedBandit [[13, 14]](#references)
+- rlenvs.RandomWalk [[15]](#references)
+- rlenvs.Taxi [[16]](#references)
+- rlenvs.WindyWorld [[8]](#references)
+- rlenvs.XOWorld [[17]](#references)
Run `th experiment.lua` (or `qlua experiment.lua`) to run a demo of a random agent playing Catch.
@@ -44,10 +44,11 @@ Requires a [supported](https://github.com/Kaixhin/Atari/blob/master/roms/README.
luarocks install luasocket
```
-Requires [Malmö](https://github.com/Microsoft/malmo) (includes Minecraft), extracted with directory name `MalmoPlatform`. `libMalmoLua.so` should be added to `LUA_CPATH`. For example, if `MalmoPlatform` is in your home directory, add the following to the end of your `~/.bashrc`:
+Requires [Malmö](https://github.com/Microsoft/malmo) (includes Minecraft), extracted with directory name `MalmoPlatform`. `libMalmoLua.so` should be added to `LUA_CPATH`, and the path containing the level schemas should be exported as `MALMO_XSD_PATH`. For example, if `MalmoPlatform` is in `/home/username`, add the following to the end of your `~/.bashrc`:
```sh
-export LUA_CPATH=~/MalmoPlatform/Torch_Examples/libMalmoLua.so;$LUA_CPATH
+export LUA_CPATH='/home/username/MalmoPlatform/Torch_Examples/?.so;'$LUA_CPATH
+export MALMO_XSD_PATH=/home/username/MalmoPlatform
```
The Malmö client (`launchClient.sh`) must be operating to run.
@@ -66,15 +67,21 @@ local observation = env:start()
**Note that the API is under development and may be subject to change**
+### rlenvs.envs
+
+A table of all environments available in `rlenvs`.
+
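+For example, a minimal sketch that prints the names of the environments which loaded successfully (environments whose optional dependencies are missing are removed from the table):
+
+```lua
+local rlenvs = require 'rlenvs'
+
+for _, name in ipairs(rlenvs.envs) do
+  print(name)
+end
+```
+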
### observation = env:start([opts])
-Starts a new episode in the environment and returns the first `observation`. May take `opts`.
+Starts a new episode in the environment and returns the first `observation`. May take `opts`.
+Note that environment subclasses implement this internally as `_start`; `start` itself is provided by the `Env` base class.
### reward, observation, terminal, [actionTaken] = env:step(action)
-Performs a step in the environment using `action` (which may be a list - see below), and returns the `reward`, the `observation` of the state transitioned to, and a `terminal` flag. Optionally provides `actionTaken`, if the environment provides supervision in the form of the actual action taken by the agent in spite of the provided action.
+Performs a step in the environment using `action` (which may be a list - see below), and returns the `reward`, the `observation` of the state transitioned to, and a `terminal` flag. Optionally provides `actionTaken`, if the environment provides supervision in the form of the actual action taken by the agent in spite of the provided action.
+Note that environment subclasses implement this internally as `_step`; `step` itself is provided by the `Env` base class and enforces the per-episode time step limit.
-### stateSpec = env:getStateSpec()
+### stateSpace = env:getStateSpace()
Returns a state specification as a list with 3 elements:
@@ -86,11 +93,11 @@ Returns a state specification as a list with 3 elements:
If several states are returned, `stateSpec` is itself a list of state specifications. Ranges may use `nil` if unknown.
-### actionSpec = env:getActionSpec()
+### actionSpace = env:getActionSpace()
Returns an action specification, with the same structure as used for state specifications.
-### minReward, maxReward = env:getRewardSpec()
+### minReward, maxReward = env:getRewardSpace()
Returns the minimum and maximum rewards produced by the environment. Values may be `nil` if unknown.
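+
+For example, a converted environment such as CartPole returns values along these lines (a sketch based on `rlenvs/CartPole.lua` in this patch; the exact fields depend on the space type):
+
+```lua
+local rlenvs = require 'rlenvs'
+local env = rlenvs.CartPole()
+
+local stateSpace = env:getStateSpace()   -- e.g. {name = 'Box', shape = {4}, low = {...}, high = {...}}
+local actionSpace = env:getActionSpace() -- e.g. {name = 'Discrete', n = 2}
+local minReward, maxReward = env:getRewardSpace()
+```
+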
@@ -114,6 +121,10 @@ Returns an RGB display specification, with the same structure as used for state
Returns a RGB display tensor for visualising the state of the environment. Note that this may not be the same as the state provided for the agent.
+### env:render()
+
+Displays the environment using `image`. Requires the environment to have been constructed with `render = true`, the code to be run with `qlua` (rather than `th`), and `getDisplay` to be implemented by the environment.
+
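+For example, a short sketch mirroring `experiment.lua` (run with `qlua`):
+
+```lua
+local Catch = require 'rlenvs.Catch'
+local env = Catch({level = 2, render = true, zoom = 10})
+
+local observation = env:start()
+env:render()
+```
+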
## Development
Environments must inherit from `Env` and therefore implement the above methods (as well as a constructor). `experiment.lua` can be easily adapted for testing different environments. New environments should be added to `rlenvs/init.lua`, `rocks/rlenvs-scm-1.rockspec`, and be listed in this readme with an appropriate reference. For an example of a more complex environment that will only be installed if its optional dependencies are satisfied, see `rlenvs/Atari.lua`.
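+
+As a starting point, the following is a minimal sketch of a hypothetical `CoinFlip` environment, following the `_start`/`_step` and Space conventions introduced above; it is illustrative only and not part of the package:
+
+```lua
+local classic = require 'classic'
+local Env = require 'rlenvs/Env'
+
+local CoinFlip, super = classic.class('CoinFlip', Env)
+
+-- Constructor
+function CoinFlip:_init(opts)
+  opts = opts or {}
+  super._init(self, opts)
+end
+
+-- Single binary observation: the current coin face
+function CoinFlip:getStateSpace()
+  return {name = 'Discrete', n = 2}
+end
+
+-- 2 actions: guess heads (0) or tails (1)
+function CoinFlip:getActionSpace()
+  return {name = 'Discrete', n = 2}
+end
+
+-- Min and max reward
+function CoinFlip:getRewardSpace()
+  return 0, 1
+end
+
+-- Flips the coin for the first observation
+function CoinFlip:_start()
+  self.coin = torch.random(0, 1)
+  return self.coin
+end
+
+-- Rewards a correct guess; episodes are cut off by the Env base class after maxSteps
+function CoinFlip:_step(action)
+  local reward = action == self.coin and 1 or 0
+  self.coin = torch.random(0, 1)
+  return reward, self.coin, false
+end
+
+return CoinFlip
+```
+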
@@ -121,18 +132,19 @@ Environments must inherit from `Env` and therefore implement the above methods (
## References
[1] Tanner, B., & White, A. (2009). RL-Glue: Language-independent software for reinforcement-learning experiments. *The Journal of Machine Learning Research, 10*, 2133-2136.
-[2] DeJong, G., & Spong, M. W. (1994, June). Swinging up the acrobot: An example of intelligent control. In *American Control Conference, 1994* (Vol. 2, pp. 2158-2162). IEEE.
-[3] Bellemare, M. G., Naddaf, Y., Veness, J., & Bowling, M. (2012). The arcade learning environment. *J. Artificial Intelligence Res, 47*, 253-279.
-[4] Pérez-Uribe, A., & Sanchez, E. (1998, May). Blackjack as a test bed for learning strategies in neural networks. In *Neural Networks Proceedings, 1998. IEEE World Congress on Computational Intelligence. The 1998 IEEE International Joint Conference on* (Vol. 3, pp. 2022-2027). IEEE.
-[5] Barto, A. G., Sutton, R. S., & Anderson, C. W. (1983). Neuronlike adaptive elements that can solve difficult learning control problems. *Systems, Man and Cybernetics, IEEE Transactions on*, (5), 834-846.
-[6] Mnih, V., Heess, N., & Graves, A. (2014). Recurrent models of visual attention. In *Advances in Neural Information Processing Systems* (pp. 2204-2212).
-[7] Sutton, R. S., & Barto, A. G. (1998). *Reinforcement learning: An introduction* (Vol. 1, No. 1). Cambridge: MIT press.
-[8] Sutton, R. S. (1990). Integrated architectures for learning, planning, and reacting based on approximating dynamic programming. In *Proceedings of the seventh international conference on machine learning* (pp. 216-224).
-[9] Boyan, J., & Moore, A. W. (1995). Generalization in reinforcement learning: Safely approximating the value function. *Advances in neural information processing systems*, 369-376.
-[10] Johnson, M., Hofmann, K., Hutton, T., & Bignell, D. (2016). The Malmo platform for artificial intelligence experimentation. In *International joint conference on artificial intelligence (IJCAI)*.
-[11] Singh, S. P., & Sutton, R. S. (1996). Reinforcement learning with replacing eligibility traces. *Machine learning, 22*(1-3), 123-158.
-[12] Robbins, H. (1985). Some aspects of the sequential design of experiments. In *Herbert Robbins Selected Papers* (pp. 169-177). Springer New York.
-[13] Whittle, P. (1988). Restless bandits: Activity allocation in a changing world. *Journal of applied probability*, 287-298.
-[14] Sutton, R. S. (1988). Learning to predict by the methods of temporal differences. *Machine learning, 3*(1), 9-44.
-[15] Dietterich, T. G. (2000). Hierarchical Reinforcement Learning with the MAXQ Value Function Decomposition. In *Journal of Artificial Intelligence Research*.
-[16] Garnelo, M., Arulkumaran, K., & Shanahan, M. (2016). Towards Deep Symbolic Reinforcement Learning. *arXiv preprint arXiv:1609.05518*.
+[2] Brockman, G., Cheung, V., Pettersson, L., Schneider, J., Schulman, J., Tang, J., & Zaremba, W. (2016). OpenAI Gym. *arXiv preprint arXiv:1606.01540*.
+[3] DeJong, G., & Spong, M. W. (1994, June). Swinging up the acrobot: An example of intelligent control. In *American Control Conference, 1994* (Vol. 2, pp. 2158-2162). IEEE.
+[4] Bellemare, M. G., Naddaf, Y., Veness, J., & Bowling, M. (2012). The arcade learning environment. *Journal of Artificial Intelligence Research, 47*, 253-279.
+[5] Pérez-Uribe, A., & Sanchez, E. (1998, May). Blackjack as a test bed for learning strategies in neural networks. In *Neural Networks Proceedings, 1998. IEEE World Congress on Computational Intelligence. The 1998 IEEE International Joint Conference on* (Vol. 3, pp. 2022-2027). IEEE.
+[6] Barto, A. G., Sutton, R. S., & Anderson, C. W. (1983). Neuronlike adaptive elements that can solve difficult learning control problems. *Systems, Man and Cybernetics, IEEE Transactions on*, (5), 834-846.
+[7] Mnih, V., Heess, N., & Graves, A. (2014). Recurrent models of visual attention. In *Advances in Neural Information Processing Systems* (pp. 2204-2212).
+[8] Sutton, R. S., & Barto, A. G. (1998). *Reinforcement learning: An introduction* (Vol. 1, No. 1). Cambridge: MIT press.
+[9] Sutton, R. S. (1990). Integrated architectures for learning, planning, and reacting based on approximating dynamic programming. In *Proceedings of the Seventh International Conference on Machine Learning* (pp. 216-224).
+[10] Boyan, J., & Moore, A. W. (1995). Generalization in reinforcement learning: Safely approximating the value function. *Advances in Neural Information Processing Systems*, 369-376.
+[11] Johnson, M., Hofmann, K., Hutton, T., & Bignell, D. (2016). The Malmo platform for artificial intelligence experimentation. In *International Joint Conference on Artificial Intelligence*.
+[12] Singh, S. P., & Sutton, R. S. (1996). Reinforcement learning with replacing eligibility traces. *Machine Learning, 22*(1-3), 123-158.
+[13] Robbins, H. (1985). Some aspects of the sequential design of experiments. In *Herbert Robbins Selected Papers* (pp. 169-177). Springer New York.
+[14] Whittle, P. (1988). Restless bandits: Activity allocation in a changing world. *Journal of Applied Probability*, 287-298.
+[15] Sutton, R. S. (1988). Learning to predict by the methods of temporal differences. *Machine Learning, 3*(1), 9-44.
+[16] Dietterich, T. G. (2000). Hierarchical Reinforcement Learning with the MAXQ Value Function Decomposition. *Journal of Artificial Intelligence Research, 13*, 227-303.
+[17] Garnelo, M., Arulkumaran, K., & Shanahan, M. (2016). Towards Deep Symbolic Reinforcement Learning. In *Workshop on Deep Reinforcement Learning, NIPS 2016*.
diff --git a/experiment.lua b/experiment.lua
index a184662..fabd910 100644
--- a/experiment.lua
+++ b/experiment.lua
@@ -1,38 +1,31 @@
-local image = require 'image'
-local Catch = require 'rlenvs/Catch'
-
--- Detect QT for image display
-local qt = pcall(require, 'qt')
+local Catch = require 'rlenvs.Catch'
-- Initialise and start environment
-local env = Catch({level = 2})
-local stateSpec = env:getStateSpec()
-local actionSpec = env:getActionSpec()
+local env = Catch({level = 2, render = true, zoom = 10})
+local actionSpace = env:getActionSpace()
local observation = env:start()
-local reward, terminal
+local reward, terminal = 0, false
local episodes, totalReward = 0, 0
-local nSteps = 1000 * (stateSpec[2][2] - 1) -- Run for 1000 episodes
+local nEpisodes = 1000
-- Display
-local window = qt and image.display({image=observation, zoom=10})
+env:render()
-for i = 1, nSteps do
- -- Pick random action and execute it
- local action = torch.random(actionSpec[3][1], actionSpec[3][2])
- reward, observation, terminal = env:step(action)
- totalReward = totalReward + reward
+for i = 1, nEpisodes do
+ while not terminal do
+ -- Pick random action and execute it
+ local action = torch.random(0, actionSpace['n'] - 1)
+ reward, observation, terminal = env:step(action)
+ totalReward = totalReward + reward
- -- Display
- if qt then
- image.display({image=observation, zoom=10, win=window})
+ -- Display
+ env:render()
end
- -- If game finished, start again
- if terminal then
- episodes = episodes + 1
- observation = env:start()
- end
+ episodes = episodes + 1
+ observation = env:start()
+ terminal = false
end
print('Episodes: ' .. episodes)
print('Total Reward: ' .. totalReward)
diff --git a/rlenvs/Acrobot.lua b/rlenvs/Acrobot.lua
index 38af8ff..2d8d693 100644
--- a/rlenvs/Acrobot.lua
+++ b/rlenvs/Acrobot.lua
@@ -1,11 +1,14 @@
local classic = require 'classic'
local Acrobot, super = classic.class('Acrobot', Env)
+Acrobot.timeStepLimit = 500
-- Constructor
function Acrobot:_init(opts)
opts = opts or {}
-
+ opts.timeStepLimit = Acrobot.timeStepLimit
+ super._init(self, opts)
+
-- Constants
self.g = opts.g or 9.8
self.m1 = opts.m1 or 1 -- Mass of link 1
@@ -21,27 +24,40 @@ function Acrobot:_init(opts)
end
-- 4 states returned, of type 'real', of dimensionality 1, with differing ranges
-function Acrobot:getStateSpec()
- return {
- {'real', 1, {-math.pi, math.pi}}, -- Joint 1 angle
- {'real', 1, {-math.pi, math.pi}}, -- Joint 2 angle
- {'real', 1, {-4*math.pi, 4*math.pi}}, -- Joint 1 angular velocity
- {'real', 1, {-9*math.pi, 9*math.pi}} -- Joint 2 angular velocity
+function Acrobot:getStateSpace()
+ local state = {}
+ state['name'] = 'Box'
+ state['shape'] = {4}
+ state['low'] = {
+ -math.pi, -- Joint 1 angle
+ -math.pi, -- Joint 2 angle
+ -4 * math.pi, -- Joint 1 angular velocity
+ -9 * math.pi -- Joint 2 angular velocity
+ }
+ state['high'] = {
+ math.pi, -- Joint 1 angle
+ math.pi, -- Joint 2 angle
+ 4 * math.pi, -- Joint 1 angular velocity
+ 9 * math.pi -- Joint 2 angular velocity
}
+ return state
end
-- 1 action required, of type 'int', of dimensionality 1, with second torque joint in {-1, 0, 1}
-function Acrobot:getActionSpec()
- return {'int', 1, {-1, 1}}
+function Acrobot:getActionSpace()
+ local action = {}
+ action['name'] = 'Discrete'
+ action['n'] = 3
+ return action
end
-- Min and max reward
-function Acrobot:getRewardSpec()
+function Acrobot:getRewardSpace()
return -1, 0
end
-- Resets the cart
-function Acrobot:start()
+function Acrobot:_start()
-- Reset angles and velocities
self.q1 = 0 -- Joint 1 angle
self.q2 = 0 -- Joint 2 angle
@@ -52,20 +68,19 @@ function Acrobot:start()
end
-- Swings the pole via torque on second joint
-function Acrobot:step(action)
+function Acrobot:_step(action)
+ action = action - 1 -- rescale the action
local reward = -1
local terminal = false
for t = 1, self.steps do
-- Calculate motion of system
- local d1 = self.m1*math.pow(self.lc1, 2) + self.m2*(math.pow(self.l1, 2) + math.pow(self.lc2, 2) + 2*self.l1*self.lc2*math.cos(self.q2)) + self.I1 + self.I2
- local d2 = self.m2*(math.pow(self.lc2, 2) + self.l1*self.lc2*math.cos(self.q2)) + self.I2
- local phi2 = self.m2*self.lc2*self.g*math.cos(self.q1 + self.q2 - math.pi/2)
- local phi1 = -self.m2*self.l1*self.lc2*math.pow(self.q2Dot, 2)*math.sin(self.q2) - 2*self.m2*self.l1*self.lc2*self.q2Dot*self.q1Dot*math.sin(self.q2) +
- (self.m1*self.lc1 + self.m2*self.l1)*self.g*math.cos(self.q1 - math.pi/2) + phi2
- local q2DotDot = (action + d2/d1*phi1 - self.m2*self.l1*self.lc2*math.pow(self.q1Dot, 2)*math.sin(self.q2) - phi2) /
- (self.m2*math.pow(self.lc2, 2) + self.I2 - math.pow(d2, 2)/d1)
- local q1DotDot = -(d2/q2DotDot + phi1)/d1
+ local d1 = self.m1 * math.pow(self.lc1, 2) + self.m2 * (math.pow(self.l1, 2) + math.pow(self.lc2, 2) + 2 * self.l1 * self.lc2 * math.cos(self.q2)) + self.I1 + self.I2
+ local d2 = self.m2 * (math.pow(self.lc2, 2) + self.l1 * self.lc2 * math.cos(self.q2)) + self.I2
+ local phi2 = self.m2 * self.lc2 * self.g * math.cos(self.q1 + self.q2 - math.pi/2)
+ local phi1 = -self.m2 * self.l1 * self.lc2 * math.pow(self.q2Dot, 2) * math.sin(self.q2) - 2 * self.m2 * self.l1 * self.lc2 * self.q2Dot * self.q1Dot * math.sin(self.q2) + (self.m1 * self.lc1 + self.m2 * self.l1) * self.g * math.cos(self.q1 - math.pi / 2) + phi2
+ local q2DotDot = (action + d2 / d1 * phi1 - self.m2 * self.l1 * self.lc2 * math.pow(self.q1Dot, 2) * math.sin(self.q2) - phi2) / (self.m2 * math.pow(self.lc2, 2) + self.I2 - math.pow(d2, 2) / d1)
+ local q1DotDot = -(d2 * q2DotDot + phi1) / d1
-- Update state using Euler's method
self.q1Dot = self.q1Dot + self.tau * q1DotDot
@@ -86,13 +101,13 @@ function Acrobot:step(action)
self.q2 = math.pi - (self.q2 % -math.pi)
end
-- Limit velocities
- self.q1Dot = math.max(self.q1Dot, -4*math.pi)
- self.q1Dot = math.min(self.q1Dot, 4*math.pi)
- self.q2Dot = math.max(self.q2Dot, -9*math.pi)
- self.q2Dot = math.min(self.q2Dot, 9*math.pi)
+ self.q1Dot = math.max(self.q1Dot, -4 * math.pi)
+ self.q1Dot = math.min(self.q1Dot, 4 * math.pi)
+ self.q2Dot = math.max(self.q2Dot, -9 * math.pi)
+ self.q2Dot = math.min(self.q2Dot, 9 * math.pi)
-- Terminate if second joint's height is greater than height of first joint (relative to origin)
- local h = -self.l1*math.cos(self.q1) - self.l2*math.sin(math.pi/2 - self.q1 - self.q2)
+ local h = -self.l1 * math.cos(self.q1) - self.l2 * math.sin(math.pi / 2 - self.q1 - self.q2)
if h > self.l1 then
reward = 0
terminal = true
diff --git a/rlenvs/Atari.lua b/rlenvs/Atari.lua
index 83e1ea8..23cb39c 100644
--- a/rlenvs/Atari.lua
+++ b/rlenvs/Atari.lua
@@ -6,11 +6,15 @@ if not hasALEWrap then
end
local Atari, super = classic.class('Atari', Env)
+Atari.timeStepLimit = 100000
-- Constructor
function Atari:_init(opts)
-- Create ALEWrap options from opts
opts = opts or {}
+ opts.timeStepLimit = Atari.timeStepLimit
+ super._init(self, opts)
+
if opts.lifeLossTerminal == nil then
opts.lifeLossTerminal = true
end
@@ -44,13 +48,25 @@ function Atari:_init(opts)
end
-- 1 state returned, of type 'real', of dimensionality 3 x 210 x 160, between 0 and 1
-function Atari:getStateSpec()
- return {'real', {3, 210, 160}, {0, 1}}
+function Atari:getStateSpace()
+ local state = {}
+ state['name'] = 'Box'
+ state['shape'] = {3, 210, 160}
+ state['low'] = {
+ 0
+ }
+ state['high'] = {
+ 1
+ }
+ return state
end
-- 1 action required, of type 'int', of dimensionality 1, between 1 and 18 (max)
-function Atari:getActionSpec()
- return {'int', 1, {1, #self.actions}}
+function Atari:getActionSpace()
+ local action = {}
+ action['name'] = 'Discrete'
+ action['n'] = #self.actions
+ return action
end
-- RGB screen of height 210 and width 160
@@ -59,12 +75,12 @@ function Atari:getDisplaySpec()
end
-- Min and max reward (unknown)
-function Atari:getRewardSpec()
+function Atari:getRewardSpace()
return nil, nil
end
-- Starts a new game, possibly with a random number of no-ops
-function Atari:start()
+function Atari:_start()
local screen, reward, terminal
if self.gameEnv._random_starts > 0 then
@@ -77,7 +93,7 @@ function Atari:start()
end
-- Steps in a game
-function Atari:step(action)
+function Atari:_step(action)
-- Map action index to action for game
action = self.actions[action]
diff --git a/rlenvs/Blackjack.lua b/rlenvs/Blackjack.lua
index 7d436c4..b174455 100644
--- a/rlenvs/Blackjack.lua
+++ b/rlenvs/Blackjack.lua
@@ -7,31 +7,45 @@ local Blackjack, super = classic.class('Blackjack', Env)
function Blackjack:_init(opts)
opts = opts or {}
+ super._init(self, opts)
+
-- Create number-only suit
self.suit = torch.Tensor({2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 11})
end
-- 2 states returned, of type 'int', of dimensionality 1, for the player sum, dealer's showing card, and player-usable ace
-function Blackjack:getStateSpec()
- return {
- {'int', 1, {2, 20}},
- {'int', 1, {1, 10}},
- {'int', 1, {0, 1}}
+function Blackjack:getStateSpace()
+ local state = {}
+ state['name'] = 'Box'
+ state['shape'] = {3}
+ state['low'] = {
+ 2,
+ 1,
+ 0
+ }
+ state['high'] = {
+ 20,
+ 10,
+ 1
}
+ return state
end
-- 1 action required, of type 'int', of dimensionality 1, either stand or hit
-function Blackjack:getActionSpec()
- return {'int', 1, {0, 1}}
+function Blackjack:getActionSpace()
+ local action = {}
+ action['name'] = 'Discrete'
+ action['n'] = 2
+ return action
end
-- Min and max reward
-function Blackjack:getRewardSpec()
+function Blackjack:getRewardSpace()
return -1, 1
end
-- Draw 2 cards for player and dealer
-function Blackjack:start()
+function Blackjack:_start()
-- Shuffle deck
self.deck = torch.cat({self.suit, self.suit, self.suit, self.suit}, 1):index(1, torch.randperm(52):long())
@@ -51,7 +65,7 @@ function Blackjack:start()
end
-- Player stands or hits
-function Blackjack:step(action)
+function Blackjack:_step(action)
local reward = 0
local terminal = false
diff --git a/rlenvs/CartPole.lua b/rlenvs/CartPole.lua
index 5b1e49b..4a99613 100644
--- a/rlenvs/CartPole.lua
+++ b/rlenvs/CartPole.lua
@@ -1,11 +1,14 @@
local classic = require 'classic'
local CartPole, super = classic.class('CartPole', Env)
+CartPole.timeStepLimit = 200
-- Constructor
function CartPole:_init(opts)
opts = opts or {}
-
+ opts.timeStepLimit = CartPole.timeStepLimit
+ super._init(self, opts)
+
-- Constants
self.gravity = opts.gravity or 9.8
self.cartMass = opts.cartMass or 1.0
@@ -19,27 +22,40 @@ function CartPole:_init(opts)
end
-- 4 states returned, of type 'real', of dimensionality 1, with differing ranges
-function CartPole:getStateSpec()
- return {
- {'real', 1, {-2.4, 2.4}}, -- Cart position
- {'real', 1, {nil, nil}}, -- Cart velocity
- {'real', 1, {math.rad(-12), math.rad(12)}}, -- Pole angle
- {'real', 1, {nil, nil}} -- Pole angular velocity
+function CartPole:getStateSpace()
+ local state = {}
+ state['name'] = 'Box'
+ state['shape'] = {4}
+ state['low'] = {
+ -2.4, -- Cart position
+ -math.huge, -- Cart velocity
+ math.rad(-12), -- Pole angle
+ -math.huge -- Pole angular velocity
+ }
+ state['high'] = {
+ 2.4, -- Cart position
+ math.huge, -- Cart velocity
+ math.rad(12), -- Pole angle
+ math.huge -- Pole angular velocity
}
+ return state
end
-- 1 action required, of type 'int', of dimensionality 1, between 0 and 1 (left, right)
-function CartPole:getActionSpec()
- return {'int', 1, {0, 1}}
+function CartPole:getActionSpace()
+ local action = {}
+ action['name'] = 'Discrete'
+ action['n'] = 2
+ return action
end
-- Min and max reward
-function CartPole:getRewardSpec()
+function CartPole:getRewardSpace()
- return -1, 0
+ return 0, 1
end
-- Resets the cart
-function CartPole:start()
+function CartPole:_start()
-- Reset position, angle and velocities
self.x = 0 -- Cart position (m)
self.xDot = 0 -- Cart velocity
@@ -50,14 +66,14 @@ function CartPole:start()
end
-- Drives the cart
-function CartPole:step(action)
+function CartPole:_step(action)
-- Calculate acceleration
local force = action == 1 and self.forceMagnitude or -self.forceMagnitude
local cosTheta = math.cos(self.theta)
local sinTheta = math.sin(self.theta)
- local temp = (force + 0.5*self.poleMassLength * math.pow(self.thetaDot, 2) * sinTheta) / self.totalMass
- local thetaDotDot = (self.gravity * sinTheta - cosTheta * temp) / (0.5*self.poleLength * (4/3 - self.poleMass * math.pow(cosTheta, 2) / self.totalMass))
- local xDotDot = temp - 0.5*self.poleMassLength * thetaDotDot * cosTheta / self.totalMass
+ local temp = (force + 0.5 * self.poleMassLength * math.pow(self.thetaDot, 2) * sinTheta) / self.totalMass
+ local thetaDotDot = (self.gravity * sinTheta - cosTheta * temp) / (0.5 * self.poleLength * (4 / 3 - self.poleMass * math.pow(cosTheta, 2) / self.totalMass))
+ local xDotDot = temp - 0.5 * self.poleMassLength * thetaDotDot * cosTheta / self.totalMass
-- Update state using Euler's method
self.x = self.x + self.tau * self.xDot
@@ -66,10 +82,10 @@ function CartPole:step(action)
self.thetaDot = self.thetaDot + self.tau * thetaDotDot
-- Check failure (if cart reaches sides of track/pole tips too much)
- local reward = 0
+ local reward = 1
local terminal = false
if self.x < -2.4 or self.x > 2.4 or self.theta < math.rad(-12) or self.theta > math.rad(12) then
- reward = -1
+ reward = 0
terminal = true
end
diff --git a/rlenvs/Catch.lua b/rlenvs/Catch.lua
index 40ba990..4fc87e0 100644
--- a/rlenvs/Catch.lua
+++ b/rlenvs/Catch.lua
@@ -1,10 +1,12 @@
local classic = require 'classic'
+local Env = require 'rlenvs/Env'
local Catch, super = classic.class('Catch', Env)
-- Constructor
function Catch:_init(opts)
opts = opts or {}
+ super._init(self, opts)
-- Difficulty level
self.level = opts.level or 2
@@ -28,13 +30,25 @@ function Catch:_init(opts)
end
-- 1 state returned, of type 'int', of dimensionality 1 x self.size x self.size, between 0 and 1
-function Catch:getStateSpec()
- return {'int', {1, self.size, self.size}, {0, 1}}
+function Catch:getStateSpace()
+ local state = {}
+ state['name'] = 'Box'
+ state['shape'] = {1, self.size, self.size}
+ state['low'] = {
+ 0
+ }
+ state['high'] = {
+ 1
+ }
+ return state
end
-- 1 action required, of type 'int', of dimensionality 1, between 0 and 2
-function Catch:getActionSpec()
- return {'int', 1, {0, 2}}
+function Catch:getActionSpace()
+ local action = {}
+ action['name'] = 'Discrete'
+ action['n'] = 3
+ return action
end
-- RGB screen of size self.size x self.size
@@ -43,7 +57,7 @@ function Catch:getDisplaySpec()
end
-- Min and max reward
-function Catch:getRewardSpec()
+function Catch:getRewardSpace()
return 0, 1
end
@@ -64,7 +78,7 @@ function Catch:redraw()
end
-- Starts new game
-function Catch:start()
+function Catch:_start()
-- Reset player and ball
self.player.x = math.ceil(self.size / 2)
self.ball.x = torch.random(self.size)
@@ -80,7 +94,7 @@ function Catch:start()
end
-- Steps in a game
-function Catch:step(action)
+function Catch:_step(action)
-- Reward is 0 by default
local reward = 0
diff --git a/rlenvs/CliffWalking.lua b/rlenvs/CliffWalking.lua
index 41d029f..4200e4f 100644
--- a/rlenvs/CliffWalking.lua
+++ b/rlenvs/CliffWalking.lua
@@ -5,35 +5,48 @@ local CliffWalking, super = classic.class('CliffWalking', Env)
-- Constructor
function CliffWalking:_init(opts)
opts = opts or {}
+ super._init(self, opts)
end
-- 2 states returned, of type 'int', of dimensionality 1, where x is 1-12 and y is 1-4
-function CliffWalking:getStateSpec()
- return {
- {'int', 1, {1, 12}}, -- x
- {'int', 1, {1, 4}} -- y
+function CliffWalking:getStateSpace()
+ local state = {}
+ state['name'] = 'Box'
+ state['shape'] = {2}
+ state['low'] = {
+ 1, -- x
+ 1 -- y
}
+ state['high'] = {
+ 12, -- x
+ 4 -- y
+ }
+ return state
end
-- 1 action required, of type 'int', of dimensionality 1, between 1 and 4 (up|right|down|left)
-function CliffWalking:getActionSpec()
- return {'int', 1, {1, 4}}
+function CliffWalking:getActionSpace()
+ local action = {}
+ action['name'] = 'Discrete'
+ action['n'] = 4
+ return action
end
-- Min and max reward
-function CliffWalking:getRewardSpec()
+function CliffWalking:getRewardSpace()
return -100, -1
end
-- Reset position
-function CliffWalking:start()
+function CliffWalking:_start()
self.position = {1, 1}
return self.position
end
-- Move up, right, down or left
-function CliffWalking:step(action)
+function CliffWalking:_step(action)
+ action = action + 1 -- scale action
local reward = -1
local terminal = false
diff --git a/rlenvs/DynaMaze.lua b/rlenvs/DynaMaze.lua
index 42faaf0..c1b9ddb 100644
--- a/rlenvs/DynaMaze.lua
+++ b/rlenvs/DynaMaze.lua
@@ -5,6 +5,7 @@ local DynaMaze, super = classic.class('DynaMaze', Env)
-- Constructor
function DynaMaze:_init(opts)
opts = opts or {}
+ super._init(self, opts)
-- Set change: none|blocking|shortcut
self.change = opts.change or 'none'
@@ -27,25 +28,36 @@ function DynaMaze:_init(opts)
end
-- 2 states returned, of type 'int', of dimensionality 1, where x is 1-9 and y is 1-6
-function DynaMaze:getStateSpec()
- return {
- {'int', 1, {1, 9}}, -- x
- {'int', 1, {1, 6}} -- y
+function DynaMaze:getStateSpace()
+ local state = {}
+ state['name'] = 'Box'
+ state['shape'] = {2}
+ state['low'] = {
+ 1, -- x
+ 1 -- y
}
+ state['high'] = {
+ 9, -- x
+ 6 -- y
+ }
+ return state
end
-- 1 action required, of type 'int', of dimensionality 1, between 1 and 4
-function DynaMaze:getActionSpec()
- return {'int', 1, {1, 4}}
+function DynaMaze:getActionSpace()
+ local action = {}
+ action['name'] = 'Discrete'
+ action['n'] = 4
+ return action
end
-- Min and max reward
-function DynaMaze:getRewardSpec()
+function DynaMaze:getRewardSpace()
return 0, 1
end
-- Reset position
-function DynaMaze:start()
+function DynaMaze:_start()
if self.change == 'none' then
self.position = {1, 4}
else
@@ -56,7 +68,8 @@ function DynaMaze:start()
end
-- Move up, right, down or left
-function DynaMaze:step(action)
+function DynaMaze:_step(action)
+ action = action + 1 -- scale action
local reward = 0
local terminal = false
diff --git a/rlenvs/Env.lua b/rlenvs/Env.lua
index 262c5f4..824a31a 100644
--- a/rlenvs/Env.lua
+++ b/rlenvs/Env.lua
@@ -3,10 +3,58 @@ local classic = require 'classic'
local Env = classic.class('Env')
-- Denote interfaces
-Env:mustHave('start')
-Env:mustHave('step')
-Env:mustHave('getStateSpec')
-Env:mustHave('getActionSpec')
-Env:mustHave('getRewardSpec')
+Env:mustHave('_start')
+Env:mustHave('_step')
+Env:mustHave('getStateSpace')
+Env:mustHave('getActionSpace')
+Env:mustHave('getRewardSpace')
+
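+-- Base class constructor: sets the per-episode step limit and the optional rendering (image/qt) settings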
+function Env:_init(opts)
+ -- Set max number of steps per episode (default 1000)
+ if opts.timeStepLimit and opts.maxSteps then
+ self.maxSteps = math.min(opts.timeStepLimit, opts.maxSteps)
+ elseif opts.maxSteps then
+ self.maxSteps = opts.maxSteps
+ elseif opts.timeStepLimit then
+ self.maxSteps = opts.timeStepLimit
+ else
+ self.maxSteps = 1000
+ end
+ self.currentStep = 1
+
+ if opts.render then
+ require 'image'
+ self.qt = pcall(require, 'qt')
+ if not self.qt then
+ print('Was not able to load qt to render, are you using qlua to run the script?')
+ end
+ self.zoom = opts.zoom or 1
+ end
+end
+
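+-- Public step wrapper: delegates to _step and flags terminal once maxSteps steps have been taken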
+function Env:step(action)
+ local reward, state, terminal = self:_step(action)
+
+ if self.currentStep == self.maxSteps then
+ terminal = true
+ self.currentStep = 0
+ end
+ self.currentStep = self.currentStep + 1
+
+ return reward, state, terminal
+end
+
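+-- Public start wrapper: resets the step counter and delegates to _start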
+function Env:start()
+ self.currentStep = 1
+ local state = self:_start()
+ return state
+end
+
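+-- Displays getDisplay() via image when qt is available (i.e. run with qlua) and the environment was constructed with render = true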
+function Env:render()
+ if self.qt and self.getDisplay then
+ self.window = self.window == nil and image.display({image = self:getDisplay(), zoom = self.zoom}) or self.window
+ image.display({image = self:getDisplay(), zoom = self.zoom, win = self.window})
+ end
+end
return Env
diff --git a/rlenvs/GridWorld.lua b/rlenvs/GridWorld.lua
index 66fba69..8bbf2b2 100644
--- a/rlenvs/GridWorld.lua
+++ b/rlenvs/GridWorld.lua
@@ -6,6 +6,7 @@ local GridWorld, super = classic.class('GridWorld', Env)
-- Constructor
function GridWorld:_init(opts)
opts = opts or {}
+ super._init(self, opts)
-- Cost of moving in world (discretized)
self.world = torch.Tensor(101, 101):fill(-0.5)
@@ -13,38 +14,50 @@ function GridWorld:_init(opts)
-- PuddleWorld
if opts.puddles then
-- Create 2D Gaussians to subtract from world
- self.world[{{30, 90}, {30, 50}}]:csub(image.gaussian({width=21, height=61}))
- self.world[{{60, 80}, {1, 50}}]:csub(image.gaussian({width=60, height=21})[{{}, {11, 60}}])
+ self.world[{{30, 90}, {30, 50}}]:csub(image.gaussian({width = 21, height = 61}))
+ self.world[{{60, 80}, {1, 50}}]:csub(image.gaussian({width = 60, height = 21})[{{}, {11, 60}}])
end
end
-- 2 states returned, of type 'real', of dimensionality 1, from 0-1
-function GridWorld:getStateSpec()
- return {
- {'real', 1, {0, 1}}, -- x
- {'real', 1, {0, 1}} -- y
+function GridWorld:getStateSpace()
+ local state = {}
+ state['name'] = 'Box'
+ state['shape'] = {2}
+ state['low'] = {
+ 0, -- x
+ 0 -- y
}
+ state['high'] = {
+ 1, -- x
+ 1 -- y
+ }
+ return state
end
-- 1 action required, of type 'int', of dimensionality 1, between 1 and 4
-function GridWorld:getActionSpec()
- return {'int', 1, {1, 4}}
+function GridWorld:getActionSpace()
+ local action = {}
+ action['name'] = 'Discrete'
+ action['n'] = 4
+ return action
end
-- Min and max reward
-function GridWorld:getRewardSpec()
+function GridWorld:getRewardSpace()
return torch.min(self.world), 0
end
-- Reset position
-function GridWorld:start()
+function GridWorld:_start()
self.position = {0.2, 0.4}
return self.position
end
-- Move up, right, down or left
-function GridWorld:step(action)
+function GridWorld:_step(action)
+ action = action + 1 -- scale action
local terminal = false
-- Move
@@ -63,7 +76,7 @@ function GridWorld:step(action)
end
-- Look up cost of moving to position
- local reward = self.world[{{self.position[1]*100+1}, {self.position[2]*100+1}}][1][1]
+ local reward = self.world[{{self.position[1] * 100 + 1}, {self.position[2] * 100 + 1}}][1][1]
-- Check if reached goal
if self.position[1] == 1 and self.position[2] == 1 then
diff --git a/rlenvs/JacksCarRental.lua b/rlenvs/JacksCarRental.lua
index fa9d3a3..a21719c 100644
--- a/rlenvs/JacksCarRental.lua
+++ b/rlenvs/JacksCarRental.lua
@@ -19,28 +19,40 @@ end
-- Constructor
function JacksCarRental:_init(opts)
opts = opts or {}
+ super._init(self, opts)
end
-- 2 states returned, of type 'int', of dimensionality 1, for 0-20 cars
-function JacksCarRental:getStateSpec()
- return {
- {'int', 1, {0, 20}}, -- Lot 1
- {'int', 1, {0, 20}} -- Lot 2
+function JacksCarRental:getStateSpace()
+ local state = {}
+ state['name'] = 'Box'
+ state['shape'] = {2}
+ state['low'] = {
+ 0, -- Lot 1
+ 0 -- Lot 2
}
+ state['high'] = {
+ 20, -- Lot 1
+ 20 -- Lot 2
+ }
+ return state
end
-- 1 action required, of type 'int', of dimensionality 1, between -5 and 5 (max 5 cars can be moved overnight)
-function JacksCarRental:getActionSpec()
- return {'int', 1, {-5, 5}} -- Negative numbers indicate transferring cars from lot 2 to lot 1
+function JacksCarRental:getActionSpace()
+ local action = {}
+ action['name'] = 'Discrete'
+ action['n'] = 11
+ return action
end
-- Min and max reward
-function JacksCarRental:getRewardSpec()
+function JacksCarRental:getRewardSpace()
return 0, 200
end
-- Resets the cars to 10 at each lot
-function JacksCarRental:start()
+function JacksCarRental:_start()
self.lot1 = 10
self.lot2 = 10
@@ -48,18 +60,19 @@ function JacksCarRental:start()
end
-- Acts out a day and night for Jack's Car Rental
-function JacksCarRental:step(action)
+function JacksCarRental:_step(action)
+ action = action - 5 -- scale action
local reward = 0 -- Reward in $
-- Customers rent cars from lot 1 during the day
local lot1Rentals = math.min(poisson(3), self.lot1)
self.lot1 = self.lot1 - lot1Rentals
- reward = reward + 10*lot1Rentals
+ reward = reward + 10 * lot1Rentals
-- Customers rent cars from lot 2 during the day
local lot2Rentals = math.min(poisson(4), self.lot2)
self.lot2 = self.lot2 - lot2Rentals
- reward = reward + 10*lot2Rentals
+ reward = reward + 10 * lot2Rentals
-- Customers return cars to lot 1 at the end of the day
local lot1Returns = poisson(3)
@@ -77,14 +90,14 @@ function JacksCarRental:step(action)
-- Move cars
self.lot1 = self.lot1 - carsMoved
self.lot2 = self.lot2 + carsMoved
- reward = reward - 2*carsMoved
- elseif action < 0 then
+ reward = reward - 2 * carsMoved
+ elseif action < 0 then -- Negative numbers indicate transferring cars from lot 2 to lot 1
carsMoved = math.min(-action, self.lot2)
carsMoved = math.min(carsMoved, 20 - self.lot1)
-- Move cars
self.lot2 = self.lot2 - carsMoved
self.lot1 = self.lot1 + carsMoved
- reward = reward - 2*carsMoved
+ reward = reward - 2 * carsMoved
end
return reward, {self.lot1, self.lot2}, false
diff --git a/rlenvs/Minecraft.lua b/rlenvs/Minecraft.lua
index 8997462..f31d850 100644
--- a/rlenvs/Minecraft.lua
+++ b/rlenvs/Minecraft.lua
@@ -16,12 +16,14 @@ end
-- Constructor
function Minecraft:_init(opts)
+ opts = opts or {}
+ super._init(self, opts)
-- Check libaMalmoLua is available locally
if not hasLibMalmoLua then
- print("Requires libMalmoLua.so in LUA_CPATH")
+ print("Requires libMalmoLua.so")
os.exit()
end
-
+
opts = opts or {}
self.height = opts.height or 84
self.width = opts.width or 84
[Four hunks here edit the embedded Malmö mission XML string; the XML markup has been stripped from this view of the diff. The recoverable changes: the indentation of the mission XML is normalized, the requested video size changes from 160x160 to 320x240, and one extra line is added near the `attack` command handler section.]
@@ -120,16 +123,28 @@ function Minecraft:_init(opts)
end
-- 2 states returned, of type 'real', of dimensionality 1, from 0-1
-function Minecraft:getStateSpec()
- return {'real', {3, self.height, self.width}, {0, 1}}
+function Minecraft:getStateSpace()
+ local state = {}
+ state['name'] = 'Box'
+ state['shape'] = {3, self.height, self.width}
+ state['low'] = {
+ 0
+ }
+ state['high'] = {
+ 1
+ }
+ return state
end
-function Minecraft:getActionSpec()
- return {'int', 1, {1, #self.actions}}
+function Minecraft:getActionSpace()
+ local action = {}
+ action['name'] = 'Discrete'
+ action['n'] = #self.actions
+ return action
end
--- Min and max reward
-function Minecraft:getRewardSpec()
+-- Min and max reward (unknown)
+function Minecraft:getRewardSpace()
return nil, nil
end
@@ -157,8 +172,16 @@ function Minecraft:getRewards(world_rewards)
return proc_rewards
end
--- Reset position
-function Minecraft:start()
+-- Start new mission
+function Minecraft:_start()
+ local world_state = self.agent_host:getWorldState()
+
+ -- check if a previous mission is still running before starting a new one
+ if world_state.is_mission_running then
+ self.agent_host:sendCommand("quit")
+ sleep(0.5)
+ end
+
local mission = MissionSpec(self.mission_xml, true)
local mission_record = MissionRecordSpec()
@@ -173,9 +196,6 @@ function Minecraft:start()
assert(channels == 3, "No RGB video output")
assert(height == self.height or width == self.width, "Video output dimensions don't match those requested")
- -- Set the time limit for mission (in seconds)
- mission:timeLimitInSeconds(self.time_limit)
-
local status, err = pcall(function() self.agent_host:startMission( mission, mission_record ) end)
if not status then
print("Error starting mission: "..err)
@@ -216,7 +236,7 @@ function Minecraft:start()
end
-- Move up, right, down or left
-function Minecraft:step(action)
+function Minecraft:_step(action)
-- Do something
local action = self.actions[action]
self.agent_host:sendCommand(action)
@@ -247,7 +267,7 @@ function Minecraft:step(action)
self.proc_frames = self:processFrames(world_state.video_frames)
end
- local terminal = world_state.is_mission_running
+ local terminal = not world_state.is_mission_running
sleep(0.1)
diff --git a/rlenvs/MountainCar.lua b/rlenvs/MountainCar.lua
index 8d62109..26daec2 100644
--- a/rlenvs/MountainCar.lua
+++ b/rlenvs/MountainCar.lua
@@ -1,32 +1,47 @@
local classic = require 'classic'
local MountainCar, super = classic.class('MountainCar', Env)
+MountainCar.timeStepLimit = 200
-- Constructor
function MountainCar:_init(opts)
opts = opts or {}
+ opts.timeStepLimit = MountainCar.timeStepLimit
+
+ super._init(self, opts)
end
-- 2 states returned, of type 'real', of dimensionality 1, with differing ranges
-function MountainCar:getStateSpec()
- return {
- {'real', 1, {-0.07, 0.07}}, -- Velocity
- {'real', 1, {-1.2, 0.6}} -- Position
+function MountainCar:getStateSpace()
+ local state = {}
+ state['name'] = 'Box'
+ state['shape'] = {2}
+ state['low'] = {
+ -0.07, -- Velocity
+ -1.2 -- Position
+ }
+ state['high'] = {
+ 0.07, -- Velocity
+ 0.6 -- Position
}
+ return state
end
-- 1 action required, of type 'int', of dimensionality 1, between -1 and 1 (left, neutral, right)
-function MountainCar:getActionSpec()
- return {'int', 1, {-1, 1}}
+function MountainCar:getActionSpace()
+ local action = {}
+ action['name'] = 'Discrete'
+ action['n'] = 3
+ return action
end
-- Min and max reward
-function MountainCar:getRewardSpec()
+function MountainCar:getRewardSpace()
return -2, 0 -- As height = sin(3x) is between -1 and 1, and reward = height - 1
end
-- Resets the car
-function MountainCar:start()
+function MountainCar:_start()
-- Reset position and velocity
self.position = -0.5
self.velocity = 0
@@ -35,12 +50,13 @@ function MountainCar:start()
end
-- Drives the car
-function MountainCar:step(action)
+function MountainCar:_step(action)
+ action = action - 1 -- scale action
-- Calculate height
local height = math.sin(3*self.position)
-- Update velocity and position
- self.velocity = self.velocity + 0.001*action - 0.0025*math.cos(3*self.position)
+ self.velocity = self.velocity + 0.001 * action - 0.0025 * math.cos(3 * self.position)
self.velocity = math.max(self.velocity, -0.07)
self.velocity = math.min(self.velocity, 0.07)
self.position = self.position + self.velocity
diff --git a/rlenvs/MultiArmedBandit.lua b/rlenvs/MultiArmedBandit.lua
index 9261221..d4e6c75 100644
--- a/rlenvs/MultiArmedBandit.lua
+++ b/rlenvs/MultiArmedBandit.lua
@@ -5,6 +5,7 @@ local MultiArmedBandit, super = classic.class('MultiArmedBandit', Env)
-- Constructor
function MultiArmedBandit:_init(opts)
opts = opts or {}
+ super._init(self, opts)
-- Restless bandits (with a Gaussian random walk)
self.restless = opts.restless or false
@@ -19,27 +20,31 @@ function MultiArmedBandit:_init(opts)
end
-- No state (not a contextual bandit)
-function MultiArmedBandit:getStateSpec()
+function MultiArmedBandit:getStateSpace()
return nil
end
-- 1 action required, of type 'int', of dimensionality 1, of the number of arms
-function MultiArmedBandit:getActionSpec()
- return {'int', 1, {1, self.nArms}}
+function MultiArmedBandit:getActionSpace()
+ local action = {}
+ action['name'] = 'Discrete'
+ action['n'] = self.nArms
+ return action
end
-- Min and max rewards unknown when sampling from distributions
-function MultiArmedBandit:getRewardSpec()
+function MultiArmedBandit:getRewardSpace()
return nil, nil
end
-- Does nothing (distributions do not reset)
-function MultiArmedBandit:start()
+function MultiArmedBandit:_start()
return nil
end
-- Pulls an arm
-function MultiArmedBandit:step(action)
+function MultiArmedBandit:_step(action)
+ action = action + 1 -- scale action
-- Sample for reward
local reward = torch.normal(self.armMeans[action], 1)
diff --git a/rlenvs/RandomWalk.lua b/rlenvs/RandomWalk.lua
index 8594330..a1df9b2 100644
--- a/rlenvs/RandomWalk.lua
+++ b/rlenvs/RandomWalk.lua
@@ -5,32 +5,39 @@ local RandomWalk, super = classic.class('RandomWalk', Env)
-- Constructor
function RandomWalk:_init(opts)
opts = opts or {}
+ super._init(self, opts)
end
-- 1 states returned, of type 'int', of dimensionality 1, between 0 and 6 (the terminal states)
-function RandomWalk:getStateSpec()
- return {'int', 1, {0, 6}} -- Position
+function RandomWalk:getStateSpace()
+ local state = {}
+ state['name'] = 'Discrete'
+ state['n'] = 7
+ return state
end
-- 1 action required, of type 'int', of dimensionality 1, between 0 and 1 (left or right)
-function RandomWalk:getActionSpec()
- return {'int', 1, {0, 1}}
+function RandomWalk:getActionSpace()
+ local action = {}
+ action['name'] = 'Discrete'
+ action['n'] = 2
+ return action
end
-- Min and max reward
-function RandomWalk:getRewardSpec()
+function RandomWalk:getRewardSpace()
return 0, 1
end
-- Reset position
-function RandomWalk:start()
+function RandomWalk:_start()
self.position = 3
return self.position
end
-- Move left or right
-function RandomWalk:step(action)
+function RandomWalk:_step(action)
local reward = 0
local terminal = false
diff --git a/rlenvs/Taxi.lua b/rlenvs/Taxi.lua
index 61d0ba5..962b90c 100644
--- a/rlenvs/Taxi.lua
+++ b/rlenvs/Taxi.lua
@@ -12,10 +12,14 @@ local classic = require 'classic'
--]]
local Taxi, super = classic.class('Taxi', Env)
+Taxi.timeStepLimit = 200
-- Constructor
function Taxi:_init(opts)
opts = opts or {}
+ opts.timeStepLimit = Taxi.timeStepLimit
+
+ super._init(self, opts)
-- Passenger positions (Red, Green, Blue, Yellow)
self.rgbyPos = {{0, 4}, {4, 4}, {3, 0}, {0, 0}}
@@ -24,28 +28,42 @@ function Taxi:_init(opts)
end
-- 4 states returned, of type 'int', of dimensionality 1, where x and y are 0-5, fuel is -1-12, passenger position is 1-5 and destination is 1-4
-function Taxi:getStateSpec()
- return {
- {'int', 1, {0, 4}}, -- x
- {'int', 1, {0, 4}}, -- y
- {'int', 1, {-1, 12}}, -- Fuel
- {'int', 1, {1, 5}}, -- Passenger location
- {'int', 1, {1, 4}}, -- Destination TODO: Work out why there are apparently 5 destination states in the original paper
+function Taxi:getStateSpace()
+ local state = {}
+ state['name'] = 'Box'
+ state['shape'] = {5}
+ state['low'] = {
+ 0, -- x
+ 0, -- y
+ -1, -- Fuel
+ 1, -- Passenger location
+ 1 -- Destination TODO: Work out why there are apparently 5 destination states in the original paper
+ }
+ state['high'] = {
+ 4, -- x
+ 4, -- y
+ 12, -- Fuel
+ 5, -- Passenger location
+ 4 -- Destination
}
+ return state
end
-- 1 action required, of type 'int', of dimensionality 1, where 1-4 is move N, E, S, W, 5 is Pickup, 6 is Putdown and 7 is Fillup
-function Taxi:getActionSpec()
- return {'int', 1, {1, 7}}
+function Taxi:getActionSpace()
+ local action = {}
+ action['name'] = 'Discrete'
+ action['n'] = 7
+ return action
end
-- Min and max reward
-function Taxi:getRewardSpec()
+function Taxi:getRewardSpace()
return -20, 20
end
-- Reset position, fuel and passenger
-function Taxi:start()
+function Taxi:_start()
-- Randomise position and fuel
self.position = {torch.random(0, 4), torch.random(0, 4)}
self.fuel = torch.random(5, 12)
@@ -90,7 +108,7 @@ function Taxi:validMove(action)
end
-- Move up, right, down or left
-function Taxi:step(action)
+function Taxi:_step(action)
local reward = -1
local terminal = false
diff --git a/rlenvs/WindyWorld.lua b/rlenvs/WindyWorld.lua
index 9cba841..00fdce1 100644
--- a/rlenvs/WindyWorld.lua
+++ b/rlenvs/WindyWorld.lua
@@ -5,42 +5,56 @@ local WindyWorld, super = classic.class('WindyWorld', Env)
-- Constructor
function WindyWorld:_init(opts)
opts = opts or {}
+ super._init(self, opts)
-- Allow king's moves (8 directions)
self.king = opts.king or false
end
-- 2 states returned, of type 'int', of dimensionality 1, where x is 1-10 and y is 1-7
-function WindyWorld:getStateSpec()
- return {
- {'int', 1, {1, 10}}, -- x
- {'int', 1, {1, 7}} -- y
+function WindyWorld:getStateSpace()
+ local state = {}
+ state['name'] = 'Box'
+ state['shape'] = {2}
+ state['low'] = {
+ 1, -- x
+ 1 -- y
}
+ state['high'] = {
+ 10, -- x
+ 7 -- y
+ }
+ return state
end
-- 1 action required, of type 'int', of dimensionality 1, between 1 and 4 (for standard) or 1 and 8 (for king)
-function WindyWorld:getActionSpec()
+function WindyWorld:getActionSpace()
+ local action = {}
+ action['name'] = 'Discrete'
if self.king then
- return {'int', 1, {1, 8}}
+ action['n'] = 8
+ return action
else
- return {'int', 1, {1, 4}}
+ action['n'] = 4
+ return action
end
end
-- Min and max reward
-function WindyWorld:getRewardSpec()
+function WindyWorld:getRewardSpace()
return -1, -1
end
-- Reset position
-function WindyWorld:start()
+function WindyWorld:_start()
self.position = {1, 4}
return self.position
end
-- Move up, right, down or left
-function WindyWorld:step(action)
+function WindyWorld:_step(action)
+ action = action + 1 -- scale action
local terminal = false
-- Move
diff --git a/rlenvs/XOWorld.lua b/rlenvs/XOWorld.lua
index 691d61c..72f0ed9 100644
--- a/rlenvs/XOWorld.lua
+++ b/rlenvs/XOWorld.lua
@@ -6,6 +6,7 @@ local XOWorld, super = classic.class('XOWorld', Env)
-- Constructor
function XOWorld:_init(opts)
opts = opts or {}
+ super._init(self, opts)
-- Game mode (all circles, negative, or circles and crosses, negative and positive)
self.double = opts.double or false
@@ -102,24 +103,36 @@ function XOWorld:_init(opts)
{67, 7}, {67, 27}, {67, 47}, {67, 67}}
end
--- 1 state returned, of type 'int', of dimensionality 1 x self.size x self.size, between 0 and 1
-function XOWorld:getStateSpec()
- return {'int', {1, self.size, self.size}, {0, 1}}
+-- 1 state returned, of type 'int', of dimensionality 1 x self.size x self.size, between 0 and 1
+function XOWorld:getStateSpace()
+ local state = {}
+ state['name'] = 'Box'
+ state['shape'] = {1, self.size, self.size}
+ state['low'] = {
+ 0
+ }
+ state['high'] = {
+ 1
+ }
+ return state
end
-- 1 action required, of type 'int', of dimensionality 1, between 0 and 3
-function XOWorld:getActionSpec()
- return {'int', 1, {0, 3}}
+function XOWorld:getActionSpace()
+ local action = {}
+ action['name'] = 'Discrete'
+ action['n'] = 4
+ return action
end
-- RGB screen of size self.size x self.size
function XOWorld:getDisplaySpec()
- return {'real', {3, self.size, self.size}, {0, 1}}
+ return {'real', {3, self.size, self.size}, {0, 1}}
end
-- Min and max reward
-function XOWorld:getRewardSpec()
- return -10, 10
+function XOWorld:getRewardSpace()
+ return -10, 10
end
-- Redraws screen based on state and performs collision detection
@@ -188,7 +201,7 @@ function XOWorld:update()
end
-- Starts new game
-function XOWorld:start()
+function XOWorld:_start()
-- Reset time
self.time = 1
@@ -257,7 +270,7 @@ function XOWorld:start()
end
-- Steps in a game
-function XOWorld:step(action)
+function XOWorld:_step(action)
-- Move player
if action == 0 then
self.x = math.max(self.x - 1, 1)
diff --git a/rlenvs/init.lua b/rlenvs/init.lua
index c29cfc9..ca94a8a 100644
--- a/rlenvs/init.lua
+++ b/rlenvs/init.lua
@@ -1,7 +1,7 @@
local rlenvs = {}
-- Include environments
-rlenvs.Env = require 'rlenvs/Env'
+Env = require 'rlenvs/Env'
rlenvs.Acrobot = require 'rlenvs/Acrobot'
rlenvs.Atari = require 'rlenvs/Atari'
rlenvs.Blackjack = require 'rlenvs/Blackjack'
@@ -17,6 +17,7 @@ rlenvs.MultiArmedBandit = require 'rlenvs/MultiArmedBandit'
rlenvs.RandomWalk = require 'rlenvs/RandomWalk'
rlenvs.Taxi = require 'rlenvs/Taxi'
rlenvs.WindyWorld = require 'rlenvs/WindyWorld'
+rlenvs.XOWorld = require 'rlenvs/XOWorld'
-- Remove nil environments
for k, v in pairs(rlenvs) do
@@ -25,4 +26,10 @@ for k, v in pairs(rlenvs) do
end
end
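+-- Build an array of the available environment names, exposed as rlenvs.envs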
+local envs = {}
+for k, _ in pairs(rlenvs) do
+ envs[#envs + 1] = k
+end
+rlenvs.envs = envs
+
return rlenvs
diff --git a/tests/test.lua b/tests/test.lua
new file mode 100644
index 0000000..32a36f2
--- /dev/null
+++ b/tests/test.lua
@@ -0,0 +1,41 @@
+require 'torch'
+local rlenvs = require 'rlenvs'
+
+local test = torch.TestSuite()
+local tester
+
+function test.envs()
+ for index, env in ipairs(rlenvs.envs) do
+ local function runTest()
+ local Env = require('rlenvs.' .. env)
+ -- Initialise and start environment
+ local env = Env()
+ local actionSpace = env:getActionSpace()
+ local observation = env:start()
+ -- Pick random action and execute it
+ local action = torch.random(0, actionSpace['n'] - 1)
+ local reward, observation, terminal = env:step(action)
+ -- Display if implemented
+ env:render()
+ end
+
+ if env == 'Atari' then
+ local hasALEWrap = pcall(require, 'alewrap')
+ if hasALEWrap then
+ tester:assert(pcall(runTest), 'Failed to run rlenv environment ' .. env)
+ end
+ elseif env == 'Minecraft' then
+ local hasSocket = pcall(require, 'socket')
+ local hasLibMalmoLua = pcall(require, 'libMalmoLua')
+ if hasSocket and hasLibMalmoLua then
+ tester:assert(pcall(runTest), 'Failed to run rlenv environment ' .. env)
+ end
+ else
+ tester:assert(pcall(runTest), 'Failed to run rlenv environment ' .. env)
+ end
+ end
+end
+
+tester = torch.Tester()
+tester:add(test)
+tester:run()
\ No newline at end of file