diff --git a/README.md b/README.md index f766676..0eb8e19 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,23 @@ # rlenvs -Reinforcement learning environments for Torch7, inspired by RL-Glue [[1]](#references). Supported environments: - -- rlenvs.Acrobot [[2]](#references) -- rlenvs.Atari (Arcade Learning Environment)\* [[3]](#references) -- rlenvs.Blackjack [[4]](#references) -- rlenvs.CartPole [[5]](#references) -- rlenvs.Catch [[6]](#references) -- rlenvs.CliffWalking [[7]](#references) -- rlenvs.DynaMaze [[8]](#references) -- rlenvs.GridWorld [[9]](#references) -- rlenvs.JacksCarRental [[7]](#references) -- rlenvs.Minecraft (Project Malmö)\* [[10]](#references) -- rlenvs.MountainCar [[11]](#references) -- rlenvs.MultiArmedBandit [[12, 13]](#references) -- rlenvs.RandomWalk [[14]](#references) -- rlenvs.Taxi [[15]](#references) -- rlenvs.WindyWorld [[7]](#references) -- rlenvs.XOWorld [[16]](#references) +Reinforcement learning environments for Torch7, inspired by [RL-Glue](http://glue.rl-community.org/wiki/Main_Page) [[1]](#references) and conforming to the [OpenAI Gym API](https://gym.openai.com/docs) [[2]](#references). Supported environments: + +- rlenvs.Acrobot [[3]](#references) +- rlenvs.Atari (Arcade Learning Environment)\* [[4]](#references) +- rlenvs.Blackjack [[5]](#references) +- rlenvs.CartPole [[6]](#references) +- rlenvs.Catch [[7]](#references) +- rlenvs.CliffWalking [[8]](#references) +- rlenvs.DynaMaze [[9]](#references) +- rlenvs.GridWorld [[10]](#references) +- rlenvs.JacksCarRental [[8]](#references) +- rlenvs.Minecraft (Project Malmö)\* [[11]](#references) +- rlenvs.MountainCar [[12]](#references) +- rlenvs.MultiArmedBandit [[13, 14]](#references) +- rlenvs.RandomWalk [[15]](#references) +- rlenvs.Taxi [[16]](#references) +- rlenvs.WindyWorld [[8]](#references) +- rlenvs.XOWorld [[17]](#references) Run `th experiment.lua` (or `qlua experiment.lua`) to run a demo of a random agent playing Catch. @@ -44,10 +44,11 @@ Requires a [supported](https://github.com/Kaixhin/Atari/blob/master/roms/README. luarocks install luasocket ``` -Requires [Malmö](https://github.com/Microsoft/malmo) (includes Minecraft), extracted with directory name `MalmoPlatform`. `libMalmoLua.so` should be added to `LUA_CPATH`. For example, if `MalmoPlatform` is in your home directory, add the following to the end of your `~/.bashrc`: +Requires [Malmö](https://github.com/Microsoft/malmo) (includes Minecraft), extracted with directory name `MalmoPlatform`. `libMalmoLua.so` should be added to `LUA_CPATH`, and the level schemas should be exported to `MALMO_XSD_PATH`. For example, if `MalmoPlatform` is in `/home/username`, add the following to the end of your `~/.bashrc`: ```sh -export LUA_CPATH=~/MalmoPlatform/Torch_Examples/libMalmoLua.so;$LUA_CPATH +export LUA_CPATH='/home/username/MalmoPlatform/Torch_Examples/?.so;'$LUA_CPATH +export MALMO_XSD_PATH=/home/username/MalmoPlatform ``` The Malmö client (`launchClient.sh`) must be operating to run. @@ -66,15 +67,21 @@ local observation = env:start() **Note that the API is under development and may be subject to change** +### rlenvs.envs + +A table of all environments available in `rlenvs`. + ### observation = env:start([opts]) -Starts a new episode in the environment and returns the first `observation`. May take `opts`. +Starts a new episode in the environment and returns the first `observation`. May take `opts`. +Note that environments must actually implement this as `_start`. 
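+
+For example, a minimal sketch using the bundled Catch environment (the base `Env:start` resets the internal step counter and then dispatches to the environment's `_start`):
+
+```lua
+local Catch = require 'rlenvs.Catch'
+
+local env = Catch({level = 2})
+local observation = env:start() -- dispatches to Catch:_start()
+```
+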
### reward, observation, terminal, [actionTaken] = env:step(action) -Performs a step in the environment using `action` (which may be a list - see below), and returns the `reward`, the `observation` of the state transitioned to, and a `terminal` flag. Optionally provides `actionTaken`, if the environment provides supervision in the form of the actual action taken by the agent in spite of the provided action. +Performs a step in the environment using `action` (which may be a list - see below), and returns the `reward`, the `observation` of the state transitioned to, and a `terminal` flag. Optionally provides `actionTaken`, if the environment provides supervision in the form of the actual action taken by the agent in spite of the provided action. +Note that environments must actually implement this as `_step`. -### stateSpec = env:getStateSpec() +### stateSpace = env:getStateSpace() Returns a state specification as a list with 3 elements: @@ -86,11 +93,11 @@ Returns a state specification as a list with 3 elements: If several states are returned, `stateSpec` is itself a list of state specifications. Ranges may use `nil` if unknown. -### actionSpec = env:getActionSpec() +### actionSpace = env:getActionSpace() Returns an action specification, with the same structure as used for state specifications. -### minReward, maxReward = env:getRewardSpec() +### minReward, maxReward = env:getRewardSpace() Returns the minimum and maximum rewards produced by the environment. Values may be `nil` if unknown. @@ -114,6 +121,10 @@ Returns an RGB display specification, with the same structure as used for state Returns a RGB display tensor for visualising the state of the environment. Note that this may not be the same as the state provided for the agent. +### env:render() + +Displays the environment using `image`. Requires the code to be run with `qlua` (rather than `th`) and `getDisplay` to be implemented by the environment. + ## Development Environments must inherit from `Env` and therefore implement the above methods (as well as a constructor). `experiment.lua` can be easily adapted for testing different environments. New environments should be added to `rlenvs/init.lua`, `rocks/rlenvs-scm-1.rockspec`, and be listed in this readme with an appropriate reference. For an example of a more complex environment that will only be installed if its optional dependencies are satisfied, see `rlenvs/Atari.lua`. @@ -121,18 +132,19 @@ Environments must inherit from `Env` and therefore implement the above methods ( ## References [1] Tanner, B., & White, A. (2009). RL-Glue: Language-independent software for reinforcement-learning experiments. *The Journal of Machine Learning Research, 10*, 2133-2136. -[2] DeJong, G., & Spong, M. W. (1994, June). Swinging up the acrobot: An example of intelligent control. In *American Control Conference, 1994* (Vol. 2, pp. 2158-2162). IEEE. -[3] Bellemare, M. G., Naddaf, Y., Veness, J., & Bowling, M. (2012). The arcade learning environment. *J. Artificial Intelligence Res, 47*, 253-279. -[4] Pérez-Uribe, A., & Sanchez, E. (1998, May). Blackjack as a test bed for learning strategies in neural networks. In *Neural Networks Proceedings, 1998. IEEE World Congress on Computational Intelligence. The 1998 IEEE International Joint Conference on* (Vol. 3, pp. 2022-2027). IEEE. -[5] Barto, A. G., Sutton, R. S., & Anderson, C. W. (1983). Neuronlike adaptive elements that can solve difficult learning control problems. *Systems, Man and Cybernetics, IEEE Transactions on*, (5), 834-846. 
-[6] Mnih, V., Heess, N., & Graves, A. (2014). Recurrent models of visual attention. In *Advances in Neural Information Processing Systems* (pp. 2204-2212). -[7] Sutton, R. S., & Barto, A. G. (1998). *Reinforcement learning: An introduction* (Vol. 1, No. 1). Cambridge: MIT press. -[8] Sutton, R. S. (1990). Integrated architectures for learning, planning, and reacting based on approximating dynamic programming. In *Proceedings of the seventh international conference on machine learning* (pp. 216-224). -[9] Boyan, J., & Moore, A. W. (1995). Generalization in reinforcement learning: Safely approximating the value function. *Advances in neural information processing systems*, 369-376. -[10] Johnson, M., Hofmann, K., Hutton, T., & Bignell, D. (2016). The Malmo platform for artificial intelligence experimentation. In *International joint conference on artificial intelligence (IJCAI)*. -[11] Singh, S. P., & Sutton, R. S. (1996). Reinforcement learning with replacing eligibility traces. *Machine learning, 22*(1-3), 123-158. -[12] Robbins, H. (1985). Some aspects of the sequential design of experiments. In *Herbert Robbins Selected Papers* (pp. 169-177). Springer New York. -[13] Whittle, P. (1988). Restless bandits: Activity allocation in a changing world. *Journal of applied probability*, 287-298. -[14] Sutton, R. S. (1988). Learning to predict by the methods of temporal differences. *Machine learning, 3*(1), 9-44. -[15] Dietterich, T. G. (2000). Hierarchical Reinforcement Learning with the MAXQ Value Function Decomposition. In *Journal of Artificial Intelligence Research*. -[16] Garnelo, M., Arulkumaran, K., & Shanahan, M. (2016). Towards Deep Symbolic Reinforcement Learning. *arXiv preprint arXiv:1609.05518*. +[2] Brockman, G., Cheung, V., Pettersson, L., Schneider, J., Schulman, J., Tang, J., & Zaremba, W. (2016). OpenAI Gym. *arXiv preprint arXiv:1606.01540*. +[3] DeJong, G., & Spong, M. W. (1994, June). Swinging up the acrobot: An example of intelligent control. In *American Control Conference, 1994* (Vol. 2, pp. 2158-2162). IEEE. +[4] Bellemare, M. G., Naddaf, Y., Veness, J., & Bowling, M. (2012). The arcade learning environment. *Journal of Artificial Intelligence Research, 47*, 253-279. +[5] Pérez-Uribe, A., & Sanchez, E. (1998, May). Blackjack as a test bed for learning strategies in neural networks. In *Neural Networks Proceedings, 1998. IEEE World Congress on Computational Intelligence. The 1998 IEEE International Joint Conference on* (Vol. 3, pp. 2022-2027). IEEE. +[6] Barto, A. G., Sutton, R. S., & Anderson, C. W. (1983). Neuronlike adaptive elements that can solve difficult learning control problems. *Systems, Man and Cybernetics, IEEE Transactions on*, (5), 834-846. +[7] Mnih, V., Heess, N., & Graves, A. (2014). Recurrent models of visual attention. In *Advances in Neural Information Processing Systems* (pp. 2204-2212). +[8] Sutton, R. S., & Barto, A. G. (1998). *Reinforcement learning: An introduction* (Vol. 1, No. 1). Cambridge: MIT press. +[9] Sutton, R. S. (1990). Integrated architectures for learning, planning, and reacting based on approximating dynamic programming. In *Proceedings of the Seventh International Conference on Machine Learning* (pp. 216-224). +[10] Boyan, J., & Moore, A. W. (1995). Generalization in reinforcement learning: Safely approximating the value function. *Advances in Neural Information Processing Systems*, 369-376. +[11] Johnson, M., Hofmann, K., Hutton, T., & Bignell, D. (2016). The Malmo platform for artificial intelligence experimentation. 
In *International Joint Conference on Artificial Intelligence*. +[12] Singh, S. P., & Sutton, R. S. (1996). Reinforcement learning with replacing eligibility traces. *Machine Learning, 22*(1-3), 123-158. +[13] Robbins, H. (1985). Some aspects of the sequential design of experiments. In *Herbert Robbins Selected Papers* (pp. 169-177). Springer New York. +[14] Whittle, P. (1988). Restless bandits: Activity allocation in a changing world. *Journal of Applied probability*, 287-298. +[15] Sutton, R. S. (1988). Learning to predict by the methods of temporal differences. *Machine Learning, 3*(1), 9-44. +[16] Dietterich, T. G. (2000). Hierarchical Reinforcement Learning with the MAXQ Value Function Decomposition. In *Journal of Artificial Intelligence Research*. +[17] Garnelo, M., Arulkumaran, K., & Shanahan, M. (2016). Towards Deep Symbolic Reinforcement Learning. In *Workshop on Deep Reinforcement Learning, NIPS 2016*. diff --git a/experiment.lua b/experiment.lua index a184662..fabd910 100644 --- a/experiment.lua +++ b/experiment.lua @@ -1,38 +1,31 @@ -local image = require 'image' -local Catch = require 'rlenvs/Catch' - --- Detect QT for image display -local qt = pcall(require, 'qt') +local Catch = require 'rlenvs.Catch' -- Initialise and start environment -local env = Catch({level = 2}) -local stateSpec = env:getStateSpec() -local actionSpec = env:getActionSpec() +local env = Catch({level = 2, render = true, zoom = 10}) +local actionSpace = env:getActionSpace() local observation = env:start() -local reward, terminal +local reward, terminal = 0, false local episodes, totalReward = 0, 0 -local nSteps = 1000 * (stateSpec[2][2] - 1) -- Run for 1000 episodes +local nEpisodes = 1000 -- Display -local window = qt and image.display({image=observation, zoom=10}) +env:render() -for i = 1, nSteps do - -- Pick random action and execute it - local action = torch.random(actionSpec[3][1], actionSpec[3][2]) - reward, observation, terminal = env:step(action) - totalReward = totalReward + reward +for i = 1, nEpisodes do + while not terminal do + -- Pick random action and execute it + local action = torch.random(0, actionSpace['n'] - 1) + reward, observation, terminal = env:step(action) + totalReward = totalReward + reward - -- Display - if qt then - image.display({image=observation, zoom=10, win=window}) + -- Display + env:render() end - -- If game finished, start again - if terminal then - episodes = episodes + 1 - observation = env:start() - end + episodes = episodes + 1 + observation = env:start() + terminal = false end print('Episodes: ' .. episodes) print('Total Reward: ' .. 
totalReward) diff --git a/rlenvs/Acrobot.lua b/rlenvs/Acrobot.lua index 38af8ff..2d8d693 100644 --- a/rlenvs/Acrobot.lua +++ b/rlenvs/Acrobot.lua @@ -1,11 +1,14 @@ local classic = require 'classic' local Acrobot, super = classic.class('Acrobot', Env) +Acrobot.timeStepLimit = 500 -- Constructor function Acrobot:_init(opts) opts = opts or {} - + opts.timeStepLimit = Acrobot.timeStepLimit + super._init(self, opts) + -- Constants self.g = opts.g or 9.8 self.m1 = opts.m1 or 1 -- Mass of link 1 @@ -21,27 +24,40 @@ function Acrobot:_init(opts) end -- 4 states returned, of type 'real', of dimensionality 1, with differing ranges -function Acrobot:getStateSpec() - return { - {'real', 1, {-math.pi, math.pi}}, -- Joint 1 angle - {'real', 1, {-math.pi, math.pi}}, -- Joint 2 angle - {'real', 1, {-4*math.pi, 4*math.pi}}, -- Joint 1 angular velocity - {'real', 1, {-9*math.pi, 9*math.pi}} -- Joint 2 angular velocity +function Acrobot:getStateSpace() + local state = {} + state['name'] = 'Box' + state['shape'] = {4} + state['low'] = { + -math.pi, -- Joint 1 angle + -math.pi, -- Joint 2 angle + -4 * math.pi, -- Joint 1 angular velocity + -9 * math.pi -- Joint 2 angular velocity + } + state['high'] = { + math.pi, -- Joint 1 angle + math.pi, -- Joint 2 angle + 4 * math.pi, -- Joint 1 angular velocity + 9 * math.pi -- Joint 2 angular velocity } + return state end -- 1 action required, of type 'int', of dimensionality 1, with second torque joint in {-1, 0, 1} -function Acrobot:getActionSpec() - return {'int', 1, {-1, 1}} +function Acrobot:getActionSpace() + local action = {} + action['name'] = 'Discrete' + action['n'] = 3 + return action end -- Min and max reward -function Acrobot:getRewardSpec() +function Acrobot:getRewardSpace() return -1, 0 end -- Resets the cart -function Acrobot:start() +function Acrobot:_start() -- Reset angles and velocities self.q1 = 0 -- Joint 1 angle self.q2 = 0 -- Joint 2 angle @@ -52,20 +68,19 @@ function Acrobot:start() end -- Swings the pole via torque on second joint -function Acrobot:step(action) +function Acrobot:_step(action) + action = action - 1 -- rescale the action local reward = -1 local terminal = false for t = 1, self.steps do -- Calculate motion of system - local d1 = self.m1*math.pow(self.lc1, 2) + self.m2*(math.pow(self.l1, 2) + math.pow(self.lc2, 2) + 2*self.l1*self.lc2*math.cos(self.q2)) + self.I1 + self.I2 - local d2 = self.m2*(math.pow(self.lc2, 2) + self.l1*self.lc2*math.cos(self.q2)) + self.I2 - local phi2 = self.m2*self.lc2*self.g*math.cos(self.q1 + self.q2 - math.pi/2) - local phi1 = -self.m2*self.l1*self.lc2*math.pow(self.q2Dot, 2)*math.sin(self.q2) - 2*self.m2*self.l1*self.lc2*self.q2Dot*self.q1Dot*math.sin(self.q2) + - (self.m1*self.lc1 + self.m2*self.l1)*self.g*math.cos(self.q1 - math.pi/2) + phi2 - local q2DotDot = (action + d2/d1*phi1 - self.m2*self.l1*self.lc2*math.pow(self.q1Dot, 2)*math.sin(self.q2) - phi2) / - (self.m2*math.pow(self.lc2, 2) + self.I2 - math.pow(d2, 2)/d1) - local q1DotDot = -(d2/q2DotDot + phi1)/d1 + local d1 = self.m1 * math.pow(self.lc1, 2) + self.m2 * (math.pow(self.l1, 2) + math.pow(self.lc2, 2) + 2 * self.l1 * self.lc2 * math.cos(self.q2)) + self.I1 + self.I2 + local d2 = self.m2 * (math.pow(self.lc2, 2) + self.l1 * self.lc2 * math.cos(self.q2)) + self.I2 + local phi2 = self.m2 * self.lc2 * self.g * math.cos(self.q1 + self.q2 - math.pi/2) + local phi1 = -self.m2 * self.l1 * self.lc2 * math.pow(self.q2Dot, 2) * math.sin(self.q2) - 2 * self.m2 * self.l1 * self.lc2 * self.q2Dot * self.q1Dot * math.sin(self.q2) + (self.m1 * self.lc1 
+ self.m2 * self.l1) * self.g * math.cos(self.q1 - math.pi / 2) + phi2 + local q2DotDot = (action + d2 / d1 * phi1 - self.m2 * self.l1 * self.lc2 * math.pow(self.q1Dot, 2) * math.sin(self.q2) - phi2) / (self.m2 * math.pow(self.lc2, 2) + self.I2 - math.pow(d2, 2) / d1) + local q1DotDot = -(d2 / q2DotDot + phi1) / d1 -- Update state using Euler's method self.q1Dot = self.q1Dot + self.tau * q1DotDot @@ -86,13 +101,13 @@ function Acrobot:step(action) self.q2 = math.pi - (self.q2 % -math.pi) end -- Limit velocities - self.q1Dot = math.max(self.q1Dot, -4*math.pi) - self.q1Dot = math.min(self.q1Dot, 4*math.pi) - self.q2Dot = math.max(self.q2Dot, -9*math.pi) - self.q2Dot = math.min(self.q2Dot, 9*math.pi) + self.q1Dot = math.max(self.q1Dot, -4 * math.pi) + self.q1Dot = math.min(self.q1Dot, 4 * math.pi) + self.q2Dot = math.max(self.q2Dot, -9 * math.pi) + self.q2Dot = math.min(self.q2Dot, 9 * math.pi) -- Terminate if second joint's height is greater than height of first joint (relative to origin) - local h = -self.l1*math.cos(self.q1) - self.l2*math.sin(math.pi/2 - self.q1 - self.q2) + local h = -self.l1 * math.cos(self.q1) - self.l2 * math.sin(math.pi / 2 - self.q1 - self.q2) if h > self.l1 then reward = 0 terminal = true diff --git a/rlenvs/Atari.lua b/rlenvs/Atari.lua index 83e1ea8..23cb39c 100644 --- a/rlenvs/Atari.lua +++ b/rlenvs/Atari.lua @@ -6,11 +6,15 @@ if not hasALEWrap then end local Atari, super = classic.class('Atari', Env) +Atari.timeStepLimit = 100000 -- Constructor function Atari:_init(opts) -- Create ALEWrap options from opts opts = opts or {} + opts.timeStepLimit = Atari.timeStepLimit + super._init(self, opts) + if opts.lifeLossTerminal == nil then opts.lifeLossTerminal = true end @@ -44,13 +48,25 @@ function Atari:_init(opts) end -- 1 state returned, of type 'real', of dimensionality 3 x 210 x 160, between 0 and 1 -function Atari:getStateSpec() - return {'real', {3, 210, 160}, {0, 1}} +function Atari:getStateSpace() + local state = {} + state['name'] = 'Box' + state['shape'] = {3, 210, 160} + state['low'] = { + 0 + } + state['high'] = { + 1 + } + return state end -- 1 action required, of type 'int', of dimensionality 1, between 1 and 18 (max) -function Atari:getActionSpec() - return {'int', 1, {1, #self.actions}} +function Atari:getActionSpace() + local action = {} + action['name'] = 'Discrete' + action['n'] = #self.actions + return action end -- RGB screen of height 210 and width 160 @@ -59,12 +75,12 @@ function Atari:getDisplaySpec() end -- Min and max reward (unknown) -function Atari:getRewardSpec() +function Atari:getRewardSpace() return nil, nil end -- Starts a new game, possibly with a random number of no-ops -function Atari:start() +function Atari:_start() local screen, reward, terminal if self.gameEnv._random_starts > 0 then @@ -77,7 +93,7 @@ function Atari:start() end -- Steps in a game -function Atari:step(action) +function Atari:_step(action) -- Map action index to action for game action = self.actions[action] diff --git a/rlenvs/Blackjack.lua b/rlenvs/Blackjack.lua index 7d436c4..b174455 100644 --- a/rlenvs/Blackjack.lua +++ b/rlenvs/Blackjack.lua @@ -7,31 +7,45 @@ local Blackjack, super = classic.class('Blackjack', Env) function Blackjack:_init(opts) opts = opts or {} + super._init(self, opts) + -- Create number-only suit self.suit = torch.Tensor({2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 11}) end -- 2 states returned, of type 'int', of dimensionality 1, for the player sum, dealer's showing card, and player-usable ace -function Blackjack:getStateSpec() - return { - 
{'int', 1, {2, 20}}, - {'int', 1, {1, 10}}, - {'int', 1, {0, 1}} +function Blackjack:getStateSpace() + local state = {} + state['name'] = 'Box' + state['shape'] = {3} + state['low'] = { + 2, + 1, + 0 + } + state['high'] = { + 20, + 10, + 1 } + return state end -- 1 action required, of type 'int', of dimensionality 1, either stand or hit -function Blackjack:getActionSpec() - return {'int', 1, {0, 1}} +function Blackjack:getActionSpace() + local action = {} + action['name'] = 'Discrete' + action['n'] = 2 + return action end -- Min and max reward -function Blackjack:getRewardSpec() +function Blackjack:getRewardSpace() return -1, 1 end -- Draw 2 cards for player and dealer -function Blackjack:start() +function Blackjack:_start() -- Shuffle deck self.deck = torch.cat({self.suit, self.suit, self.suit, self.suit}, 1):index(1, torch.randperm(52):long()) @@ -51,7 +65,7 @@ function Blackjack:start() end -- Player stands or hits -function Blackjack:step(action) +function Blackjack:_step(action) local reward = 0 local terminal = false diff --git a/rlenvs/CartPole.lua b/rlenvs/CartPole.lua index 5b1e49b..4a99613 100644 --- a/rlenvs/CartPole.lua +++ b/rlenvs/CartPole.lua @@ -1,11 +1,14 @@ local classic = require 'classic' local CartPole, super = classic.class('CartPole', Env) +CartPole.timeStepLimit = 200 -- Constructor function CartPole:_init(opts) opts = opts or {} - + opts.timeStepLimit = CartPole.timeStepLimit + super._init(self, opts) + -- Constants self.gravity = opts.gravity or 9.8 self.cartMass = opts.cartMass or 1.0 @@ -19,27 +22,40 @@ function CartPole:_init(opts) end -- 4 states returned, of type 'real', of dimensionality 1, with differing ranges -function CartPole:getStateSpec() - return { - {'real', 1, {-2.4, 2.4}}, -- Cart position - {'real', 1, {nil, nil}}, -- Cart velocity - {'real', 1, {math.rad(-12), math.rad(12)}}, -- Pole angle - {'real', 1, {nil, nil}} -- Pole angular velocity +function CartPole:getStateSpace() + local state = {} + state['name'] = 'Box' + state['shape'] = {4} + state['low'] = { + -2.4, -- Cart position + math.huge, -- Cart velocity + math.rad(-12), -- Pole angle + math.huge -- Pole angular velocity + } + state['high'] = { + 2.4, -- Cart position + math.huge, -- Cart velocity + math.rad(12), -- Pole angle + math.huge -- Pole angular velocity } + return state end -- 1 action required, of type 'int', of dimensionality 1, between 0 and 1 (left, right) -function CartPole:getActionSpec() - return {'int', 1, {0, 1}} +function CartPole:getActionSpace() + local action = {} + action['name'] = 'Discrete' + action['n'] = 2 + return action end -- Min and max reward -function CartPole:getRewardSpec() +function CartPole:getRewardSpace() return -1, 0 end -- Resets the cart -function CartPole:start() +function CartPole:_start() -- Reset position, angle and velocities self.x = 0 -- Cart position (m) self.xDot = 0 -- Cart velocity @@ -50,14 +66,14 @@ function CartPole:start() end -- Drives the cart -function CartPole:step(action) +function CartPole:_step(action) -- Calculate acceleration local force = action == 1 and self.forceMagnitude or -self.forceMagnitude local cosTheta = math.cos(self.theta) local sinTheta = math.sin(self.theta) - local temp = (force + 0.5*self.poleMassLength * math.pow(self.thetaDot, 2) * sinTheta) / self.totalMass - local thetaDotDot = (self.gravity * sinTheta - cosTheta * temp) / (0.5*self.poleLength * (4/3 - self.poleMass * math.pow(cosTheta, 2) / self.totalMass)) - local xDotDot = temp - 0.5*self.poleMassLength * thetaDotDot * cosTheta / self.totalMass + 
local temp = (force + 0.5 * self.poleMassLength * math.pow(self.thetaDot, 2) * sinTheta) / self.totalMass + local thetaDotDot = (self.gravity * sinTheta - cosTheta * temp) / (0.5 * self.poleLength * (4 / 3 - self.poleMass * math.pow(cosTheta, 2) / self.totalMass)) + local xDotDot = temp - 0.5 * self.poleMassLength * thetaDotDot * cosTheta / self.totalMass -- Update state using Euler's method self.x = self.x + self.tau * self.xDot @@ -66,10 +82,10 @@ function CartPole:step(action) self.thetaDot = self.thetaDot + self.tau * thetaDotDot -- Check failure (if cart reaches sides of track/pole tips too much) - local reward = 0 + local reward = 1 local terminal = false if self.x < -2.4 or self.x > 2.4 or self.theta < math.rad(-12) or self.theta > math.rad(12) then - reward = -1 + reward = 0 terminal = true end diff --git a/rlenvs/Catch.lua b/rlenvs/Catch.lua index 40ba990..4fc87e0 100644 --- a/rlenvs/Catch.lua +++ b/rlenvs/Catch.lua @@ -1,10 +1,12 @@ local classic = require 'classic' +local Env = require 'rlenvs/Env' local Catch, super = classic.class('Catch', Env) -- Constructor function Catch:_init(opts) opts = opts or {} + super._init(self, opts) -- Difficulty level self.level = opts.level or 2 @@ -28,13 +30,25 @@ function Catch:_init(opts) end -- 1 state returned, of type 'int', of dimensionality 1 x self.size x self.size, between 0 and 1 -function Catch:getStateSpec() - return {'int', {1, self.size, self.size}, {0, 1}} +function Catch:getStateSpace() + local state = {} + state['name'] = 'Box' + state['shape'] = {1, self.size, self.size} + state['low'] = { + 0 + } + state['high'] = { + 1 + } + return state end -- 1 action required, of type 'int', of dimensionality 1, between 0 and 2 -function Catch:getActionSpec() - return {'int', 1, {0, 2}} +function Catch:getActionSpace() + local action = {} + action['name'] = 'Discrete' + action['n'] = 3 + return action end -- RGB screen of size self.size x self.size @@ -43,7 +57,7 @@ function Catch:getDisplaySpec() end -- Min and max reward -function Catch:getRewardSpec() +function Catch:getRewardSpace() return 0, 1 end @@ -64,7 +78,7 @@ function Catch:redraw() end -- Starts new game -function Catch:start() +function Catch:_start() -- Reset player and ball self.player.x = math.ceil(self.size / 2) self.ball.x = torch.random(self.size) @@ -80,7 +94,7 @@ function Catch:start() end -- Steps in a game -function Catch:step(action) +function Catch:_step(action) -- Reward is 0 by default local reward = 0 diff --git a/rlenvs/CliffWalking.lua b/rlenvs/CliffWalking.lua index 41d029f..4200e4f 100644 --- a/rlenvs/CliffWalking.lua +++ b/rlenvs/CliffWalking.lua @@ -5,35 +5,48 @@ local CliffWalking, super = classic.class('CliffWalking', Env) -- Constructor function CliffWalking:_init(opts) opts = opts or {} + super._init(self, opts) end -- 2 states returned, of type 'int', of dimensionality 1, where x is 1-12 and y is 1-4 -function CliffWalking:getStateSpec() - return { - {'int', 1, {1, 12}}, -- x - {'int', 1, {1, 4}} -- y +function CliffWalking:getStateSpace() + local state = {} + state['name'] = 'Box' + state['shape'] = {2} + state['low'] = { + 1, -- x + 1 -- y } + state['high'] = { + 12, -- x + 4 -- y + } + return state end -- 1 action required, of type 'int', of dimensionality 1, between 1 and 4 (up|right|down|left) -function CliffWalking:getActionSpec() - return {'int', 1, {1, 4}} +function CliffWalking:getActionSpace() + local action = {} + action['name'] = 'Discrete' + action['n'] = 4 + return action end -- Min and max reward -function CliffWalking:getRewardSpec() 
+function CliffWalking:getRewardSpace() return -100, -1 end -- Reset position -function CliffWalking:start() +function CliffWalking:_start() self.position = {1, 1} return self.position end -- Move up, right, down or left -function CliffWalking:step(action) +function CliffWalking:_step(action) + action = action + 1 -- scale action local reward = -1 local terminal = false diff --git a/rlenvs/DynaMaze.lua b/rlenvs/DynaMaze.lua index 42faaf0..c1b9ddb 100644 --- a/rlenvs/DynaMaze.lua +++ b/rlenvs/DynaMaze.lua @@ -5,6 +5,7 @@ local DynaMaze, super = classic.class('DynaMaze', Env) -- Constructor function DynaMaze:_init(opts) opts = opts or {} + super._init(self, opts) -- Set change: none|blocking|shortcut self.change = opts.change or 'none' @@ -27,25 +28,36 @@ function DynaMaze:_init(opts) end -- 2 states returned, of type 'int', of dimensionality 1, where x is 1-9 and y is 1-6 -function DynaMaze:getStateSpec() - return { - {'int', 1, {1, 9}}, -- x - {'int', 1, {1, 6}} -- y +function DynaMaze:getStateSpace() + local state = {} + state['name'] = 'Box' + state['shape'] = {2} + state['low'] = { + 1, -- x + 1 -- y } + state['high'] = { + 9, -- x + 6 -- y + } + return state end -- 1 action required, of type 'int', of dimensionality 1, between 1 and 4 -function DynaMaze:getActionSpec() - return {'int', 1, {1, 4}} +function DynaMaze:getActionSpace() + local action = {} + action['name'] = 'Discrete' + action['n'] = 4 + return action end -- Min and max reward -function DynaMaze:getRewardSpec() +function DynaMaze:getRewardSpace() return 0, 1 end -- Reset position -function DynaMaze:start() +function DynaMaze:_start() if self.change == 'none' then self.position = {1, 4} else @@ -56,7 +68,8 @@ function DynaMaze:start() end -- Move up, right, down or left -function DynaMaze:step(action) +function DynaMaze:_step(action) + action = action + 1 -- scale action local reward = 0 local terminal = false diff --git a/rlenvs/Env.lua b/rlenvs/Env.lua index 262c5f4..824a31a 100644 --- a/rlenvs/Env.lua +++ b/rlenvs/Env.lua @@ -3,10 +3,58 @@ local classic = require 'classic' local Env = classic.class('Env') -- Denote interfaces -Env:mustHave('start') -Env:mustHave('step') -Env:mustHave('getStateSpec') -Env:mustHave('getActionSpec') -Env:mustHave('getRewardSpec') +Env:mustHave('_start') +Env:mustHave('_step') +Env:mustHave('getStateSpace') +Env:mustHave('getActionSpace') +Env:mustHave('getRewardSpace') + +function Env:_init(opts) + -- Set max number of steps per episode (default 1000) + if opts.timeStepLimit and opts.maxSteps then + self.maxSteps = math.min(opts.timeStepLimit, opts.maxSteps) + elseif opts.maxSteps then + self.maxSteps = opts.maxSteps + elseif opts.timeStepLimit then + self.maxSteps = opts.timeStepLimit + else + self.maxSteps = 1000 + end + self.currentStep = 1 + + if opts.render then + require 'image' + self.qt = pcall(require, 'qt') + if not self.qt then + print('Was not able to load qt to render, are you using qlua to run the script?') + end + self.zoom = opts.zoom or 1 + end +end + +function Env:step(action) + local reward, state, terminal = self:_step(action) + + if self.currentStep == self.maxSteps then + terminal = true + self.currentStep = 0 + end + self.currentStep = self.currentStep + 1 + + return reward, state, terminal +end + +function Env:start() + self.currentStep = 1 + local state = self:_start() + return state +end + +function Env:render() + if self.qt and self.getDisplay then + self.window = self.window == nil and image.display({image = self:getDisplay(), zoom = self.zoom}) or self.window + 
image.display({image = self:getDisplay(), zoom = self.zoom, win = self.window}) + end +end return Env diff --git a/rlenvs/GridWorld.lua b/rlenvs/GridWorld.lua index 66fba69..8bbf2b2 100644 --- a/rlenvs/GridWorld.lua +++ b/rlenvs/GridWorld.lua @@ -6,6 +6,7 @@ local GridWorld, super = classic.class('GridWorld', Env) -- Constructor function GridWorld:_init(opts) opts = opts or {} + super._init(self, opts) -- Cost of moving in world (discretized) self.world = torch.Tensor(101, 101):fill(-0.5) @@ -13,38 +14,50 @@ function GridWorld:_init(opts) -- PuddleWorld if opts.puddles then -- Create 2D Gaussians to subtract from world - self.world[{{30, 90}, {30, 50}}]:csub(image.gaussian({width=21, height=61})) - self.world[{{60, 80}, {1, 50}}]:csub(image.gaussian({width=60, height=21})[{{}, {11, 60}}]) + self.world[{{30, 90}, {30, 50}}]:csub(image.gaussian({width = 21, height = 61})) + self.world[{{60, 80}, {1, 50}}]:csub(image.gaussian({width = 60, height = 21})[{{}, {11, 60}}]) end end -- 2 states returned, of type 'real', of dimensionality 1, from 0-1 -function GridWorld:getStateSpec() - return { - {'real', 1, {0, 1}}, -- x - {'real', 1, {0, 1}} -- y +function GridWorld:getStateSpace() + local state = {} + state['name'] = 'Box' + state['shape'] = {2} + state['low'] = { + 0, -- x + 0 -- y } + state['high'] = { + 1, -- x + 1 -- y + } + return state end -- 1 action required, of type 'int', of dimensionality 1, between 1 and 4 -function GridWorld:getActionSpec() - return {'int', 1, {1, 4}} +function GridWorld:getActionSpace() + local action = {} + action['name'] = 'Discrete' + action['n'] = 4 + return action end -- Min and max reward -function GridWorld:getRewardSpec() +function GridWorld:getRewardSpace() return torch.min(self.world), 0 end -- Reset position -function GridWorld:start() +function GridWorld:_start() self.position = {0.2, 0.4} return self.position end -- Move up, right, down or left -function GridWorld:step(action) +function GridWorld:_step(action) + action = action + 1 -- scale action local terminal = false -- Move @@ -63,7 +76,7 @@ function GridWorld:step(action) end -- Look up cost of moving to position - local reward = self.world[{{self.position[1]*100+1}, {self.position[2]*100+1}}][1][1] + local reward = self.world[{{self.position[1] * 100 + 1}, {self.position[2] * 100 + 1}}][1][1] -- Check if reached goal if self.position[1] == 1 and self.position[2] == 1 then diff --git a/rlenvs/JacksCarRental.lua b/rlenvs/JacksCarRental.lua index fa9d3a3..a21719c 100644 --- a/rlenvs/JacksCarRental.lua +++ b/rlenvs/JacksCarRental.lua @@ -19,28 +19,40 @@ end -- Constructor function JacksCarRental:_init(opts) opts = opts or {} + super._init(self, opts) end -- 2 states returned, of type 'int', of dimensionality 1, for 0-20 cars -function JacksCarRental:getStateSpec() - return { - {'int', 1, {0, 20}}, -- Lot 1 - {'int', 1, {0, 20}} -- Lot 2 +function JacksCarRental:getStateSpace() + local state = {} + state['name'] = 'Box' + state['shape'] = {2} + state['low'] = { + 0, -- Lot 1 + 0 -- Lot 2 } + state['high'] = { + 20, -- Lot 1 + 20 -- Lot 2 + } + return state end -- 1 action required, of type 'int', of dimensionality 1, between -5 and 5 (max 5 cars can be moved overnight) -function JacksCarRental:getActionSpec() - return {'int', 1, {-5, 5}} -- Negative numbers indicate transferring cars from lot 2 to lot 1 +function JacksCarRental:getActionSpace() + local action = {} + action['name'] = 'Discrete' + action['n'] = 10 + return action end -- Min and max reward -function JacksCarRental:getRewardSpec() 
+function JacksCarRental:getRewardSpace() return 0, 200 end -- Resets the cars to 10 at each lot -function JacksCarRental:start() +function JacksCarRental:_start() self.lot1 = 10 self.lot2 = 10 @@ -48,18 +60,19 @@ function JacksCarRental:start() end -- Acts out a day and night for Jack's Car Rental -function JacksCarRental:step(action) +function JacksCarRental:_step(action) + action = action - 5 -- scale action local reward = 0 -- Reward in $ -- Customers rent cars from lot 1 during the day local lot1Rentals = math.min(poisson(3), self.lot1) self.lot1 = self.lot1 - lot1Rentals - reward = reward + 10*lot1Rentals + reward = reward + 10 * lot1Rentals -- Customers rent cars from lot 2 during the day local lot2Rentals = math.min(poisson(4), self.lot2) self.lot2 = self.lot2 - lot2Rentals - reward = reward + 10*lot2Rentals + reward = reward + 10 * lot2Rentals -- Customers return cars to lot 1 at the end of the day local lot1Returns = poisson(3) @@ -77,14 +90,14 @@ function JacksCarRental:step(action) -- Move cars self.lot1 = self.lot1 - carsMoved self.lot2 = self.lot2 + carsMoved - reward = reward - 2*carsMoved - elseif action < 0 then + reward = reward - 2 * carsMoved + elseif action < 0 then -- Negative numbers indicate transferring cars from lot 2 to lot 1 carsMoved = math.min(-action, self.lot2) carsMoved = math.min(carsMoved, 20 - self.lot1) -- Move cars self.lot2 = self.lot2 - carsMoved self.lot1 = self.lot1 + carsMoved - reward = reward - 2*carsMoved + reward = reward - 2 * carsMoved end return reward, {self.lot1, self.lot2}, false diff --git a/rlenvs/Minecraft.lua b/rlenvs/Minecraft.lua index 8997462..f31d850 100644 --- a/rlenvs/Minecraft.lua +++ b/rlenvs/Minecraft.lua @@ -16,12 +16,14 @@ end -- Constructor function Minecraft:_init(opts) + opts = opts or {} + super._init(self, opts) -- Check libaMalmoLua is available locally if not hasLibMalmoLua then - print("Requires libMalmoLua.so in LUA_CPATH") + print("Requires libMalmoLua.so") os.exit() end - + opts = opts or {} self.height = opts.height or 84 self.width = opts.width or 84 @@ -33,14 +35,14 @@ function Minecraft:_init(opts) - - - clear - false - + + + clear + false + @@ -69,7 +71,7 @@ function Minecraft:_init(opts) 1 - + @@ -77,12 +79,12 @@ function Minecraft:_init(opts) James Bond - + - 160 - 160 + 320 + 240 @@ -90,10 +92,11 @@ function Minecraft:_init(opts) attack - - - - + + + + + @@ -120,16 +123,28 @@ function Minecraft:_init(opts) end -- 2 states returned, of type 'real', of dimensionality 1, from 0-1 -function Minecraft:getStateSpec() - return {'real', {3, self.height, self.width}, {0, 1}} +function Minecraft:getStateSpace() + local state = {} + state['name'] = 'Box' + state['shape'] = {3, self.height, self.width} + state['low'] = { + 0 + } + state['high'] = { + 1 + } + return state end -function Minecraft:getActionSpec() - return {'int', 1, {1, #self.actions}} +function Minecraft:getActionSpace() + local action = {} + action['name'] = 'Discrete' + action['n'] = #self.actions + return action end --- Min and max reward -function Minecraft:getRewardSpec() +-- Min and max reward (unknown) +function Minecraft:getRewardSpace() return nil, nil end @@ -157,8 +172,16 @@ function Minecraft:getRewards(world_rewards) return proc_rewards end --- Reset position -function Minecraft:start() +-- Start new mission +function Minecraft:_start() + local world_state = self.agent_host:getWorldState() + + -- check if a previous mission is still running before starting a new one + if world_state.is_mission_running then + 
self.agent_host:sendCommand("quit") + sleep(0.5) + end + local mission = MissionSpec(self.mission_xml, true) local mission_record = MissionRecordSpec() @@ -173,9 +196,6 @@ function Minecraft:start() assert(channels == 3, "No RGB video output") assert(height == self.height or width == self.width, "Video output dimensions don't match those requested") - -- Set the time limit for mission (in seconds) - mission:timeLimitInSeconds(self.time_limit) - local status, err = pcall(function() self.agent_host:startMission( mission, mission_record ) end) if not status then print("Error starting mission: "..err) @@ -216,7 +236,7 @@ function Minecraft:start() end -- Move up, right, down or left -function Minecraft:step(action) +function Minecraft:_step(action) -- Do something local action = self.actions[action] self.agent_host:sendCommand(action) @@ -247,7 +267,7 @@ function Minecraft:step(action) self.proc_frames = self:processFrames(world_state.video_frames) end - local terminal = world_state.is_mission_running + local terminal = not world_state.is_mission_running sleep(0.1) diff --git a/rlenvs/MountainCar.lua b/rlenvs/MountainCar.lua index 8d62109..26daec2 100644 --- a/rlenvs/MountainCar.lua +++ b/rlenvs/MountainCar.lua @@ -1,32 +1,47 @@ local classic = require 'classic' local MountainCar, super = classic.class('MountainCar', Env) +MountainCar.timeStepLimit = 200 -- Constructor function MountainCar:_init(opts) opts = opts or {} + opts.timeStepLimit = MountainCar.timeStepLimit + + super._init(self, opts) end -- 2 states returned, of type 'real', of dimensionality 1, with differing ranges -function MountainCar:getStateSpec() - return { - {'real', 1, {-0.07, 0.07}}, -- Velocity - {'real', 1, {-1.2, 0.6}} -- Position +function MountainCar:getStateSpace() + local state = {} + state['name'] = 'Box' + state['shape'] = {2} + state['low'] = { + -0.07, -- Velocity + -1.2 -- Position + } + state['high'] = { + 0.07, -- Velocity + 0.6 -- Position } + return state end -- 1 action required, of type 'int', of dimensionality 1, between -1 and 1 (left, neutral, right) -function MountainCar:getActionSpec() - return {'int', 1, {-1, 1}} +function MountainCar:getActionSpace() + local action = {} + action['name'] = 'Discrete' + action['n'] = 3 + return action end -- Min and max reward -function MountainCar:getRewardSpec() +function MountainCar:getRewardSpace() return -2, 0 -- As height = sin(3x) is between -1 and 1, and reward = height - 1 end -- Resets the car -function MountainCar:start() +function MountainCar:_start() -- Reset position and velocity self.position = -0.5 self.velocity = 0 @@ -35,12 +50,13 @@ function MountainCar:start() end -- Drives the car -function MountainCar:step(action) +function MountainCar:_step(action) + action = action - 1 -- scale action -- Calculate height local height = math.sin(3*self.position) -- Update velocity and position - self.velocity = self.velocity + 0.001*action - 0.0025*math.cos(3*self.position) + self.velocity = self.velocity + 0.001 * action - 0.0025 * math.cos(3 * self.position) self.velocity = math.max(self.velocity, -0.07) self.velocity = math.min(self.velocity, 0.07) self.position = self.position + self.velocity diff --git a/rlenvs/MultiArmedBandit.lua b/rlenvs/MultiArmedBandit.lua index 9261221..d4e6c75 100644 --- a/rlenvs/MultiArmedBandit.lua +++ b/rlenvs/MultiArmedBandit.lua @@ -5,6 +5,7 @@ local MultiArmedBandit, super = classic.class('MultiArmedBandit', Env) -- Constructor function MultiArmedBandit:_init(opts) opts = opts or {} + super._init(self, opts) -- Restless 
bandits (with a Gaussian random walk) self.restless = opts.restless or false @@ -19,27 +20,31 @@ function MultiArmedBandit:_init(opts) end -- No state (not a contextual bandit) -function MultiArmedBandit:getStateSpec() +function MultiArmedBandit:getStateSpace() return nil end -- 1 action required, of type 'int', of dimensionality 1, of the number of arms -function MultiArmedBandit:getActionSpec() - return {'int', 1, {1, self.nArms}} +function MultiArmedBandit:getActionSpace() + local action = {} + action['name'] = 'Discrete' + action['n'] = self.nArms + return action end -- Min and max rewards unknown when sampling from distributions -function MultiArmedBandit:getRewardSpec() +function MultiArmedBandit:getRewardSpace() return nil, nil end -- Does nothing (distributions do not reset) -function MultiArmedBandit:start() +function MultiArmedBandit:_start() return nil end -- Pulls an arm -function MultiArmedBandit:step(action) +function MultiArmedBandit:_step(action) + action = action + 1 -- scale action -- Sample for reward local reward = torch.normal(self.armMeans[action], 1) diff --git a/rlenvs/RandomWalk.lua b/rlenvs/RandomWalk.lua index 8594330..a1df9b2 100644 --- a/rlenvs/RandomWalk.lua +++ b/rlenvs/RandomWalk.lua @@ -5,32 +5,39 @@ local RandomWalk, super = classic.class('RandomWalk', Env) -- Constructor function RandomWalk:_init(opts) opts = opts or {} + super._init(self, opts) end -- 1 states returned, of type 'int', of dimensionality 1, between 0 and 6 (the terminal states) -function RandomWalk:getStateSpec() - return {'int', 1, {0, 6}} -- Position +function RandomWalk:getStateSpace() + local state = {} + state['name'] = 'Discrete' + state['n'] = 6 + return state end -- 1 action required, of type 'int', of dimensionality 1, between 0 and 1 (left or right) -function RandomWalk:getActionSpec() - return {'int', 1, {0, 1}} +function RandomWalk:getActionSpace() + local action = {} + action['name'] = 'Discrete' + action['n'] = 2 + return action end -- Min and max reward -function RandomWalk:getRewardSpec() +function RandomWalk:getRewardSpace() return 0, 1 end -- Reset position -function RandomWalk:start() +function RandomWalk:_start() self.position = 3 return self.position end -- Move left or right -function RandomWalk:step(action) +function RandomWalk:_step(action) local reward = 0 local terminal = false diff --git a/rlenvs/Taxi.lua b/rlenvs/Taxi.lua index 61d0ba5..962b90c 100644 --- a/rlenvs/Taxi.lua +++ b/rlenvs/Taxi.lua @@ -12,10 +12,14 @@ local classic = require 'classic' --]] local Taxi, super = classic.class('Taxi', Env) +Taxi.timeStepLimit = 200 -- Constructor function Taxi:_init(opts) opts = opts or {} + opts.timeStepLimit = Taxi.timeStepLimit + + super._init(self, opts) -- Passenger positions (Red, Green, Blue, Yellow) self.rgbyPos = {{0, 4}, {4, 4}, {3, 0}, {0, 0}} @@ -24,28 +28,42 @@ function Taxi:_init(opts) end -- 4 states returned, of type 'int', of dimensionality 1, where x and y are 0-5, fuel is -1-12, passenger position is 1-5 and destination is 1-4 -function Taxi:getStateSpec() - return { - {'int', 1, {0, 4}}, -- x - {'int', 1, {0, 4}}, -- y - {'int', 1, {-1, 12}}, -- Fuel - {'int', 1, {1, 5}}, -- Passenger location - {'int', 1, {1, 4}}, -- Destination TODO: Work out why there are apparently 5 destination states in the original paper +function Taxi:getStateSpace() + local state = {} + state['name'] = 'Box' + state['shape'] = {5} + state['low'] = { + 0, -- x + 0, -- y + -1, -- Fuel + 1, -- Passenger location + 1 -- Destination TODO: Work out why there are apparently 5 
destination states in the original paper + } + state['high'] = { + 4, -- x + 4, -- y + 12, -- Fuel + 5, -- Passenger location + 4 -- Destination } + return state end -- 1 action required, of type 'int', of dimensionality 1, where 1-4 is move N, E, S, W, 5 is Pickup, 6 is Putdown and 7 is Fillup -function Taxi:getActionSpec() - return {'int', 1, {1, 7}} +function Taxi:getActionSpace() + local action = {} + action['name'] = 'Discrete' + action['n'] = 7 + return action end -- Min and max reward -function Taxi:getRewardSpec() +function Taxi:getRewardSpace() return -20, 20 end -- Reset position, fuel and passenger -function Taxi:start() +function Taxi:_start() -- Randomise position and fuel self.position = {torch.random(0, 4), torch.random(0, 4)} self.fuel = torch.random(5, 12) @@ -90,7 +108,7 @@ function Taxi:validMove(action) end -- Move up, right, down or left -function Taxi:step(action) +function Taxi:_step(action) local reward = -1 local terminal = false diff --git a/rlenvs/WindyWorld.lua b/rlenvs/WindyWorld.lua index 9cba841..00fdce1 100644 --- a/rlenvs/WindyWorld.lua +++ b/rlenvs/WindyWorld.lua @@ -5,42 +5,56 @@ local WindyWorld, super = classic.class('WindyWorld', Env) -- Constructor function WindyWorld:_init(opts) opts = opts or {} + super._init(self, opts) -- Allow king's moves (8 directions) self.king = opts.king or false end -- 2 states returned, of type 'int', of dimensionality 1, where x is 1-10 and y is 1-7 -function WindyWorld:getStateSpec() - return { - {'int', 1, {1, 10}}, -- x - {'int', 1, {1, 7}} -- y +function WindyWorld:getStateSpace() + local state = {} + state['name'] = 'Box' + state['shape'] = {5} + state['low'] = { + 1, -- x + 1 -- y } + state['high'] = { + 10, -- x + 7 -- y + } + return state end -- 1 action required, of type 'int', of dimensionality 1, between 1 and 4 (for standard) or 1 and 8 (for king) -function WindyWorld:getActionSpec() +function WindyWorld:getActionSpace() + local action = {} + action['name'] = 'Discrete' if self.king then - return {'int', 1, {1, 8}} + action['n'] = 8 + return action else - return {'int', 1, {1, 4}} + action['n'] = 4 + return action end end -- Min and max reward -function WindyWorld:getRewardSpec() +function WindyWorld:getRewardSpace() return -1, -1 end -- Reset position -function WindyWorld:start() +function WindyWorld:_start() self.position = {1, 4} return self.position end -- Move up, right, down or left -function WindyWorld:step(action) +function WindyWorld:_step(action) + action = action + 1 -- scale action local terminal = false -- Move diff --git a/rlenvs/XOWorld.lua b/rlenvs/XOWorld.lua index 691d61c..72f0ed9 100644 --- a/rlenvs/XOWorld.lua +++ b/rlenvs/XOWorld.lua @@ -6,6 +6,7 @@ local XOWorld, super = classic.class('XOWorld', Env) -- Constructor function XOWorld:_init(opts) opts = opts or {} + super._init(self, opts) -- Game mode (all circles, negative, or circles and crosses, negative and positive) self.double = opts.double or false @@ -102,24 +103,36 @@ function XOWorld:_init(opts) {67, 7}, {67, 27}, {67, 47}, {67, 67}} end --- 1 state returned, of type 'int', of dimensionality 1 x self.size x self.size, between 0 and 1 -function XOWorld:getStateSpec() - return {'int', {1, self.size, self.size}, {0, 1}} +-- 1 state returned, of type 'real', of dimensionality 3 x 210 x 160, between 0 and 1 +function XOWorld:getStateSpace() + local state = {} + state['name'] = 'Box' + state['shape'] = {1, self.size, self.size} + state['low'] = { + 0 + } + state['high'] = { + 1 + } + return state end -- 1 action required, of type 'int', 
of dimensionality 1, between 0 and 3 -function XOWorld:getActionSpec() - return {'int', 1, {0, 3}} +function XOWorld:getActionSpace() + local action = {} + action['name'] = 'Discrete' + action['n'] = 4 + return action end -- RGB screen of size self.size x self.size function XOWorld:getDisplaySpec() - return {'real', {3, self.size, self.size}, {0, 1}} + return {'real', {3, self.size, self.size}, {0, 1}} end -- Min and max reward -function XOWorld:getRewardSpec() - return -10, 10 +function XOWorld:getRewardSpace() + return -10, 10 end -- Redraws screen based on state and performs collision detection @@ -188,7 +201,7 @@ function XOWorld:update() end -- Starts new game -function XOWorld:start() +function XOWorld:_start() -- Reset time self.time = 1 @@ -257,7 +270,7 @@ function XOWorld:start() end -- Steps in a game -function XOWorld:step(action) +function XOWorld:_step(action) -- Move player if action == 0 then self.x = math.max(self.x - 1, 1) diff --git a/rlenvs/init.lua b/rlenvs/init.lua index c29cfc9..ca94a8a 100644 --- a/rlenvs/init.lua +++ b/rlenvs/init.lua @@ -1,7 +1,7 @@ local rlenvs = {} -- Include environments -rlenvs.Env = require 'rlenvs/Env' +Env = require 'rlenvs/Env' rlenvs.Acrobot = require 'rlenvs/Acrobot' rlenvs.Atari = require 'rlenvs/Atari' rlenvs.Blackjack = require 'rlenvs/Blackjack' @@ -17,6 +17,7 @@ rlenvs.MultiArmedBandit = require 'rlenvs/MultiArmedBandit' rlenvs.RandomWalk = require 'rlenvs/RandomWalk' rlenvs.Taxi = require 'rlenvs/Taxi' rlenvs.WindyWorld = require 'rlenvs/WindyWorld' +rlenvs.XOWorld = require 'rlenvs/XOWorld' -- Remove nil environments for k, v in pairs(rlenvs) do @@ -25,4 +26,10 @@ for k, v in pairs(rlenvs) do end end +local envs ={} +for k,_ in pairs(rlenvs) do + envs[#envs+1]=k +end +rlenvs.envs = envs + return rlenvs diff --git a/tests/test.lua b/tests/test.lua new file mode 100644 index 0000000..32a36f2 --- /dev/null +++ b/tests/test.lua @@ -0,0 +1,41 @@ +require 'torch' +local rlenvs = require 'rlenvs' + +local test = torch.TestSuite() +local tester + +function test.envs() + for index, env in ipairs(rlenvs.envs) do + local function runTest() + local Env = require('rlenvs.' .. env) + -- Initialise and start environment + local env = Env() + local actionSpace = env:getActionSpace() + local observation = env:start() + -- Pick random action and execute it + local action = torch.random(0, actionSpace['n'] - 1) + local reward, observation, terminal = env:step(action) + -- Display if implemented + env:render() + end + + if env == 'Atari' then + local hasALEWrap = pcall(require, 'alewrap') + if not hasALEWrap then + tester:assert(pcall(runTest), 'Failed to run rlenv environment ' .. env) + end + elseif env == 'Minecraft' then + local hasSocket = pcall(require, 'socket') + local hasLibMalmoLua = pcall(require, 'libMalmoLua') + if not hasSocket and hasLibMalmoLua then + tester:assert(pcall(runTest), 'Failed to run rlenv environment ' .. env) + end + else + tester:assert(pcall(runTest), 'Failed to run rlenv environment ' .. env) + end + end +end + +tester = torch.Tester() +tester:add(test) +tester:run() \ No newline at end of file
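
For reference, the new test harness above iterates the `rlenvs.envs` listing added in `rlenvs/init.lua`; a standalone sketch of that pattern (assuming `rlenvs` is installed) is shown below, and the suite itself should be runnable with `th tests/test.lua`:

```lua
local rlenvs = require 'rlenvs'

-- rlenvs.envs is an array of the names of all environments that loaded successfully
for _, name in ipairs(rlenvs.envs) do
  print(name)
end
```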