Upgrade week 6 reinforce homework to tf 2 (#447)
* upgrade to tf 2

* add a suggestion for students

Co-authored-by: MichaelSolotky <>
MichaelSolotky authored Aug 4, 2020
1 parent cdfbaa6 commit 0b89b6c
Showing 2 changed files with 102 additions and 107 deletions.
9 changes: 9 additions & 0 deletions week06_policy_based/local_setup.sh
@@ -0,0 +1,9 @@
#!/usr/bin/env bash

apt-get install -yqq ffmpeg
apt-get install -yqq python-opengl

python3 -m pip install --user gym==0.14.0
python3 -m pip install --user pygame
python3 -m pip install --user pyglet==1.3.2
python3 -m pip install --user tensorflow==2.0.0
200 changes: 93 additions & 107 deletions week06_policy_based/reinforce_tensorflow.ipynb
@@ -11,25 +11,40 @@
"Most of the code in this notebook is taken from approximate Q-learning, so you'll find it more or less familiar and even simpler."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"__Necessery dependencies:__\n",
"`ffmpeg`\n",
"`python-opengl`\n",
"`gym`\n",
"`pygame`\n",
"`pyglet`\n",
"`tensorflow==2.x`\n",
"\n",
"__Recomended dependencies:__\n",
"`gym==0.14.0`\n",
"`pyglet==1.3.2`"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import sys, os\n",
"import os, sys\n",
"\n",
"if 'google.colab' in sys.modules:\n",
" %tensorflow_version 1.x\n",
" \n",
" if not os.path.exists('.setup_complete'):\n",
" !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/setup_colab.sh -O- | bash\n",
" !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/local_setup.sh -O- | bash\n",
" !touch .setup_complete\n",
"\n",
"# This code creates a virtual display to draw game images on.\n",
"# It will have no effect if your machine has a monitor.\n",
"if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n",
" !bash ../xvfb start\n",
" os.environ['DISPLAY'] = ':1'"
"else:\n",
" pass\n",
" # Uncomment this and execute for an automatic setup (may need sudo)\n",
" # !./local_setup.sh"
]
},
{
@@ -38,17 +53,12 @@
"metadata": {},
"outputs": [],
"source": [
"import gym\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A caveat: we have received reports that the following cell may crash with `NameError: name 'base' is not defined`. The [suggested workaround](https://www.coursera.org/learn/practical-rl/discussions/all/threads/N2Pw652iEemRYQ6W2GuqHg/replies/te3HpQwOQ62tx6UMDoOt2Q/comments/o08gTqelT9KPIE6npX_S3A) is to install `gym==0.14.0` and `pyglet==1.3.2`."
"# This code creates a virtual display to draw game images on.\n",
"# It will have no effect if your machine has a monitor.\n",
"\n",
"if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n",
" !bash ../xvfb start\n",
" os.environ['DISPLAY'] = ':1'"
]
},
{
@@ -57,6 +67,11 @@
"metadata": {},
"outputs": [],
"source": [
"import gym\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"import numpy as np\n",
"\n",
"env = gym.make(\"CartPole-v0\")\n",
"\n",
"# gym compatibility: unwrap TimeLimit\n",
@@ -95,46 +110,24 @@
"source": [
"import tensorflow as tf\n",
"\n",
"sess = tf.InteractiveSession()"
"model = <YOUR CODE: define network graph to predict policy logits\n",
" using raw TF, Keras, or any other library you prefer.\n",
" Keep it simple: CartPole is not worth deep architectures>"
]
},
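For reference, one way to fill in the placeholder above is a small Keras MLP. This is a minimal sketch, not the reference solution; it assumes `state_dim` (a tuple such as `(4,)` for CartPole) and `n_actions` were defined earlier when the environment was created:

```python
# A minimal sketch, assuming state_dim and n_actions come from the environment
# cell above. Two small hidden layers are plenty for CartPole.
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=state_dim),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(n_actions),  # raw logits: no softmax on the output layer
])
```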
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# create input variables. We only need <s, a, r> for REINFORCE\n",
"ph_states = tf.placeholder('float32', (None,) + state_dim, name=\"states\")\n",
"ph_actions = tf.placeholder('int32', name=\"action_ids\")\n",
"ph_cumulative_rewards = tf.placeholder('float32', name=\"cumulative_returns\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"cell_type": "markdown",
"metadata": {},
"outputs": [],
"source": [
"from keras.models import Sequential\n",
"from keras.layers import Dense\n",
"\n",
"<YOUR CODE: define network graph using raw TF, Keras, or any other library you prefer>\n",
"\n",
"logits = <YOUR CODE: symbolic outputs of your network _before_ softmax>\n",
"\n",
"policy = tf.nn.softmax(logits)\n",
"log_policy = tf.nn.log_softmax(logits)"
"#### Predict function"
]
},
{
"cell_type": "code",
"execution_count": null,
"cell_type": "markdown",
"metadata": {},
"outputs": [],
"source": [
"# Initialize model parameters\n",
"sess.run(tf.global_variables_initializer())"
"Note: output value of this function is not a tf tensor, it's a numpy array.\n",
"So, here gradient calculation is not needed. If you wrote in pytorch, you would need something like `torch.no_grad` to avoid calculation of gradients. Tensorflow doesn't compute gradients at forward pass, so no additional actions needed here."
]
},
{
@@ -144,12 +137,15 @@
"outputs": [],
"source": [
"def predict_probs(states):\n",
" \"\"\" \n",
" \"\"\"\n",
" Predict action probabilities given states.\n",
" :param states: numpy array of shape [batch, state_shape]\n",
" :returns: numpy array of shape [batch, n_actions]\n",
" \"\"\"\n",
" return policy.eval({ph_states: [states]})[0]"
" states = <YOUR CODE: transform this numpy array into a tf tensor>\n",
" logits = model(states)\n",
" policy = <YOUR CODE: get policy and transform it into a numpy array>\n",
" return policy"
]
},
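A possible completion of `predict_probs`, assuming `model` returns raw logits as above; a sketch only:

```python
def predict_probs(states):
    """
    Predict action probabilities given states.
    :param states: numpy array of shape [batch, state_shape]
    :returns: numpy array of shape [batch, n_actions]
    """
    states = tf.convert_to_tensor(states, dtype=tf.float32)  # numpy -> tf tensor
    logits = model(states)
    policy = tf.nn.softmax(logits).numpy()  # back to numpy, as the note above explains
    return policy
```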
{
@@ -168,7 +164,7 @@
"outputs": [],
"source": [
"def generate_session(env, t_max=1000):\n",
" \"\"\" \n",
" \"\"\"\n",
" Play a full session with REINFORCE agent.\n",
" Returns sequences of states, actions, and rewards.\n",
" \"\"\"\n",
@@ -178,7 +174,7 @@
"\n",
" for t in range(t_max):\n",
" # action probabilities array aka pi(a|s)\n",
" action_probs = predict_probs(s)\n",
" action_probs = predict_probs(np.asarray([s]))[0]\n",
"\n",
" # Sample action with given probabilities.\n",
" a = <YOUR CODE>\n",
@@ -231,9 +227,9 @@
" gamma=0.99 # discount for reward\n",
" ):\n",
" \"\"\"\n",
" Take a list of immediate rewards r(s,a) for the whole session \n",
" Take a list of immediate rewards r(s,a) for the whole session\n",
" and compute cumulative returns (a.k.a. G(s,a) in Sutton '16).\n",
" \n",
"\n",
" G_t = r_t + gamma*r_{t+1} + gamma^2*r_{t+2} + ...\n",
"\n",
" A simple way to compute cumulative rewards is to iterate from the last\n",
@@ -251,7 +247,7 @@
"metadata": {},
"outputs": [],
"source": [
"assert len(get_cumulative_rewards(range(100))) == 100\n",
"assert len(get_cumulative_rewards(list(range(100)))) == 100\n",
"assert np.allclose(\n",
" get_cumulative_rewards([0, 0, 1, 0, 0, 1, 0], gamma=0.9),\n",
" [1.40049, 1.5561, 1.729, 0.81, 0.9, 1.0, 0.0])\n",
@@ -293,22 +289,12 @@
"metadata": {},
"outputs": [],
"source": [
"# This code selects the log-probabilities (log pi(a_i|s_i)) for those actions that were actually played.\n",
"indices = tf.stack([tf.range(tf.shape(log_policy)[0]), ph_actions], axis=-1)\n",
"log_policy_for_actions = tf.gather_nd(log_policy, indices)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Policy objective as in the last formula. Please use reduce_mean, not reduce_sum.\n",
"# You may use log_policy_for_actions to get log probabilities for actions taken.\n",
"# Also recall that we defined ph_cumulative_rewards earlier.\n",
"\n",
"J = <YOUR CODE>"
"def select_log_policy_for_actions(log_policy, actions):\n",
" # This code selects the log-probabilities (log pi(a_i|s_i))\n",
" # for those actions that were actually played.\n",
" indices = tf.stack([tf.range(tf.shape(log_policy)[0]), actions], axis=-1)\n",
" log_policy_for_actions = tf.gather_nd(log_policy, indices)\n",
" return log_policy_for_actions"
]
},
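A toy example of what `select_log_policy_for_actions` computes (values chosen for illustration):

```python
log_policy = tf.math.log(tf.constant([[0.5, 0.5],
                                      [0.9, 0.1]]))
actions = tf.constant([1, 0], dtype=tf.int32)
# Picks log pi(a_i|s_i) row by row: [log 0.5, log 0.9]
print(select_log_policy_for_actions(log_policy, actions))
```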
{
@@ -326,10 +312,7 @@
"metadata": {},
"outputs": [],
"source": [
"# Entropy regularization. If you don't add it, the policy will quickly deteriorate to\n",
"# being deterministic, harming exploration.\n",
"\n",
"entropy = <YOUR CODE: compute entropy. Do not forget the sign!>"
"optimizer = <YOUR CODE: select your favorite optimizer, set its hyperparameters>"
]
},
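For instance, plain Adam is a reasonable default here; the learning rate below is a guess, not a tuned value:

```python
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
```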
{
@@ -338,39 +321,29 @@
"metadata": {},
"outputs": [],
"source": [
"# # Maximizing X is the same as minimizing -X, hence the sign.\n",
"loss = -(J + 0.1 * entropy)\n",
"\n",
"update = tf.train.AdamOptimizer().minimize(loss)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def train_on_session(states, actions, rewards, t_max=1000):\n",
"def train_on_session(states, actions, rewards):\n",
" \"\"\"given full session, trains agent with policy gradient\"\"\"\n",
" cumulative_rewards = get_cumulative_rewards(rewards)\n",
" update.run({\n",
" ph_states: states,\n",
" ph_actions: actions,\n",
" ph_cumulative_rewards: cumulative_rewards,\n",
" })\n",
" cumulative_returns = <YOUR CODE: get cumulative rewards>\n",
"\n",
" states = tf.keras.backend.constant(states)\n",
" cumulative_returns = tf.keras.backend.constant(cumulative_returns)\n",
" actions = tf.keras.backend.constant(actions, dtype='int32')\n",
"\n",
" with tf.GradientTape() as tape:\n",
" logits = <YOUR CODE>\n",
" policy = tf.nn.softmax(logits)\n",
" log_policy = tf.nn.log_softmax(logits)\n",
" log_policy_for_actions = <YOUR CODE>\n",
"\n",
" J = <YOUR CODE>\n",
" entropy = <YOUR CODE, mind the sign>\n",
" loss = -(J + 0.1 * entropy)\n",
" grads = tape.gradient(loss, model.trainable_variables)\n",
" optimizer.apply_gradients(zip(grads, model.trainable_variables))\n",
"\n",
" return sum(rewards)"
]
},
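The placeholders inside the `GradientTape` correspond to the REINFORCE objective with entropy regularization. One common way to write them, shown as a sketch rather than the checked solution:

```python
# Inside the tape, assuming the helpers defined above:
logits = model(states)
policy = tf.nn.softmax(logits)
log_policy = tf.nn.log_softmax(logits)
log_policy_for_actions = select_log_policy_for_actions(log_policy, actions)

# REINFORCE objective: mean of log pi(a|s) weighted by the returns G(s, a).
J = tf.reduce_mean(log_policy_for_actions * cumulative_returns)
# Entropy of the policy; mind the minus sign in the definition.
entropy = -tf.reduce_mean(tf.reduce_sum(policy * log_policy, axis=-1))
```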
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Initialize optimizer parameters\n",
"sess.run(tf.global_variables_initializer())"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -438,9 +411,22 @@
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"pygments_lexer": "ipython3"
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
