Upgrade week 6 reinforce homework to tf 2 (#447)
* upgrade to tf 2

* add a suggestion for students

Co-authored-by: MichaelSolotky <>
MichaelSolotky authored Aug 4, 2020
1 parent cdfbaa6 commit 0b89b6c
Showing 2 changed files with 102 additions and 107 deletions.
9 changes: 9 additions & 0 deletions week06_policy_based/local_setup.sh
@@ -0,0 +1,9 @@
#!/usr/bin/env bash

apt-get install -yqq ffmpeg
apt-get install -yqq python-opengl

python3 -m pip install --user gym==0.14.0
python3 -m pip install --user pygame
python3 -m pip install --user pyglet==1.3.2
python3 -m pip install --user tensorflow==2.0.0
200 changes: 93 additions & 107 deletions week06_policy_based/reinforce_tensorflow.ipynb
@@ -11,25 +11,40 @@
"Most of the code in this notebook is taken from approximate Q-learning, so you'll find it more or less familiar and even simpler."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"__Necessery dependencies:__\n",
"`ffmpeg`\n",
"`python-opengl`\n",
"`gym`\n",
"`pygame`\n",
"`pyglet`\n",
"`tensorflow==2.x`\n",
"\n",
"__Recomended dependencies:__\n",
"`gym==0.14.0`\n",
"`pyglet==1.3.2`"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import sys, os\n",
"import os, sys\n",
"\n",
"if 'google.colab' in sys.modules:\n",
" %tensorflow_version 1.x\n",
" \n",
" if not os.path.exists('.setup_complete'):\n",
" !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/setup_colab.sh -O- | bash\n",
" !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/local_setup.sh -O- | bash\n",
" !touch .setup_complete\n",
"\n",
"# This code creates a virtual display to draw game images on.\n",
"# It will have no effect if your machine has a monitor.\n",
"if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n",
" !bash ../xvfb start\n",
" os.environ['DISPLAY'] = ':1'"
"else:\n",
" pass\n",
" # Uncomment this and execute for an automatic setup (may need sudo)\n",
" # !./local_setup.sh"
]
},
{
@@ -38,17 +53,12 @@
"metadata": {},
"outputs": [],
"source": [
"import gym\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A caveat: we have received reports that the following cell may crash with `NameError: name 'base' is not defined`. The [suggested workaround](https://www.coursera.org/learn/practical-rl/discussions/all/threads/N2Pw652iEemRYQ6W2GuqHg/replies/te3HpQwOQ62tx6UMDoOt2Q/comments/o08gTqelT9KPIE6npX_S3A) is to install `gym==0.14.0` and `pyglet==1.3.2`."
"# This code creates a virtual display to draw game images on.\n",
"# It will have no effect if your machine has a monitor.\n",
"\n",
"if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n",
" !bash ../xvfb start\n",
" os.environ['DISPLAY'] = ':1'"
]
},
{
@@ -57,6 +67,11 @@
"metadata": {},
"outputs": [],
"source": [
"import gym\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"import numpy as np\n",
"\n",
"env = gym.make(\"CartPole-v0\")\n",
"\n",
"# gym compatibility: unwrap TimeLimit\n",
@@ -95,46 +110,24 @@
"source": [
"import tensorflow as tf\n",
"\n",
"sess = tf.InteractiveSession()"
"model = <YOUR CODE: define network graph to predict policy logits\n",
" using raw TF, Keras, or any other library you prefer.\n",
" Keep it simple: CartPole is not worth deep architectures>"
]
},
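For reference, one way to fill in the placeholder above is a small Keras MLP. This is a minimal sketch, not the reference solution; it assumes `state_dim` (a tuple such as `(4,)` for CartPole) and `n_actions` were defined earlier when the environment was created:

```python
# A minimal sketch, assuming state_dim and n_actions come from the environment
# cell above. Two small hidden layers are plenty for CartPole.
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=state_dim),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(n_actions),  # raw logits: no softmax on the output layer
])
```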
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# create input variables. We only need <s, a, r> for REINFORCE\n",
"ph_states = tf.placeholder('float32', (None,) + state_dim, name=\"states\")\n",
"ph_actions = tf.placeholder('int32', name=\"action_ids\")\n",
"ph_cumulative_rewards = tf.placeholder('float32', name=\"cumulative_returns\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"cell_type": "markdown",
"metadata": {},
"outputs": [],
"source": [
"from keras.models import Sequential\n",
"from keras.layers import Dense\n",
"\n",
"<YOUR CODE: define network graph using raw TF, Keras, or any other library you prefer>\n",
"\n",
"logits = <YOUR CODE: symbolic outputs of your network _before_ softmax>\n",
"\n",
"policy = tf.nn.softmax(logits)\n",
"log_policy = tf.nn.log_softmax(logits)"
"#### Predict function"
]
},
{
"cell_type": "code",
"execution_count": null,
"cell_type": "markdown",
"metadata": {},
"outputs": [],
"source": [
"# Initialize model parameters\n",
"sess.run(tf.global_variables_initializer())"
"Note: output value of this function is not a tf tensor, it's a numpy array.\n",
"So, here gradient calculation is not needed. If you wrote in pytorch, you would need something like `torch.no_grad` to avoid calculation of gradients. Tensorflow doesn't compute gradients at forward pass, so no additional actions needed here."
]
},
{
@@ -144,12 +137,15 @@
"outputs": [],
"source": [
"def predict_probs(states):\n",
" \"\"\" \n",
" \"\"\"\n",
" Predict action probabilities given states.\n",
" :param states: numpy array of shape [batch, state_shape]\n",
" :returns: numpy array of shape [batch, n_actions]\n",
" \"\"\"\n",
" return policy.eval({ph_states: [states]})[0]"
" states = <YOUR CODE: transform this numpy array into a tf tensor>\n",
" logits = model(states)\n",
" policy = <YOUR CODE: get policy and transform it into a numpy array>\n",
" return policy"
]
},
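A possible completion of `predict_probs`, assuming `model` returns raw logits as above; a sketch only:

```python
def predict_probs(states):
    """
    Predict action probabilities given states.
    :param states: numpy array of shape [batch, state_shape]
    :returns: numpy array of shape [batch, n_actions]
    """
    states = tf.convert_to_tensor(states, dtype=tf.float32)  # numpy -> tf tensor
    logits = model(states)
    policy = tf.nn.softmax(logits).numpy()  # back to numpy, as the note above explains
    return policy
```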
{
@@ -168,7 +164,7 @@
"outputs": [],
"source": [
"def generate_session(env, t_max=1000):\n",
" \"\"\" \n",
" \"\"\"\n",
" Play a full session with REINFORCE agent.\n",
" Returns sequences of states, actions, and rewards.\n",
" \"\"\"\n",
@@ -178,7 +174,7 @@
"\n",
" for t in range(t_max):\n",
" # action probabilities array aka pi(a|s)\n",
" action_probs = predict_probs(s)\n",
" action_probs = predict_probs(np.asarray([s]))[0]\n",
"\n",
" # Sample action with given probabilities.\n",
" a = <YOUR CODE>\n",
@@ -231,9 +227,9 @@
" gamma=0.99 # discount for reward\n",
" ):\n",
" \"\"\"\n",
" Take a list of immediate rewards r(s,a) for the whole session \n",
" Take a list of immediate rewards r(s,a) for the whole session\n",
" and compute cumulative returns (a.k.a. G(s,a) in Sutton '16).\n",
" \n",
"\n",
" G_t = r_t + gamma*r_{t+1} + gamma^2*r_{t+2} + ...\n",
"\n",
" A simple way to compute cumulative rewards is to iterate from the last\n",
@@ -251,7 +247,7 @@
"metadata": {},
"outputs": [],
"source": [
"assert len(get_cumulative_rewards(range(100))) == 100\n",
"assert len(get_cumulative_rewards(list(range(100)))) == 100\n",
"assert np.allclose(\n",
" get_cumulative_rewards([0, 0, 1, 0, 0, 1, 0], gamma=0.9),\n",
" [1.40049, 1.5561, 1.729, 0.81, 0.9, 1.0, 0.0])\n",
@@ -293,22 +289,12 @@
"metadata": {},
"outputs": [],
"source": [
"# This code selects the log-probabilities (log pi(a_i|s_i)) for those actions that were actually played.\n",
"indices = tf.stack([tf.range(tf.shape(log_policy)[0]), ph_actions], axis=-1)\n",
"log_policy_for_actions = tf.gather_nd(log_policy, indices)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Policy objective as in the last formula. Please use reduce_mean, not reduce_sum.\n",
"# You may use log_policy_for_actions to get log probabilities for actions taken.\n",
"# Also recall that we defined ph_cumulative_rewards earlier.\n",
"\n",
"J = <YOUR CODE>"
"def select_log_policy_for_actions(log_policy, actions):\n",
" # This code selects the log-probabilities (log pi(a_i|s_i))\n",
" # for those actions that were actually played.\n",
" indices = tf.stack([tf.range(tf.shape(log_policy)[0]), actions], axis=-1)\n",
" log_policy_for_actions = tf.gather_nd(log_policy, indices)\n",
" return log_policy_for_actions"
]
},
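A toy example of what `select_log_policy_for_actions` computes (values chosen for illustration):

```python
log_policy = tf.math.log(tf.constant([[0.5, 0.5],
                                      [0.9, 0.1]]))
actions = tf.constant([1, 0], dtype=tf.int32)
# Picks log pi(a_i|s_i) row by row: [log 0.5, log 0.9]
print(select_log_policy_for_actions(log_policy, actions))
```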
{
@@ -326,10 +312,7 @@
"metadata": {},
"outputs": [],
"source": [
"# Entropy regularization. If you don't add it, the policy will quickly deteriorate to\n",
"# being deterministic, harming exploration.\n",
"\n",
"entropy = <YOUR CODE: compute entropy. Do not forget the sign!>"
"optimizer = <YOUR CODE: select your favorite optimizer, set its hyperparameters>"
]
},
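For instance, plain Adam is a reasonable default here; the learning rate below is a guess, not a tuned value:

```python
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
```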
{
@@ -338,39 +321,29 @@
"metadata": {},
"outputs": [],
"source": [
"# # Maximizing X is the same as minimizing -X, hence the sign.\n",
"loss = -(J + 0.1 * entropy)\n",
"\n",
"update = tf.train.AdamOptimizer().minimize(loss)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def train_on_session(states, actions, rewards, t_max=1000):\n",
"def train_on_session(states, actions, rewards):\n",
" \"\"\"given full session, trains agent with policy gradient\"\"\"\n",
" cumulative_rewards = get_cumulative_rewards(rewards)\n",
" update.run({\n",
" ph_states: states,\n",
" ph_actions: actions,\n",
" ph_cumulative_rewards: cumulative_rewards,\n",
" })\n",
" cumulative_returns = <YOUR CODE: get cumulative rewards>\n",
"\n",
" states = tf.keras.backend.constant(states)\n",
" cumulative_returns = tf.keras.backend.constant(cumulative_returns)\n",
" actions = tf.keras.backend.constant(actions, dtype='int32')\n",
"\n",
" with tf.GradientTape() as tape:\n",
" logits = <YOUR CODE>\n",
" policy = tf.nn.softmax(logits)\n",
" log_policy = tf.nn.log_softmax(logits)\n",
" log_policy_for_actions = <YOUR CODE>\n",
"\n",
" J = <YOUR CODE>\n",
" entropy = <YOUR CODE, mind the sign>\n",
" loss = -(J + 0.1 * entropy)\n",
" grads = tape.gradient(loss, model.trainable_variables)\n",
" optimizer.apply_gradients(zip(grads, model.trainable_variables))\n",
"\n",
" return sum(rewards)"
]
},
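The placeholders inside the `GradientTape` correspond to the REINFORCE objective with entropy regularization. One common way to write them, shown as a sketch rather than the checked solution:

```python
# Inside the tape, assuming the helpers defined above:
logits = model(states)
policy = tf.nn.softmax(logits)
log_policy = tf.nn.log_softmax(logits)
log_policy_for_actions = select_log_policy_for_actions(log_policy, actions)

# REINFORCE objective: mean of log pi(a|s) weighted by the returns G(s, a).
J = tf.reduce_mean(log_policy_for_actions * cumulative_returns)
# Entropy of the policy; mind the minus sign in the definition.
entropy = -tf.reduce_mean(tf.reduce_sum(policy * log_policy, axis=-1))
```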
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Initialize optimizer parameters\n",
"sess.run(tf.global_variables_initializer())"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -438,9 +411,22 @@
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"pygments_lexer": "ipython3"
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
