\section{Bellman Equations}
\newcommand{\vfunc}[1]{v_\pi(#1)}
\newcommand{\policy}[2]{\pi(#1 | #2)}

\todo{RL backup diagrams}

\subsection{State Value Function}

\subsection{Optimal Action State Value Function recursive}

\section{Advantage Function}
\todo{Advantage Function Definition}

\section{Policy, Policy Gradient}

\subsection{Policy Gradient}
\begin{align}
\nabla_\theta \policy{s}{a} = \policy{s}{a} \nabla_\theta \log \policy{s}{a}
\end{align}
Note: this holds for any probability distribution (the policy is a distribution over actions given states). The gradient term on the right-hand side is called the score function. The derivation essentially uses the ``log-trick''.
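Spelled out (a one-line check added here for completeness), the log-trick is just the chain rule applied to $\log$:
\begin{align}
\nabla_\theta \log \policy{s}{a} = \frac{\nabla_\theta \policy{s}{a}}{\policy{s}{a}}
\quad\Longrightarrow\quad
\nabla_\theta \policy{s}{a} = \policy{s}{a} \nabla_\theta \log \policy{s}{a}
\end{align}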

\subsection{Proximal Policy Optimization (PPO)}
\newcommand{\oldpolicy}[2]{\pi_{\theta_{\mathrm{old}}}({#1} | {#2})}
\begin{enumerate}
\item \url{https://arxiv.org/abs/1707.06347}.
\end{enumerate}
Clipped surrogate objective:
\begin{align}
L^{\mathrm{CLIP}}(\theta)
= \EE{
\min\bigl(r_t(\theta)\hat{A}_t,\ \mathrm{clip}(r_t(\theta), 1-\epsilon, 1+\epsilon)\hat{A}_t\bigr)
}
\end{align}
where $r_t(\theta) = \pi_\theta(a_t | s_t) / \oldpolicy{a_t}{s_t}$ is the probability ratio between the current and the old policy.
The full objective in the paper adds further terms, in particular a value-function error term and an entropy bonus.
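As an illustration only (not from the paper's code; the function and argument names are mine), a NumPy sketch of $L^{\mathrm{CLIP}}$, assuming per-timestep log-probabilities under the new and old policy and advantage estimates are already available as arrays:
\begin{verbatim}
import numpy as np

def clipped_surrogate(logp_new, logp_old, advantages, eps=0.2):
    # r_t(theta) = pi_theta(a_t|s_t) / pi_theta_old(a_t|s_t)
    ratio = np.exp(logp_new - logp_old)
    unclipped = ratio * advantages
    clipped = np.clip(ratio, 1.0 - eps, 1.0 + eps) * advantages
    # elementwise min, then empirical mean over timesteps
    return np.mean(np.minimum(unclipped, clipped))
\end{verbatim}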
The advantage $\hat{A}_t$ in this case means
\begin{align}
\hat{A}_t &= -V(s_t) + r_t + \gamma r_{t+1} + \dots + \gamma^{T-t} V(s_T)
\end{align}
If the advantage is positive, the estimated value of the current state $s_t$ is smaller than the return you actually collected over the following time steps, i.e.\ the estimated state value was too low.
If the advantage is negative, the estimate was too high.
In generalized form, using the temporal-difference residual $\delta_t$:
\begin{align}
\hat{A}_t &= \delta_t + (\gamma\lambda)\delta_{t+1} + \dots + (\gamma\lambda)^{T-t+1}\delta_{T-1} \\
\delta_t &= r_t + \gamma V(s_{t+1}) - V(s_t)
\end{align}
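The same estimate can be computed in one backward pass over a trajectory segment via the recursion $\hat{A}_t = \delta_t + \gamma\lambda\hat{A}_{t+1}$. A sketch (assuming a single non-terminating segment; names and the default $\gamma$, $\lambda$ values are mine):
\begin{verbatim}
import numpy as np

def gae_advantages(rewards, values, gamma=0.99, lam=0.95):
    # rewards: r_0..r_{T-1}; values: V(s_0)..V(s_T) (one extra bootstrap entry)
    rewards, values = np.asarray(rewards), np.asarray(values)
    deltas = rewards + gamma * values[1:] - values[:-1]   # delta_t
    advantages = np.zeros(len(rewards))
    gae = 0.0
    for t in reversed(range(len(rewards))):
        gae = deltas[t] + gamma * lam * gae
        advantages[t] = gae
    return advantages
\end{verbatim}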
93
+
94
+ \begin {algorithm }[H]
95
+ \For {iteration=1,2,\dots }{
96
+ \For {actor=1,2, \dots }{
97
+ Run policy $ \pi _{\theta _{\mathrm {old}}}$ in environment for $ T$ timesteps\\
98
+ Compute advantage estimates $ \hat {A_1}, \dots , \hat {A_t}$
99
+ }
100
+ Optimize surrogate $ L$ wrt $ \theta $ , with $ K$ epochs and minibatch size $ M \leq NT$ \\
101
+ $ \theta _{old} \leftarrow \theta $
102
+ }
103
+ \caption {PPO, Actor-Critic Style}
104
+ \end {algorithm }

\iftoggle{questions}
{
\paragraph{Questions}
\begin{enumerate}
\item How does the clipping affect learning? We see higher variance towards the end of training in the plots.
\item Where does the objective come from (why divide the policy probabilities)?
\item Why conjugate gradients?
\item How does the clipping affect the gradients?
\end{enumerate}
}{
}