
Commit 77f1307

Merge pull request #2 from ML-KA/PPO
PPO
2 parents 5de9c73 + 3956c99 commit 77f1307

File tree

3 files changed: +79 −3 lines changed

ReinforcementLearning.tex (+54 −2)
@@ -7,6 +7,7 @@ \section{Bellman Equations}
 \newcommand{\vfunc}[1]{v_\pi (#1)}
 \newcommand{\policy}[2]{\pi (#1 | #2)}
 
+\todo{RL backup diagrams}
 TODO backup diagrams
 
 \subsection{State Value Function}
@@ -47,7 +48,7 @@ \subsection{Optimal Action State Value Function recursive}
 
 \section{Advantage Function}
 
-TODO
+\todo{Advantage Function Definition}
 
 
 \section{Policy, Policy Gradient}
@@ -61,4 +62,55 @@ \subsection{Policy Gradient}
 \begin{align}
 \nabla_\theta \policy{s}{a} = \policy{s}{a} \nabla_\theta \log \policy{s}{a}
 \end{align}
-Note: this is valid for all probability distributions (the policy is a distribution over actions given states). The gradient term on the right hand side is called score function. The derivation basically uses the "log-trick".
+Note: this holds for any probability distribution (the policy is a distribution over actions given states). The gradient term on the right-hand side is called the score function; the derivation uses the ``log-trick''.
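For reference, the identity above is just the chain rule applied to $\log$; a one-line sketch, written with the \policy macro defined earlier:

\begin{align}
\policy{s}{a} \nabla_\theta \log \policy{s}{a}
= \policy{s}{a} \, \frac{\nabla_\theta \policy{s}{a}}{\policy{s}{a}}
= \nabla_\theta \policy{s}{a}
\end{align}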
+
+
+\subsection{Proximal Policy Optimization (PPO)}
+\newcommand{\oldpolicy}[2]{\pi_{\theta_{\mathrm{old}}} ( {#1} | {#2})}
+\begin{enumerate}
+\item \url{https://arxiv.org/abs/1707.06347}.
+\end{enumerate}
+Clipped surrogate objective function:
+\begin{align}
+L^{\mathrm{CLIP}}(\theta)
+= \EE{
+\min\left( r_t(\theta) \hat{A_t},\ \mathrm{clip}(r_t(\theta), 1-\epsilon, 1+\epsilon) \hat{A_t} \right)
+}
+\end{align}
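Here $r_t(\theta)$ is the probability ratio between the current and the old policy (the paper's definition, written with the \oldpolicy macro defined above):

\begin{align}
r_t(\theta) = \frac{\pi_\theta(a_t \mid s_t)}{\oldpolicy{a_t}{s_t}}
\end{align}

With $\epsilon = 0.2$, for example, a positive-advantage term contributes at most $(1+\epsilon)\hat{A_t}$, so the objective gives no incentive to push the ratio beyond $1+\epsilon$ within a single policy update.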
+The full objective used in the paper adds further terms, including a value-function error term and an entropy bonus.
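For reference, the combined objective has, schematically, the form below (clipped term, value-function error weighted by $c_1$, entropy bonus $S$ weighted by $c_2$; $V_t^{\mathrm{targ}}$ is the value target):

\begin{align}
L^{\mathrm{CLIP+VF+S}}(\theta)
= \EE{ L^{\mathrm{CLIP}}_t(\theta)
- c_1 \left( V_\theta(s_t) - V_t^{\mathrm{targ}} \right)^2
+ c_2\, S[\pi_\theta](s_t) }
\end{align}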
+The advantage $\hat{A_t}$ in this case is
+\begin{align}
+\hat{A_t} &= -V(s_t) + r_t + \gamma r_{t+1} + \dots + \gamma^{T-t}V(s_T)
+\end{align}
+If the advantage is positive, the estimated value of the current state $s_t$ is smaller than the return you actually
+collected over the following time steps, i.e.\ the state-value estimate was too low.
+If the advantage is negative, the estimate was too high.
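A tiny numeric sketch with hypothetical values ($\gamma = 1$, one step to the end of the segment, $T = t+1$):

\begin{align}
V(s_t) = 1, \quad r_t = 2, \quad V(s_{t+1}) = 0.5
\quad\Rightarrow\quad
\hat{A_t} = -1 + 2 + 0.5 = 1.5 > 0,
\end{align}

so here the critic underestimated the state value.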
+In generalized form, using the temporal-difference (TD) error $\delta_t$:
+\begin{align}
+\hat{A_t} &= \delta_t + (\gamma \lambda)\delta_{t+1} + \dots + (\gamma\lambda)^{T-t+1}\delta_{T-1} \\
+\delta_t &= r_t + \gamma V(s_{t+1}) - V(s_t)
+\end{align}
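A sketch of why this generalizes the estimator above: for $\lambda = 1$ the TD errors telescope and the earlier expression is recovered,

\begin{align}
\sum_{l=0}^{T-t-1} \gamma^l \delta_{t+l}
&= \sum_{l=0}^{T-t-1} \gamma^l \left( r_{t+l} + \gamma V(s_{t+l+1}) - V(s_{t+l}) \right) \\
&= -V(s_t) + r_t + \gamma r_{t+1} + \dots + \gamma^{T-t-1} r_{T-1} + \gamma^{T-t} V(s_T).
\end{align}

Smaller $\lambda$ shortens the effective horizon of the estimator, trading variance for bias.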
+
+\begin{algorithm}[H]
+\For{iteration=1,2,\dots}{
+\For{actor=1,2,\dots,N}{
+Run policy $\pi_{\theta_{\mathrm{old}}}$ in environment for $T$ timesteps\\
+Compute advantage estimates $\hat{A_1}, \dots, \hat{A_T}$
+}
+Optimize surrogate $L$ wrt $\theta$, with $K$ epochs and minibatch size $M \leq NT$ \\
+$\theta_{\mathrm{old}} \leftarrow \theta$
+}
+\caption{PPO, Actor-Critic Style}
+\end{algorithm}
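To make the batch bookkeeping concrete (hypothetical values, e.g.\ $N = 8$ actors and $T = 128$ timesteps per actor): each iteration collects

\begin{align}
NT = 8 \times 128 = 1024 \ \text{transitions},
\end{align}

which are split into minibatches of size $M \leq 1024$ and reused for $K$ epochs before $\theta_{\mathrm{old}}$ is overwritten.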
+
+\iftoggle{questions}
+{
+\paragraph{Questions}
+\begin{enumerate}
+\item How does the clipping affect learning? We see higher variance towards the end of training in the plots.
+\item Where does the objective come from (why divide the two policies' probabilities)?
+\item Why conjugate gradients?
+\item How does the clipping affect the gradients?
+\end{enumerate}
+}{
+}

main.tex (+23 −0)
@@ -7,6 +7,29 @@
 %\usepackage{amsmath}
 \usepackage{mlka_math}
 
+%\usepackage[english]{babel}
+%\usepackage[utf8x]{inputenc}
+%\usepackage[T1]{fontenc}
+%\usepackage{tabularx}
+\usepackage{mlka_math}
+
+%\usepackage{amsthm}
+%% Useful packages
+
+\usepackage[colorinlistoftodos]{todonotes}
+\usepackage[]{algorithm2e}
+\usepackage{etoolbox}
+
+
+
+%\newtheorem{theorem}{Theorem}[section]
+%\newtheorem{corollary}{Corollary}[theorem]
+%\newtheorem{lemma}[theorem]{Lemma}
+
+\newtoggle{questions}
+\toggletrue{questions}
+%\toggletrue{questions}
+
 \begin{document}
 
 % the front matter

mlka_math.sty (+2 −1)
@@ -19,4 +19,5 @@
 \newcommand{\frnorm}[1]{\pnorm{#1}{fr}}
 
 
-\newcommand{\deriv}[2]{\frac{\partial {#1}}{\partial {#2}}}
+\newcommand{\deriv}[2]{\frac{\partial {#1}}{\partial {#2}}}
+\newcommand{\EE}[1]{\mathrm{E}\!\left\{{#1}\right\}}
