\section{Bellman Equations}
\newcommand{\vfunc}[1]{v_\pi(#1)}
\newcommand{\policy}[2]{\pi(#1 | #2)}

\todo{RL backup diagrams}

\subsection{State Value Function}

\subsection{Optimal Action State Value Function recursive}

\section{Advantage Function}
\todo{Advantage Function Definition}

\section{Policy, Policy Gradient}

\subsection{Policy Gradient}
\begin{align}
\nabla_\theta \policy{s}{a} = \policy{s}{a} \nabla_\theta \log \policy{s}{a}
\end{align}
Note: this holds for any probability distribution (the policy is a distribution over actions given states). The gradient term on the right-hand side is called the score function. The derivation essentially uses the ``log-trick''.
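Spelled out (a one-line check added here for completeness), the log-trick is just the chain rule applied to $\log$:
\begin{align}
\nabla_\theta \log \policy{s}{a} = \frac{\nabla_\theta \policy{s}{a}}{\policy{s}{a}}
\quad\Longrightarrow\quad
\nabla_\theta \policy{s}{a} = \policy{s}{a} \nabla_\theta \log \policy{s}{a}
\end{align}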

\subsection{Proximal Policy Optimization (PPO)}
\newcommand{\oldpolicy}[2]{\pi_{\theta_{\mathrm{old}}}({#1} | {#2})}
\begin{enumerate}
\item \url{https://arxiv.org/abs/1707.06347}.
\end{enumerate}
Clipped surrogate objective:
\begin{align}
L^{\mathrm{CLIP}}(\theta)
= \EE{
\min\bigl(r_t(\theta)\hat{A}_t,\ \mathrm{clip}(r_t(\theta), 1-\epsilon, 1+\epsilon)\hat{A}_t\bigr)
}
\end{align}
where $r_t(\theta) = \pi_\theta(a_t | s_t) / \oldpolicy{a_t}{s_t}$ is the probability ratio between the current and the old policy.
The full objective in the paper adds further terms, in particular a value-function error term and an entropy bonus.
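As an illustration only (not from the paper's code; the function and argument names are mine), a NumPy sketch of $L^{\mathrm{CLIP}}$, assuming per-timestep log-probabilities under the new and old policy and advantage estimates are already available as arrays:
\begin{verbatim}
import numpy as np

def clipped_surrogate(logp_new, logp_old, advantages, eps=0.2):
    # r_t(theta) = pi_theta(a_t|s_t) / pi_theta_old(a_t|s_t)
    ratio = np.exp(logp_new - logp_old)
    unclipped = ratio * advantages
    clipped = np.clip(ratio, 1.0 - eps, 1.0 + eps) * advantages
    # elementwise min, then empirical mean over timesteps
    return np.mean(np.minimum(unclipped, clipped))
\end{verbatim}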
The advantage $\hat{A}_t$ in this case means
\begin{align}
\hat{A}_t &= -V(s_t) + r_t + \gamma r_{t+1} + \dots + \gamma^{T-t} V(s_T)
\end{align}
If the advantage is positive, the estimated value of the current state $s_t$ is smaller than the return you actually collected over the following time steps, i.e.\ the estimated state value was too low.
If the advantage is negative, the estimate was too high.
In generalized form, using the temporal-difference residual $\delta_t$:
\begin{align}
\hat{A}_t &= \delta_t + (\gamma\lambda)\delta_{t+1} + \dots + (\gamma\lambda)^{T-t+1}\delta_{T-1} \\
\delta_t &= r_t + \gamma V(s_{t+1}) - V(s_t)
\end{align}
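The same estimate can be computed in one backward pass over a trajectory segment via the recursion $\hat{A}_t = \delta_t + \gamma\lambda\hat{A}_{t+1}$. A sketch (assuming a single non-terminating segment; names and the default $\gamma$, $\lambda$ values are mine):
\begin{verbatim}
import numpy as np

def gae_advantages(rewards, values, gamma=0.99, lam=0.95):
    # rewards: r_0..r_{T-1}; values: V(s_0)..V(s_T) (one extra bootstrap entry)
    rewards, values = np.asarray(rewards), np.asarray(values)
    deltas = rewards + gamma * values[1:] - values[:-1]   # delta_t
    advantages = np.zeros(len(rewards))
    gae = 0.0
    for t in reversed(range(len(rewards))):
        gae = deltas[t] + gamma * lam * gae
        advantages[t] = gae
    return advantages
\end{verbatim}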
93
+
94
+ \begin {algorithm }[H]
95
+ \For {iteration=1,2,\dots }{
96
+ \For {actor=1,2, \dots }{
97
+ Run policy $ \pi _{\theta _{\mathrm {old}}}$ in environment for $ T$ timesteps\\
98
+ Compute advantage estimates $ \hat {A_1}, \dots , \hat {A_t}$
99
+ }
100
+ Optimize surrogate $ L$ wrt $ \theta $ , with $ K$ epochs and minibatch size $ M \leq NT$ \\
101
+ $ \theta _{old} \leftarrow \theta $
102
+ }
103
+ \caption {PPO, Actor-Critic Style}
104
+ \end {algorithm }

\iftoggle{questions}
{
\paragraph{Questions}
\begin{enumerate}
\item How does the clipping affect learning? We see higher variance towards the end of training in the plots.
\item Where does the objective come from (why divide the policy probabilities)?
\item Why conjugate gradients?
\item How does the clipping affect the gradients?
\end{enumerate}
}{
}