-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathEvolution-Strategies-2017.html
1448 lines (872 loc) · 51.4 KB
/
Evolution-Strategies-2017.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<!DOCTYPE html>
<html class="theme-next mist use-motion" lang="zh-Hans">
<head><meta name="generator" content="Hexo 3.8.0">
<meta name="google-site-verification" content="zu-9nWphPjrzXV8v514mkHknIz4dNfHlib56-KNAu44">
<meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<!-- maximum-scale=1 removed: it disables pinch-zoom on mobile, which fails WCAG 1.4.4 (Resize Text) -->
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="theme-color" content="#222">
<script src="/lib/pace/pace.min.js?v=1.0.2"></script>
<link href="/lib/pace/pace-theme-flash.min.css?v=1.0.2" rel="stylesheet">
<meta http-equiv="Cache-Control" content="no-transform">
<meta http-equiv="Cache-Control" content="no-siteapp">
<script>
// DaoVoice live-chat widget bootstrap: creates a stub `daovoice` function that
// queues calls until the real widget script (injected asynchronously below,
// protocol-matched to the current page) loads and drains the queue.
(function(i,s,o,g,r,a,m){i["DaoVoiceObject"]=r;i[r]=i[r]||function(){(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;a.charset="utf-8";m.parentNode.insertBefore(a,m)})(window,document,"script",('https:' == document.location.protocol ? 'https:' : 'http:') + "//widget.daovoice.io/widget/356f1943.js","daovoice")
// Initialize the widget with this site's DaoVoice application id.
daovoice('init', {
app_id: "356f1943"
});
// Signal a page view / state refresh to the widget.
daovoice('update');
</script>
<link href="/lib/fancybox/source/jquery.fancybox.css?v=2.1.5" rel="stylesheet" type="text/css">
<link href="/lib/font-awesome/css/font-awesome.min.css?v=4.6.2" rel="stylesheet" type="text/css">
<link href="/css/main.css?v=5.1.4" rel="stylesheet" type="text/css">
<link rel="apple-touch-icon" sizes="180x180" href="/images/apple-touch-icon.png?v=5.1.4">
<!-- fixed: href was "favicon-32x32png" (missing the dot before the extension), a 404 favicon -->
<link rel="icon" type="image/png" sizes="32x32" href="/images/favicon-32x32.png?v=5.1.4">
<link rel="icon" type="image/png" sizes="16x16" href="/images/favicon-16x16.png?v=5.1.4">
<link rel="mask-icon" href="/images/logo.svg?v=5.1.4" color="#222">
<meta name="keywords" content="rl,">
<link rel="alternate" href="/atom.xml" title="Keavnn'Blog" type="application/atom+xml">
<script>
(function(){
// Hexo post-password gate. The password literal is empty for this post, so the
// outer `if('')` is always falsy and the prompt below never runs (dead branch
// kept as generated by the theme).
if(''){
if (prompt('请输入文章密码','') !== ''){
alert('密码错误!');
history.back();
}
}
})();
</script>
<meta name="description" content="这一篇论文讲了强化学习算法的替代可解方案:进化策略。主要思想是对参数空间添加噪音而不是动作空间。 不推荐这篇论文: 公式没有详细推理,非常难懂 文中进化策略其实跟强化学习并没有特别大的关系 很多关于进化策略的性质、优势非常难懂,基本上都是文字解释,没有举例 文中措辞不难,但想要理解其本质非常难">
<meta name="keywords" content="rl">
<meta property="og:type" content="article">
<meta property="og:title" content="Evolution Strategies as a Scalable Alternative to Reinforcement Learning">
<meta property="og:url" content="http://StepNeverStop.github.io/Evolution-Strategies-2017.html">
<meta property="og:site_name" content="Keavnn'Blog">
<meta property="og:description" content="这一篇论文讲了强化学习算法的替代可解方案:进化策略。主要思想是对参数空间添加噪音而不是动作空间。 不推荐这篇论文: 公式没有详细推理,非常难懂 文中进化策略其实跟强化学习并没有特别大的关系 很多关于进化策略的性质、优势非常难懂,基本上都是文字解释,没有举例 文中措辞不难,但想要理解其本质非常难">
<meta property="og:locale" content="zh-Hans">
<meta property="og:image" content="http://stepneverstop.github.io/Evolution-Strategies-2017/algorithm1.png">
<meta property="og:image" content="http://stepneverstop.github.io/Evolution-Strategies-2017/algorithm2.png">
<meta property="og:image" content="http://stepneverstop.github.io/Evolution-Strategies-2017/mujoco.png">
<meta property="og:image" content="http://stepneverstop.github.io/Evolution-Strategies-2017/atari.png">
<meta property="og:image" content="http://stepneverstop.github.io/Evolution-Strategies-2017/parallelization.png">
<meta property="og:image" content="http://stepneverstop.github.io/Evolution-Strategies-2017/frame-skip.png">
<meta property="og:updated_time" content="2019-05-23T15:32:27.573Z">
<meta name="twitter:card" content="summary">
<meta name="twitter:title" content="Evolution Strategies as a Scalable Alternative to Reinforcement Learning">
<meta name="twitter:description" content="这一篇论文讲了强化学习算法的替代可解方案:进化策略。主要思想是对参数空间添加噪音而不是动作空间。 不推荐这篇论文: 公式没有详细推理,非常难懂 文中进化策略其实跟强化学习并没有特别大的关系 很多关于进化策略的性质、优势非常难懂,基本上都是文字解释,没有举例 文中措辞不难,但想要理解其本质非常难">
<meta name="twitter:image" content="http://stepneverstop.github.io/Evolution-Strategies-2017/algorithm1.png">
<script type="text/javascript" id="hexo.configurations">
// Global namespace object for the NexT theme's client-side scripts.
var NexT = window.NexT || {};
// Theme runtime configuration consumed by NexT's JS (sidebar, motion, search).
var CONFIG = {
root: '/',
scheme: 'Mist',
version: '5.1.4',
// Sidebar placement and behavior, including the scroll-percentage indicator.
sidebar: {"position":"left","display":"post","offset":12,"b2t":false,"scrollpercent":true,"onmobile":true},
fancybox: true,
tabs: true,
// Page-load animation transitions for post block, header, body and sidebar.
motion: {"enable":true,"async":true,"transition":{"post_block":"fadeIn","post_header":"slideDownIn","post_body":"slideDownIn","coll_header":"slideLeftIn","sidebar":"slideUpIn"}},
// Duoshuo comment-system identity used by the theme's comment integration.
duoshuo: {
userId: '0',
author: '博主'
},
// Algolia credentials are empty, so Algolia search is effectively disabled;
// the page's local-search popup (markup later in this file) is used instead.
algolia: {
applicationID: '',
apiKey: '',
indexName: '',
hits: {"per_page":10},
labels: {"input_placeholder":"Search for Posts","hits_empty":"We didn't find any results for the search: ${query}","hits_stats":"${hits} results found in ${time} ms"}
}
};
</script>
<link rel="canonical" href="http://StepNeverStop.github.io/Evolution-Strategies-2017.html">
<title>Evolution Strategies as a Scalable Alternative to Reinforcement Learning | Keavnn'Blog</title>
</head>
<body itemscope="" itemtype="http://schema.org/WebPage" lang="zh-Hans">
<div class="container sidebar-position-left page-post-detail">
<div class="headband"></div>
<a href="https://github.com/StepNeverStop" class="github-corner" aria-label="View source on GitHub" rel="external nofollow" target="_blank"><svg width="80" height="80" viewbox="0 0 250 250" style="fill:#151513; color:#fff; position: absolute; top: 0; border: 0; right: 0;" aria-hidden="true"><path d="M0,0 L115,115 L130,115 L142,142 L250,250 L250,0 Z"/><path d="M128.3,109.0 C113.8,99.7 119.0,89.6 119.0,89.6 C122.0,82.7 120.5,78.6 120.5,78.6 C119.2,72.0 123.4,76.3 123.4,76.3 C127.3,80.9 125.5,87.3 125.5,87.3 C122.9,97.6 130.6,101.9 134.4,103.2" fill="currentColor" style="transform-origin: 130px 106px;" class="octo-arm"/><path d="M115.0,115.0 C114.9,115.1 118.7,116.5 119.8,115.4 L133.7,101.6 C136.9,99.2 139.9,98.4 142.2,98.6 C133.8,88.0 127.5,74.4 143.8,58.0 C148.5,53.4 154.0,51.2 159.7,51.0 C160.3,49.4 163.2,43.6 171.4,40.1 C171.4,40.1 176.1,42.5 178.8,56.2 C183.1,58.6 187.2,61.8 190.9,65.4 C194.5,69.0 197.7,73.2 200.1,77.6 C213.8,80.2 216.3,84.9 216.3,84.9 C212.7,93.1 206.9,96.0 205.4,96.6 C205.1,102.4 203.0,107.8 198.3,112.5 C181.9,128.9 168.3,122.5 157.7,114.1 C157.9,116.9 156.7,120.9 152.7,124.9 L141.0,136.5 C139.8,137.7 141.6,141.9 141.8,141.8 Z" fill="currentColor" class="octo-body"/></svg></a><style>.github-corner:hover .octo-arm{animation:octocat-wave 560ms ease-in-out}@keyframes octocat-wave{0%,100%{transform:rotate(0)}20%,60%{transform:rotate(-25deg)}40%,80%{transform:rotate(10deg)}}@media (max-width:500px){.github-corner:hover .octo-arm{animation:none}.github-corner .octo-arm{animation:octocat-wave 560ms ease-in-out}}</style>
<header id="header" class="header" itemscope="" itemtype="http://schema.org/WPHeader">
<div class="header-inner"><div class="site-brand-wrapper">
<div class="site-meta ">
<div class="custom-logo-site-title">
<a href="/" class="brand" rel="start">
<span class="logo-line-before"><i></i></span>
<span class="site-title">Keavnn'Blog</span>
<span class="logo-line-after"><i></i></span>
</a>
</div>
<h1 class="site-subtitle" itemprop="description">If it is to be, it is up to me.</h1>
</div>
<div class="site-nav-toggle">
<button>
<span class="btn-bar"></span>
<span class="btn-bar"></span>
<span class="btn-bar"></span>
</button>
</div>
</div>
<nav class="site-nav">
<ul id="menu" class="menu">
<li class="menu-item menu-item-home">
<a href="/" rel="section">
<i class="menu-item-icon fa fa-fw fa-home"></i> <br>
首页
</a>
</li>
<li class="menu-item menu-item-about">
<a href="/about/" rel="section">
<i class="menu-item-icon fa fa-fw fa-user"></i> <br>
关于
</a>
</li>
<li class="menu-item menu-item-tags">
<a href="/tags/" rel="section">
<i class="menu-item-icon fa fa-fw fa-tags"></i> <br>
标签
</a>
</li>
<li class="menu-item menu-item-categories">
<a href="/categories/" rel="section">
<i class="menu-item-icon fa fa-fw fa-th"></i> <br>
分类
</a>
</li>
<li class="menu-item menu-item-archives">
<a href="/archives/" rel="section">
<i class="menu-item-icon fa fa-fw fa-archive"></i> <br>
归档
</a>
</li>
<li class="menu-item menu-item-search">
<a href="javascript:;" class="popup-trigger">
<i class="menu-item-icon fa fa-search fa-fw"></i> <br>
搜索
</a>
</li>
</ul>
<div class="site-search">
<div class="popup search-popup local-search-popup">
<div class="local-search-header clearfix">
<span class="search-icon">
<i class="fa fa-search"></i>
</span>
<span class="popup-btn-close">
<i class="fa fa-times-circle"></i>
</span>
<div class="local-search-input-wrapper">
<input autocomplete="off" placeholder="搜索..." spellcheck="false" type="text" id="local-search-input">
</div>
</div>
<div id="local-search-result"></div>
</div>
</div>
</nav>
</div>
</header>
<main id="main" class="main">
<div class="main-inner">
<div class="content-wrap">
<div id="content" class="content">
<div id="posts" class="posts-expand">
<article class="post post-type-normal" itemscope="" itemtype="http://schema.org/Article">
<div class="post-block">
<link itemprop="mainEntityOfPage" href="http://StepNeverStop.github.io/Evolution-Strategies-2017.html">
<span hidden itemprop="author" itemscope="" itemtype="http://schema.org/Person">
<meta itemprop="name" content="Keavnn">
<meta itemprop="description" content="">
<meta itemprop="image" content="/images/Kicon.jpg">
</span>
<span hidden itemprop="publisher" itemscope="" itemtype="http://schema.org/Organization">
<meta itemprop="name" content="Keavnn'Blog">
</span>
<header class="post-header">
<h2 class="post-title" itemprop="name headline">Evolution Strategies as a Scalable Alternative to Reinforcement Learning</h2>
<div class="post-meta">
<span class="post-time">
<span class="post-meta-item-icon">
<i class="fa fa-calendar-o"></i>
</span>
<span class="post-meta-item-text">发表于</span>
<time title="创建于" itemprop="dateCreated datePublished" datetime="2019-05-21T12:38:54+08:00">
2019-05-21
</time>
<span class="post-meta-divider">|</span>
<span class="post-meta-item-icon">
<i class="fa fa-calendar-check-o"></i>
</span>
<span class="post-meta-item-text">更新于:</span>
<time title="更新于" itemprop="dateModified" datetime="2019-05-23T23:32:27+08:00">
2019-05-23
</time>
</span>
<span class="post-category">
<span class="post-meta-divider">|</span>
<span class="post-meta-item-icon">
<i class="fa fa-folder-o"></i>
</span>
<span class="post-meta-item-text">分类于</span>
<span itemprop="about" itemscope="" itemtype="http://schema.org/Thing">
<a href="/categories/ReinforcementLearning/" itemprop="url" rel="index">
<span itemprop="name">ReinforcementLearning</span>
</a>
</span>
</span>
<div class="post-wordcount">
<span class="post-meta-item-icon">
<i class="fa fa-file-word-o"></i>
</span>
<span class="post-meta-item-text">字数统计:</span>
<span title="字数统计">
2.1k
</span>
<span class="post-meta-divider">|</span>
<span class="post-meta-item-icon">
<i class="fa fa-clock-o"></i>
</span>
<span class="post-meta-item-text">阅读时长 ≈</span>
<span title="阅读时长">
8
</span>
</div>
</div>
</header>
<div class="post-body" itemprop="articleBody">
<p>这一篇论文讲了强化学习算法的替代可解方案:进化策略。主要思想是对参数空间添加噪音而不是动作空间。</p>
<p>不推荐这篇论文:</p>
<ul>
<li>公式没有详细推理,非常难懂</li>
<li>文中进化策略其实跟强化学习并没有特别大的关系</li>
<li>很多关于进化策略的性质、优势非常难懂,基本上都是文字解释,没有举例</li>
<li>文中措辞不难,但想要理解其本质非常难</li>
</ul>
<a id="more"></a>
<h1 id="简介"><a href="#简介" class="headerlink" title="简介"></a>简介</h1><p>论文地址:<a href="https://arxiv.org/pdf/1703.03864.pdf" rel="external nofollow" target="_blank">https://arxiv.org/pdf/1703.03864.pdf</a></p>
<p>进化策略ES是一组/一类算法,而不是一个算法,它属于黑盒优化方法,它由自然进化中的启发式搜索过程而得来:每一代中都有突变的基因,环境对基因突变的效果给出适应性的判断,重组好的突变基因产生下一代,直到最优。</p>
<p>进化策略算法的划分主要有三个依据:基因如何表示(神经网络参数)、突变如何产生(参数优化过程)、基因如何重组(参数重组)。</p>
<p>进化策略ES这种方法通常分为<a href="https://pdfs.semanticscholar.org/dd17/8d3f30d801922c98cec9c2d90db05395f244.pdf?_ga=2.257341323.183297583.1558416128-1251761365.1555224483" rel="external nofollow" target="_blank">直接策略搜索</a>和<a href="https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=7307180&tag=1" rel="external nofollow" target="_blank">神经进化</a>,黑盒优化方法有很多很好的特性:</p>
<ol>
<li>不关心奖励分布,奖励密集或稀疏都无所谓</li>
<li>不需要反向传播梯度</li>
<li>tolerance of potentially arbitrarily long time horizons. 翻译为可以适应长期视野/回报,在长动作序列上有优势</li>
</ol>
<p>但是,进化策略ES往往不能解决像Q-Learning和PG这样可应用的难的强化学习问题,这篇论文旨在使进化策略可以解决DRL算法可解决的更难的问题。</p>
<h1 id="正文精要"><a href="#正文精要" class="headerlink" title="正文精要"></a>正文精要</h1><blockquote>
<p>A large source of difficulty in RL stems from the lack of informative gradients of policy performance: such gradients may not exist due to non-smoothness of the environment or policy, or may only be available as high-variance estimates because the environment usually can only be accessed via sampling. </p>
</blockquote>
<p>指出强化学习的难题在于缺乏策略性能的有效梯度:梯度可能由于环境不光滑而不存在、可能由于只能采样环境而存在高方差。</p>
<blockquote>
<p>For MDP-based reinforcement learning algorithms, on the other hand, it is well known that frameskip is a crucial parameter to get right for the optimization to succeed.</p>
</blockquote>
<p>对于基于MDP的强化学习算法,<strong>跳帧</strong>是算法优化的关键参数。</p>
<blockquote>
<p>It is common practice in RL to have the agent decide on its actions in a lower frequency than is used in the simulator that runs the environment.</p>
</blockquote>
<p>RL中通常的做法是,让智能体的决策频率低于运行环境的模拟器的频率。</p>
<hr>
<p>文中设定一个策略的期望奖励为:</p>
<script type="math/tex; mode=display">
\mathbb{E}_{\epsilon \sim N(0,I)}F(\theta+\sigma\epsilon)</script><p>关于网络参数$\theta$的导数为:</p>
<script type="math/tex; mode=display">
\nabla_{\theta}\mathbb{E}_{\epsilon \sim N(0,I)}F(\theta+\sigma\epsilon)=\frac{1}{\sigma}\mathbb{E}_{\epsilon \sim N(0,I)} \{F(\theta+\sigma\epsilon)\epsilon \}</script><p>其中,$\theta$为网络参数,也可以认为是多变量高斯分布的均值,$\sigma$为固定方差,$\epsilon$为扰动向量,由各向同性、方差均为1的多变量高斯分布采样得到。<strong>文中没有对该导数推导过程有介绍,好像是使用了Reinforce Trick的方法,但是却不知道具体如何推导出这个形式</strong>。</p>
<p>文中提到的算法1是对一个策略进行多次扰动,每扰动一次就与环境交互得到一个episode,最后只用各个扰动向量$\epsilon_{i}$与对应的回报$F(\theta)$相乘,根据该期望进行参数更新。</p>
<p><img src="./Evolution-Strategies-2017/algorithm1.png" alt=""></p>
<p>算法2是对算法1的并行化处理,设置相同的随机种子,假设n个worker:</p>
<ol>
<li>各个worker共用一个策略$\pi$</li>
<li>每个worker根据高斯分布采样得到扰动向量$\epsilon$</li>
<li>各个worker根据扰动后的策略参数采样一个episode</li>
<li>互相分发各自的回报</li>
<li><strong>再采样n个扰动向量$\epsilon$</strong>,使用梯度上升更新参数,然后分发策略</li>
</ol>
<p><img src="./Evolution-Strategies-2017/algorithm2.png" alt=""></p>
<p>文中后边提到,其实不必每次都从高斯分布中采样出扰动向量$\epsilon$,可以在开始训练前直接采样得到m个扰动向量,每次需要扰动向量时直接根据m的值生成一个随机数,取出以该随机数为下标的扰动向量即可。这么做可以减少更新时的时长消耗。</p>
<hr>
<blockquote>
<p>Experiments on Atari and MuJoCo show that it is a viable option with some attractive features: it is invariant to action frequency and delayed rewards, and it does not need temporal discounting or value function approximation. Most importantly, ES is highly parallelizable, which allows us to make up for a decreased data efficiency by scaling to more parallel workers. </p>
</blockquote>
<p>文中使用的进化策略ES的优点:</p>
<ul>
<li>与决策间隔无关,也就是对于跳帧间隔的设置鲁棒性很高</li>
<li>不关心延迟奖励</li>
<li>不需要折扣计算回报</li>
<li>不需要值函数近似</li>
<li>可以高度并行化,使我们能够通过扩展到更多并行训练节点来弥补数据效率的下降。</li>
</ul>
<h1 id="实验发现"><a href="#实验发现" class="headerlink" title="实验发现"></a>实验发现</h1><ol>
<li>使用<a href="https://arxiv.org/pdf/1606.03498.pdf" rel="external nofollow" target="_blank">Virtual Batch Normalization</a>和神经网络策略重参数(文中没有提到重参数的内容,只提到网络参数的影响)可以极大提升进化策略ES的可靠性。实验中,不使用这两种方法算法很“脆弱”,也就是不稳定。</li>
<li>进化策略ES可以高度并行化。通过引入一个基于通用随机数的新颖通讯策略,即使是1000个子节点也可以达到运行时间的线性加速。</li>
<li>进化策略ES的数据效率出奇的好。尽管相比A3C算法需要3-10倍的数据量,但是由于具有不需反向传播、没有值函数等特点,这些轻微的数据效率劣势可以被弥补。实验表明,相同计算量下,1小时ES与1天A3C的效果基本相同。</li>
<li>进化策略ES相比PG类算法的探索性更强。</li>
<li>进化策略ES的鲁棒性很好。多种不同训练环境可以使用同一组超参数。</li>
</ol>
<h1 id="实验结果"><a href="#实验结果" class="headerlink" title="实验结果"></a>实验结果</h1><h2 id="MuJoCo"><a href="#MuJoCo" class="headerlink" title="MuJoCo"></a>MuJoCo</h2><p>与<strong>高度优化</strong>的TRPO算法相比,ES在离散动作更有优势,因为连续动作在参数扰动方面可能过于平滑并且可能妨碍探索。</p>
<p>ES和TRPO的网络结构都是:输入层→64,tanh→64,tanh→输出层。</p>
<p>复杂环境如Hopper和Walker2d中,ES样本复杂性相比TRPO高不到10倍;简单场景中,相比低3倍。</p>
<p>TRPO训练500W步,ES训练至TRPO训练过程中各阶段效果所需步长的比例如表所示:</p>
<p><img src="./Evolution-Strategies-2017/mujoco.png" alt=""></p>
<p>虽然文中说是简单场景低三倍,其实根本就没有明确的低三倍,而且我对文中所提的简单场景复杂场景的划分也持怀疑态度。</p>
<h2 id="Atari"><a href="#Atari" class="headerlink" title="Atari"></a>Atari</h2><p>预处理、网络架构与Atari那篇论文的一模一样,用A3C使用3.2亿帧训练1天的结果与使用ES训练10亿帧的结果相同(保持计算量相同,因为ES不需要反向传播和值函数评估)。使用720块cpu,训练一个游戏只需1小时。</p>
<p>最终,纯图像输入下,与A3C相比,23个游戏ES胜,28个游戏A3C胜。</p>
<p><img src="./Evolution-Strategies-2017/atari.png" alt=""></p>
<h2 id="并行化-Parallelization"><a href="#并行化-Parallelization" class="headerlink" title="并行化 Parallelization"></a>并行化 Parallelization</h2><p>ES特别适合并行化,因为其通讯低带宽特性(只需各个worker的回报和随机种子)。</p>
<p>测试环境:3D Humanoid walking task</p>
<p>结果:单机18核需11小时,与最先进的强化学习算法性能相当,80台机器1440个CPU核心只需10分钟。</p>
<p><img src="./Evolution-Strategies-2017/parallelization.png" alt=""></p>
<p>随着核心数增加,训练性能线性加速。</p>
<h2 id="“跳帧”测试"><a href="#“跳帧”测试" class="headerlink" title="“跳帧”测试"></a>“跳帧”测试</h2><p>将强化学习在模拟环境中训练出的模型用于实际环境中时,通常需要降低其决策频率,也就是加大决策间隔。</p>
<p>如果跳帧设置过大,智能体所做的动作往往不够好,如果跳帧设置过小,会导致每个episode的步数过长,加大计算量,恶化训练过程(其实文中这么说并不严谨)。</p>
<p>ES的一个优势是梯度计算与回合长度无关,这间接增加了对跳帧间隔的鲁棒性。在Atari游戏Pong中使用四个不同跳帧间隔{1,2,3,4}的学习曲线如下:</p>
<p><img src="./Evolution-Strategies-2017/frame-skip.png" alt=""></p>
<p>由曲线可以看出,不同的跳帧间隔,训练效果差不多。<strong>但,我对该鲁棒性测试在复杂环境中的效果表示怀疑。我觉得前沿强化学习算法在该训练场景中使用不同的跳帧间隔也可以得到相同结果。</strong></p>
</div>
<div>
<div>
<div style="text-align:center;color: #ccc;font-size:14px;">-------------本文结束<i class="fa fa-heart"></i>感谢您的阅读-------------</div>
</div>
</div>
<div>
<div class="my_post_copyright">
<script src="//cdn.bootcss.com/clipboard.js/1.5.10/clipboard.min.js"></script>
<!-- JS库 sweetalert 可修改路径 -->
<script src="https://cdn.bootcss.com/jquery/2.0.0/jquery.min.js"></script>
<script src="https://unpkg.com/sweetalert/dist/sweetalert.min.js"></script>
<p><span>本文标题:</span><a href="/Evolution-Strategies-2017.html">Evolution Strategies as a Scalable Alternative to Reinforcement Learning</a></p>
<p><span>文章作者:</span><a href="/" title="访问 Keavnn 的个人博客">Keavnn</a></p>
<p><span>发布时间:</span>2019年05月21日 - 12:05</p>
<p><span>最后更新:</span>2019年05月23日 - 23:05</p>
<p><span>原始链接:</span><a href="/Evolution-Strategies-2017.html" title="Evolution Strategies as a Scalable Alternative to Reinforcement Learning">http://StepNeverStop.github.io/Evolution-Strategies-2017.html</a>
<span class="copy-path" title="点击复制文章链接"><i class="fa fa-clipboard" data-clipboard-text="http://StepNeverStop.github.io/Evolution-Strategies-2017.html" aria-label="复制成功!"></i></span>
</p>
<p><span>许可协议:</span><i class="fa fa-creative-commons"></i> <a rel="external nofollow" href="https://creativecommons.org/licenses/by-nc-sa/4.0/" target="_blank" title="Attribution-NonCommercial-NoDerivatives 4.0 International (CC BY-NC-ND 4.0)">署名-非商业性使用-相同方式共享 4.0 国际</a> 转载请保留原文链接及作者。</p>
</div>
<script>
var clipboard = new Clipboard('.fa-clipboard');
$(".fa-clipboard").click(function(){
clipboard.on('success', function(){
swal({
title: "",
text: '复制成功',
icon: "success",
showConfirmButton: true
});
});
});
</script>
</div>
<div>
<div style="padding: 10px 0; margin: 20px auto; width: 90%; text-align: center;">
<div>如果您获得了帮助,也可以资助一下小的啦~</div>
<!-- type="button" is explicit (default is "submit"); removed the invalid
     disable="enable" attribute — "disable" is not an HTML attribute and the
     button is meant to be interactive anyway -->
<button id="rewardButton" type="button" onclick="var qr = document.getElementById('QR'); if (qr.style.display === 'none') {qr.style.display='block';} else {qr.style.display='none'}">
<span>打赏啦</span>
</button>
<div id="QR" style="display: none;">
<div id="wechat" style="display: inline-block">
<img id="wechat_qr" src="/images/wechatpay.jpg" alt="Keavnn 微信">
<p>微信</p>
</div>
<div id="alipay" style="display: inline-block">
<img id="alipay_qr" src="/images/alipay.jpg" alt="Keavnn 支付宝">
<p>支付宝</p>
</div>
</div>
</div>
</div>
<footer class="post-footer">
<div class="post-tags">
<a href="/tags/rl/" rel="tag"> <i class="fa fa-tag"></i> rl</a>
</div>
<div class="post-nav">
<div class="post-nav-next post-nav-item">
<a href="/sarsa-and-q-learning.html" rel="next" title="SARSA and Q-Learning">
<i class="fa fa-chevron-left"></i> SARSA and Q-Learning
</a>
</div>
<span class="post-nav-divider"></span>
<div class="post-nav-prev post-nav-item">
<a href="/Prioritized-Experience-Replay.html" rel="prev" title="Prioritized Experience Replay">
Prioritized Experience Replay <i class="fa fa-chevron-right"></i>
</a>
</div>
</div>
</footer>
</div>
</article>
<div class="post-spread">
<!-- Go to www.addthis.com/dashboard to customize your tools -->
<div class="addthis_inline_share_toolbox">
<script type="text/javascript" src="//s7.addthis.com/js/300/addthis_widget.js#pubid=ra-5cefbfc88c13b0e7" async="async"></script>
</div>
</div>
</div>
</div>
<div class="comments" id="comments">
<div id="lv-container" data-id="city" data-uid="MTAyMC80MTk0NS8xODQ5MQ=="></div>
</div>
</div>
<div class="sidebar-toggle">
<div class="sidebar-toggle-line-wrap">
<span class="sidebar-toggle-line sidebar-toggle-line-first"></span>
<span class="sidebar-toggle-line sidebar-toggle-line-middle"></span>
<span class="sidebar-toggle-line sidebar-toggle-line-last"></span>
</div>
</div>
<aside id="sidebar" class="sidebar">
<div id="sidebar-dimmer"></div>
<div class="sidebar-inner">
<ul class="sidebar-nav motion-element">
<li class="sidebar-nav-toc sidebar-nav-active" data-target="post-toc-wrap">
文章目录
</li>
<li class="sidebar-nav-overview" data-target="site-overview-wrap">
站点概览
</li>
</ul>
<section class="site-overview-wrap sidebar-panel">
<div class="site-overview">
<div class="site-author motion-element" itemprop="author" itemscope="" itemtype="http://schema.org/Person">
<img class="site-author-image" itemprop="image" src="/images/Kicon.jpg" alt="Keavnn">
<p class="site-author-name" itemprop="name">Keavnn</p>
<p class="site-description motion-element" itemprop="description">If it is to be, it is up to me.</p>
</div>
<nav class="site-state motion-element">
<div class="site-state-item site-state-posts">
<a href="/archives/">
<span class="site-state-item-count">51</span>
<span class="site-state-item-name">日志</span>
</a>
</div>
<div class="site-state-item site-state-categories">
<a href="/categories/index.html">
<span class="site-state-item-count">11</span>
<span class="site-state-item-name">分类</span>
</a>
</div>
<div class="site-state-item site-state-tags">
<a href="/tags/index.html">
<span class="site-state-item-count">26</span>
<span class="site-state-item-name">标签</span>
</a>
</div>
</nav>
<div class="feed-link motion-element">
<a href="/atom.xml" rel="alternate">
<i class="fa fa-rss"></i>
RSS
</a>
</div>
<div class="links-of-author motion-element">
<span class="links-of-author-item">
<a href="https://github.com/StepNeverStop" target="_blank" title="GitHub" rel="external nofollow">
<i class="fa fa-fw fa-github"></i>GitHub</a>
</span>
<span class="links-of-author-item">
<a href="mailto:[email protected]" target="_blank" title="E-Mail" rel="external nofollow">
<i class="fa fa-fw fa-envelope"></i>E-Mail</a>
</span>
</div>
<div class="cc-license motion-element" itemprop="license">
<a href="https://creativecommons.org/licenses/by-nc-sa/4.0/" class="cc-opacity" target="_blank" rel="external nofollow">
<img src="/images/cc-by-nc-sa.svg" alt="Creative Commons">
</a>
</div>
<div class="links-of-blogroll motion-element links-of-blogroll-inline">
<div class="links-of-blogroll-title">
<i class="fa fa-fw fa-link"></i>
推荐阅读
</div>
<ul class="links-of-blogroll-list">
<li class="links-of-blogroll-item">
<a href="https://bluefisher.github.io" title="Fisher Chang" target="_blank" rel="external nofollow">Fisher Chang</a>
</li>
</ul>
</div>
</div>
</section>
<!--noindex-->
<section class="post-toc-wrap motion-element sidebar-panel sidebar-panel-active">
<div class="post-toc">
<div class="post-toc-content"><ol class="nav"><li class="nav-item nav-level-1"><a class="nav-link" href="#简介"><span class="nav-number">1.</span> <span class="nav-text">简介</span></a></li><li class="nav-item nav-level-1"><a class="nav-link" href="#正文精要"><span class="nav-number">2.</span> <span class="nav-text">正文精要</span></a></li><li class="nav-item nav-level-1"><a class="nav-link" href="#实验发现"><span class="nav-number">3.</span> <span class="nav-text">实验发现</span></a></li><li class="nav-item nav-level-1"><a class="nav-link" href="#实验结果"><span class="nav-number">4.</span> <span class="nav-text">实验结果</span></a><ol class="nav-child"><li class="nav-item nav-level-2"><a class="nav-link" href="#MuJoCo"><span class="nav-number">4.1.</span> <span class="nav-text">MuJoCo</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#Atari"><span class="nav-number">4.2.</span> <span class="nav-text">Atari</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#并行化-Parallelization"><span class="nav-number">4.3.</span> <span class="nav-text">并行化 Parallelization</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#“跳帧”测试"><span class="nav-number">4.4.</span> <span class="nav-text">“跳帧”测试</span></a></li></ol></li></ol></div>
</div>
</section>
<!--/noindex-->
</div>
</aside>
</div>
</main>
<footer id="footer" class="footer">
<div class="footer-inner">
<script async src="https://busuanzi.ibruce.info/busuanzi/2.3/busuanzi.pure.mini.js"></script>
<div class="copyright">© <span itemprop="copyrightYear">2020</span>
<span class="with-love">
<i class="fa fa-heart"></i>
</span>
<span class="author" itemprop="copyrightHolder">Keavnn</span>
<span class="post-meta-divider">|</span>
<span class="post-meta-item-icon">
<i class="fa fa-area-chart"></i>
</span>
<span class="post-meta-item-text">Site words total count:</span>
<span title="Site words total count">80.3k</span>
</div>
<div class="powered-by">
<i class="fa fa-user-md"></i><span id="busuanzi_container_site_pv">
本站总访问量<span id="busuanzi_value_site_pv"></span>次
</span>
</div>
<!-- <div class="theme-info">
<div class="powered-by"></div>
<span class="post-count">博客全站共80.3k字</span>
</div> -->
</div>
</footer>
<div class="back-to-top">
<i class="fa fa-arrow-up"></i>
<span id="scrollpercent"><span>0</span>%</span>
</div>
</div>
<script type="text/javascript">
// If window.Promise is not a native/callable function (e.g. a broken or partial
// shim), clear it to null so later scripts see it as absent rather than relying
// on a non-functional implementation.
if (Object.prototype.toString.call(window.Promise) !== '[object Function]') {
window.Promise = null;
}
</script>
<script type="text/javascript" src="/lib/jquery/index.js?v=2.1.3"></script>
<script type="text/javascript" src="/lib/fastclick/lib/fastclick.min.js?v=1.0.6"></script>
<script type="text/javascript" src="/lib/jquery_lazyload/jquery.lazyload.js?v=1.9.7"></script>
<script type="text/javascript" src="/lib/velocity/velocity.min.js?v=1.2.1"></script>
<script type="text/javascript" src="/lib/velocity/velocity.ui.min.js?v=1.2.1"></script>
<script type="text/javascript" src="/lib/fancybox/source/jquery.fancybox.pack.js?v=2.1.5"></script>