%This is "aamas2015_sample.tex", a revised version of aamas2014_sample.tex
% This file should be compiled with "aamas2015.cls"
% This example file demonstrates the use of the 'aamas2015.cls'
% LaTeX2e document class file. It is intended for those submitting
% articles to the AAMAS-2015 conference. This file is based on
% the sig-alternate.tex example file.
% The 'sig-alternate.cls' file of ACM will produce a similar-looking,
% albeit, 'tighter' paper resulting in, invariably, fewer pages
% than the original ACM style.
%
% ----------------------------------------------------------------------------------------------------------------
% This .tex file (and associated .cls ) produces:
% 1) The Permission Statement
% 2) The Conference (location) Info information
% 3) The Copyright Line with AAMAS data
% 4) NO page numbers
%
% as against the acm_proc_article-sp.cls file which
% DOES NOT produce 1) through 3) above.
%
% Using 'aamas2015.cls' you don't have control
% from within the source .tex file, over both the CopyrightYear
% (defaulted to 20XX) and the IFAAMAS Copyright Data
% (defaulted to X-XXXXX-XX-X/XX/XX).
% These information will be overwritten by fixed AAMAS 2015 information
% in the style files - it is NOT as you are used to with ACM style files.
%
% ---------------------------------------------------------------------------------------------------------------
% This .tex source is an example which *does* use
% the .bib file (from which the .bbl file is produced).
% REMEMBER HOWEVER: After having produced the .bbl file,
% and prior to final submission, you *NEED* to 'insert'
% your .bbl file into your source .tex file so as to provide
% ONE 'self-contained' source file.
%
\documentclass{aamas2015}
% if you are using PDF LaTeX and you cannot find a way for producing
% letter, the following explicit settings may help
\usepackage{pdfsync}
\usepackage{comment}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsbsy}
%\usepackage{amsthm}
\usepackage{calc}
\usepackage{color}
\usepackage{url}
\usepackage{xspace}
%\usepackage[caption=false]{subfig}
\usepackage{subfigure}
\usepackage{graphicx}
\usepackage{xcolor}
\usepackage{comment}
\usepackage{xspace}
\usepackage{tikz}
\usepackage{epsdice}
\usetikzlibrary{trees}
\usetikzlibrary{shapes.geometric}
\tikzset{
treenode/.style = {align=center, inner sep=0pt, font=\sffamily},
ma/.style = {draw,treenode, shape border rotate=90, isosceles triangle,isosceles triangle apex angle=60, black, minimum width=2mm},% arbre rouge noir, noeud noir
mi/.style = {ma, shape border rotate=-90},
ch/.style = {draw, treenode, circle, minimum width=2mm, black}
}
\tikzstyle{level 1}=[level distance=8mm, sibling distance=1.5cm]
\tikzstyle{level 2}=[level distance=8mm, sibling distance=1cm]
\tikzstyle{level 3}=[level distance=8mm, sibling distance=0.5cm]
\usepackage[algo2e, noend, noline, linesnumbered]{algorithm2e}
\providecommand{\SetAlgoLined}{\SetLine}
\providecommand{\DontPrintSemicolon}{\dontprintsemicolon}
\DontPrintSemicolon
\makeatletter
\newcommand{\pushline}{\Indp}% Indent
\newcommand{\popline}{\Indm}
\makeatother
% the note center!
\definecolor{darkgreen}{RGB}{0,125,0}
\newcounter{vlNoteCounter}
\newcounter{mlNoteCounter}
\newcounter{mbNoteCounter}
\newcommand{\vlnote}[1]{{\scriptsize \color{blue} $\blacksquare$ \refstepcounter{vlNoteCounter}\textsf{[VL]$_{\arabic{vlNoteCounter}}$:{#1}}}}
\newcommand{\mlnote}[1]{{\scriptsize \color{darkgreen} $\blacksquare$ \refstepcounter{mlNoteCounter}\textsf{[ML]$_{\arabic{mlNoteCounter}}$:{#1}}}}
\newcommand{\asnote}[1]{{\scriptsize \color{red} $\blacktriangle$ \refstepcounter{mbNoteCounter}\textsf{[MB]$_{\arabic{mbNoteCounter}}$:{#1}}}}
\renewcommand{\vlnote}[1]{}
\renewcommand{\mlnote}[1]{}
\renewcommand{\asnote}[1]{}
\newcommand{\argmin}{\operatornamewithlimits{argmin}}
\newcommand{\argmax}{\operatornamewithlimits{argmax}}
\newcommand{\bE}{\mathbb{E}}
\newcommand{\bx}{\mathbf{x}}
\newcommand{\bg}{\mathbf{g}}
\newcommand{\bu}{\mathbf{u}}
\newcommand{\bU}{\mathbf{U}}
\newcommand{\cI}{\mathcal{I}}
\newcommand{\cC}{\mathcal{C}}
\newcommand{\tta}{\mathtt{a}}
\newcommand{\ttm}{\mathtt{m}}
\newcommand{\PW}{\mbox{PW}}
\newcommand{\BR}{\mbox{BR}}
\newcommand{\defword}[1]{\textbf{\boldmath{#1}}}
\newcommand{\ie}{{\it i.e.}~}
\newcommand{\eg}{{\it e.g.}}
\newtheorem{definition}{Definition}
\newtheorem{fact}{Fact}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}
\newtheorem{lemma}{Lemma}
\newtheorem{proposition}{Proposition}
\newcommand{\Proof}{{\noindent\bf Proof. }}
\newcommand{\citejustyear}[1]{\cite{#1}}
\newcommand{\Qed}{$\blacksquare$}
\newcommand{\abs}[1]{\left|#1\right|}
\newcommand{\todo}[1]{{\color{red}{\bf #1}}}
\newcommand{\breturn}{{\bf return}\xspace}
\pdfpagewidth=8.5truein
\pdfpageheight=11truein
\begin{document}
% In the original styles from ACM, you would have needed to
% add meta-info here. This is not necessary for AAMAS 2015 as
% the complete copyright information is generated by the cls-files.
\title{Online Monte Carlo Counterfactual Regret Minimization\\ for Search in Imperfect Information Games}
% AUTHORS
% For initial submission, do not give author names, but the
% tracking number, instead, as the review process is blind.
% You need the command \numberofauthors to handle the 'placement
% and alignment' of the authors beneath the title.
%
% For aesthetic reasons, we recommend 'three authors at a time'
% i.e. three 'name/affiliation blocks' be placed beneath the title.
%
% NOTE: You are NOT restricted in how many 'rows' of
% "name/affiliations" may appear. We just ask that you restrict
% the number of 'columns' to three.
%
% Because of the available 'opening page real-estate'
% we ask you to refrain from putting more than six authors
% (two rows with three columns) beneath the article title.
% More than six makes the first-page appear very cluttered indeed.
%
% Use the \alignauthor commands to handle the names
% and affiliations for an 'aesthetic maximum' of six authors.
% Add names, affiliations, addresses for
% the seventh etc. author(s) as the argument for the
% \additionalauthors command.
% These 'additional authors' will be output/set for you
% without further effort on your part as the last section in
% the body of your article BEFORE References or any Appendices.
%\numberofauthors{8} % in this sample file, there are a *total*
% of EIGHT authors. SIX appear on the 'first-page' (for formatting
% reasons) and the remaining two appear in the \additionalauthors section.
%
\numberofauthors{3}
\author{
% You can go ahead and credit any number of authors here,
% e.g. one 'row of three' or two rows (consisting of one row of three
% and a second row of one, two or three).
%
% The command \alignauthor (no curly braces needed) should
% precede each author name, affiliation/snail-mail address and
% e-mail address. Additionally, tag each line of
% affiliation/address with \affaddr, and tag the
% e-mail address with \email.
% 1st. author
\alignauthor
Viliam Lis\'{y}\\
\affaddr{Dept. Computer Science, FEE}\\
\affaddr{Czech Technical University}\\
\affaddr{in Prague, Czech Republic}\\
\email{[email protected]}
% 2nd. author
\alignauthor
Marc Lanctot\titlenote{This author has a new affiliation: Google DeepMind, London, United Kingdom.}\\
\affaddr{Dept. Knowledge Engineering}\\
\affaddr{Maastricht University}\\
\affaddr{The Netherlands}\\
\email{[email protected]}
% 3rd. author
\alignauthor
Michael Bowling\\
\affaddr{Dept. Computing Science}\\
\affaddr{University of Alberta, Edmonton, Canada}\\
% \affaddr{Hekla, Iceland}\\
\email{[email protected]}
}
%\and % use '\and' if you need 'another row' of author names
% 4th. author
%\alignauthor Lawrence P. Leipuner\\
% \affaddr{Brookhaven Laboratories}\\
% \affaddr{Brookhaven National Lab}\\
% \affaddr{P.O. Box 5000}\\
% \email{[email protected]}
% 5th. author
%\alignauthor Sean Fogarty\\
% \affaddr{NASA Ames Research Center}\\
% \affaddr{Moffett Field}\\
% \affaddr{California 94035}\\
% \email{[email protected]}
% 6th. author
%\alignauthor Charles Palmer\\
% \affaddr{Palmer Research Laboratories}\\
% \affaddr{8600 Datapoint Drive}\\
% \affaddr{San Antonio, Texas 78229}\\
% \email{[email protected]}
%\and
%% 7th. author
%\alignauthor Lawrence P. Leipuner\\
% \affaddr{Brookhaven Laboratories}\\
% \affaddr{Brookhaven National Lab}\\
% \affaddr{P.O. Box 5000}\\
% \email{[email protected]}
%% 8th. author
%\alignauthor Sean Fogarty\\
% \affaddr{NASA Ames Research Center}\\
% \affaddr{Moffett Field}\\
% \affaddr{California 94035}\\
% \email{[email protected]}
%% 9th. author
%\alignauthor Charles Palmer\\
% \affaddr{Palmer Research Laboratories}\\
% \affaddr{8600 Datapoint Drive}\\
% \affaddr{San Antonio, Texas 78229}\\
% \email{[email protected]}
%}
%% There's nothing stopping you putting the seventh, eighth, etc.
%% author on the opening page (as the 'third row') but we ask,
%% for aesthetic reasons that you place these 'additional authors'
%% in the \additional authors block, viz.
%\additionalauthors{Additional authors: John Smith (The Th{\o}rv{\"a}ld Group,
%email: {\texttt{[email protected]}}) and Julius P.~Kumquat
%(The Kumquat Consortium, email: {\texttt{[email protected]}}).}
%\date{30 July 1999}
%% Just remember to make sure that the TOTAL number of authors
%% is the number that will appear on the first page PLUS the
%% number that will appear in the \additionalauthors section.
\maketitle
\begin{abstract}
Online search in games has been a core interest of artificial intelligence.
%Advances made in search for perfect information games
%have led to AI capable of defeating the world's top human experts.
Search in imperfect information games (\eg, Poker, Bridge, Skat) is particularly challenging due to
the complexities introduced by hidden information.
In this paper, we present Online Outcome Sampling (OOS),
an online search variant of Monte Carlo Counterfactual Regret Minimization that preserves its convergence to a Nash equilibrium.
We show that OOS can overcome the problem of non-locality encountered by previous search algorithms and perform well against its worst-case opponents.
We show that the exploitability of the strategies played by OOS decreases as
the search time increases, whereas the existing Information Set Monte Carlo Tree Search (ISMCTS) can become more exploitable over time.
In head-to-head play, given sufficient computation time per move, OOS outperforms ISMCTS in games where non-locality plays a significant role.
\end{abstract}
% Note that the category section should be completed after reference to the ACM Computing Classification Scheme available at
% http://www.acm.org/about/class/1998/.
\category{I.2.1}{Artificial Intelligence}{Applications and Expert Systems}[Games]
%\category{H.4}{Information Systems Applications}{Miscellaneous}
%A category including the fourth, optional field follows...
%\category{D.2.8}{Software Engineering}{Metrics}[complexity measures, performance measures]
%General terms should be selected from the following 16 terms: Algorithms, Management, Measurement, Documentation, Performance, Design, Economics, Reliability, Experimentation, Security, Human Factors, Standardization, Languages, Theory, Legal Aspects, Verification.
\terms{Algorithms}
%Keywords are your own choice of terms you would like the paper to be indexed by.
\keywords{Imperfect information games; online search; Nash equilibrium; Monte Carlo tree search; regret minimization}
\section{Introduction}
Algorithms for creating strong strategies in games have been a core interest of artificial intelligence research. Strong strategies are useful not only as research benchmarks and opponents in digital entertainment, but are also becoming increasingly important in real-world applications, such as critical infrastructure security~\cite{Tambe11}.
Because these games are often played against unknown opponents, a good strategy should be sufficiently robust against any possible counterstrategy used by the adversary. In strictly competitive games, the strategy that is optimal with respect to this criterion is the Nash equilibrium. It is guaranteed to provide the maximal possible expected value against any strategy of the opponent. For example, an equilibrium defense strategy minimizes the maximal damage caused by any attack.
When preparation time is abundant and the exact problem definition is known in advance, an equilibrium strategy for a smaller abstract game can be pre-computed and
then used during game play. This {\it offline approach} has been remarkably successful in
Computer Poker \cite{bowling2015heads,Gilpin09,Sandholm10The,Rubin11Poker,Johanson13Evaluating}.
%and it is also prevalent used in the security applications~\cite{XXX} \mlnote{Citation needed; also, we may be appealing a bit too much to the security domain given the content of the rest of the paper.. maybe just remove this?}. Removed for now.
However, the preparation time is often very limited. The model of the game may become known only shortly before acting is necessary, as in general game-playing, search or pursuit in a previously unknown environment, and general-purpose robotics.
In such circumstances, creating and solving an abstract representation of the problem may not be possible, and agents must {\it decide online}: make initial decisions quickly and then spend effort improving their strategy while the interaction is taking place.
In this paper, we propose a search algorithm that can compute an approximate Nash equilibrium (NE) strategy in two-player zero-sum games {\it online}, in a previously unknown game, and with a very limited computation time and space.
Online Outcome Sampling (OOS) is a simulation-based algorithm based on Monte Carlo Counterfactual Regret Minimization~(MCCFR)~\cite{Lanctot09Sampling}.
% (not sure we need this, and I'm sure reviewers already know) a popular offline equilibrium solving algorithm for this class of games.
%In order to use computational time and space efficiently,
OOS makes two novel modifications to MCCFR. First, OOS builds its search tree incrementally, like Monte Carlo tree search
(MCTS)~\cite{Coulom06Efficient,mctssurvey,UCT}. Second, when performing additional computation after some moves have been played in the match, subsequent samples are targeted primarily at the parts of the game tree that are more relevant to the current situation in the game.
We show that OOS is {\it consistent}, \ie it is guaranteed to converge to an equilibrium strategy as search time increases. To the best of our knowledge, this is not the case for any existing online game playing algorithm, all of which suffer from the problem of \emph{non-locality} in these games.
We compare convergence and head-to-head performance of OOS to Information Set
MCTS~\cite{Cowling12ISMCTS,Whitehouse13Integrating,Lisy14selection} in three fundamentally different games.
%To do so,
We develop a novel methodology for evaluating convergence of online algorithms that do not produce a complete strategy.
%but only select a move to play in each particular situation.
The results show that OOS, unlike ISMCTS, consistently converges close to an NE strategy in all three games. In head-to-head performance, given sufficient time, OOS either ties with or outperforms ISMCTS.
%Our results show that while ISMCTS performs beat shorter search times longer computation times allow OOS to tie or win the direct head-to-head matches.
% Merge-Nov15
% I merged in my modified intro of the text from above.
% My personal opinion is that these kind of paragraphs just waste space (if you name headings appropriately and the flow is structured properly, you don't need to explain the flow like this)
%In this section, we first overview the related work on search in imperfect information games and explain why none of the existing algorithms is guaranteed to converge to Nash equilibrium over time. Then we formally define extensive form games and explain in detail the problem of non-locality of optimal strategies in these games that prevent convergence of the existing search algorithms. We conclude this section by introducing Monte Carlo Counterfactual Regret Minimization, the offline equilibrium computation algorithm that is the basis of OOS.
% Merge-Nov15
% I prefer the title "Background and related work", and I made it a full section so we could follow with the background
%\subsection{Search in Imperfect Information Games}
% classic work: PIMC and its successes
\section{Background and Related Work}
Most imperfect information search algorithms are built upon perfect information search algorithms, such as minimax or Monte Carlo tree search (MCTS).
One of the first popular applications of imperfect information search was Ginsberg's Bridge-playing program, GIB~\cite{Ginsberg96Partition,Ginsberg01}.
In GIB, perfect information search is performed on a {\it determinized} instance of the game: one in which all players can see the normally hidden information.
This approach has also performed well in other games like Scrabble~\cite{Sheppard02World},
Hearts~\cite{Sturtevant08An}, and Skat~\cite{Buro09Improving}.
Several problems have been identified with techniques based on determinization, such as Perfect Information Monte Carlo
(PIMC)~\cite{Long10Understanding}.
Two commonly reported problems are {\it strategy fusion} and {\it non-locality}~\cite{Frank98Finding}.
Strategy fusion occurs when distinct actions are recommended in states that the player is not able to distinguish due to the imperfect information.
Strategy fusion can be overcome by imposing the proper information constraints during search~\cite{Frank98Finding,Ciancarini10Kriegspiel,Ponsen11Computing,Lisy12peg,Cowling12ISMCTS}.
Non-locality occurs due to optimal payoffs not being recursively defined over subgames as
in perfect information games. As a result, guarantees normally provided by search algorithms built on
subgame decomposition no longer hold.
Previous work has been done for accelerating offline equilibrium computation by solving subgames in end-game situations~\cite{Gilpin06Competitive,Gilpin07Better,Ganzfried13Endgame}. While these techniques tend to help in practice,
non-locality can prevent them from producing equilibrium strategies; we discuss this further in Section~\ref{sec:nonlocality}.
% Merge-Nov15; I kept all this (minor modifications)
The last problem %, sometimes disregarded by the algorithms for search in games,
is the need for randomized strategies in imperfect information games. In many games, any deterministic
strategy predictable by the opponent can lead to arbitrarily large losses.
To overcome this problem, MCTS algorithms sometimes randomize by sampling actions from their empirical frequency counts.
Evidence suggests that the resulting strategies do not converge to the optimal solution even in very
small imperfect information games like Biased Rock-Paper-Scissors and Kuhn poker~\cite{Shafiei09,Ponsen11Computing}.
Our results show that the empirical frequencies can even {\it diverge} from the equilibrium strategies.
Somewhat more promising are MCTS approaches that use alternative selection functions, such as Exp3 or Regret Matching \cite{Lisy14selection}.
Variants of MCTS with these selection functions have been shown both formally and empirically \cite{Lisy13nips,Lanctot13smmcts} to converge to Nash equilibrium in games with simultaneous moves, which are the simplest subclass of imperfect information games. However, we show that even these selection functions cannot help to overcome the problem of non-locality.%, if the search is initiated from the current state of the game, as done by all existing search algorithms.
\subsection{Extensive-Form Games}
We focus on two-player zero-sum extensive form games and base our notation on~\cite{OsbRub94}.
These games model sequential decision making by \defword{players} denoted
$i \in N = \{ 1, 2 \}$. In turn, players choose \defword{actions} leading to sequences called \defword{histories} $h \in H$.
A history $z \in Z$, where $Z \subseteq H$, is called a \defword{terminal history} and represents a full game from start to end.
At each terminal history $z$ there is a payoff $u_i(z)$
to each player $i$. At each nonterminal history $h$, there is a single
current player to act, determined by $P: H \backslash Z \rightarrow N \cup \{ c \}$ where $c$ is a special player called \defword{chance}
that plays with a fixed stochastic strategy. For example, chance is used to represent rolls of dice
and card draws. The game starts in the empty history $\emptyset$, and
at each step, given the current history $h$, the current player chooses an action $a \in A(h)$ leading to successor history $h' = ha$;
in this case we call $h$ a \defword{prefix} of $h'$ and denote this relationship by $h \sqsubset h'$. Also, for all $h,h',h'' \in H$,
if $h \sqsubset h'$ and $h' \sqsubset h''$ then $h \sqsubset h''$, and $h \sqsubset h$.
Sets $H$ and $A(h)$ are finite and histories have finite length.
%Define $\cI = \{ \cI_i~|~i \in N \}$ the set of information partitions.
Set $\cI_i$ is a partition of $H_i = \{ h~|~P(h) = i \}$ where each part is called an \defword{information set}.
Intuitively, an information set
$I \in \cI_i$ of player $i$ represents a state of the game with respect to what player $i$ knows.
Formally, $I$ is a set of histories that a player cannot tell apart (due to information hidden from that player). For all
$h,h' \in I$, $A(h) = A(h')$ and $P(h) = P(h')$; hence, we extend the definition to $A(I)$, $P(I)$, and denote $I(h)$ the information set
containing $h$.
A \defword{behavioral strategy} for player $i$ is a function mapping each information set $I \in \cI_i$
to a probability distribution over the actions $A(I)$, denoted by $\sigma_i(I)$.
For a profile $\sigma$, we denote the probability of reaching a terminal history $z$ under $\sigma$ as
$\pi^\sigma(z) = \prod_{i \in N\cup\{c\}} \pi^\sigma_i(z)$, where each $\pi^\sigma_i(z) = \prod_{ha \sqsubset z, P(h) = i} \sigma_i(I(h),a)$
is a product of probabilities of the actions taken by player $i$ along $z$.
We use $\pi^{\sigma}_i(h,z)$ and $\pi^{\sigma}(h,z)$ for $h \sqsubset z$ to refer to the product
of only the probabilities of actions along the sequence from the end of $h$ to the end of $z$.
Define $\Sigma_i$ to be the set of behavioral strategies for player $i$.
As is convention, $-i$ refers to player $i$'s opponent.
An \defword{$\epsilon$-Nash equilibrium}, $\sigma$, is a profile of strategies $\sigma_i$ for $i \in N$ such that the benefit of switching to some
alternative $\sigma_i'$ is limited by $\epsilon$, i.e.,
%\begin{equation}
%\label{eq:ne}
$\forall i \in N : \max_{\sigma_i' \in \Sigma_i} u_i(\sigma_i', \sigma_{-i}) - u_i(\sigma) \le \epsilon$.
When $\epsilon = 0$, the profile is called a Nash equilibrium.
When $|N| = 2$ and $u_1(z) + u_2(z) = k$ for all $z \in Z$, the game is constant-sum and strategically equivalent to a zero-sum game.
In these games, every equilibrium strategy yields
the same expected payoff against any opponent equilibrium strategy, and guarantees at least this payoff against any other opponent strategy.
The \defword{exploitability} of a profile $\sigma$ is the sum of the strategies' distances from an equilibrium, $\epsilon_{\sigma} =
\max_{\sigma_1' \in \Sigma_1} u_1(\sigma_1', \sigma_2) + \max_{\sigma_2' \in \Sigma_2} u_2(\sigma_1, \sigma_2')$.
In a \defword{match} (online game), each player is allowed little or no preparation time before playing (preventing the offline computation of approximate equilibrium solutions).
The current \defword{match history}, $\ttm \in H$, is initially the empty history $\emptyset$ representing the start of the match. Each turn,
player $P(\ttm)$ is given $t$ time units to decide on a \defword{match action} $\tta \in A(\ttm)$ and the
match history then changes from $\ttm$ to $\ttm\tta$. There is a referee who knows $\ttm$, samples chance outcomes from $\sigma_c(\ttm)$, and reveals $I(\ttm)$ to player $P(\ttm)$. The match terminates when $\ttm \in Z$, giving each player $i$ a payoff of $u_i(\ttm)$.
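For concreteness, the match protocol can be sketched as follows; this is an illustrative sketch only, and the interface (\texttt{game}, \texttt{players}, \texttt{choose\_action}, etc.) is a hypothetical placeholder rather than part of the formal model:
\begin{verbatim}
CHANCE = 0  # placeholder label for the chance player

def play_match(game, players, time_per_move):
    # The referee knows the full match history m, samples chance
    # outcomes, and reveals only I(m) to the player to act.
    m = game.empty_history()
    while not game.is_terminal(m):
        p = game.player_to_act(m)
        if p == CHANCE:
            a = game.sample_chance(m)
        else:
            I = game.information_set(m, p)
            a = players[p].choose_action(I, time_per_move)
        m = game.apply(m, a)
    return {i: game.payoff(m, i) for i in (1, 2)}
\end{verbatim}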
\subsection{The Problem of Non-Locality}
\label{sec:nonlocality}
\begin{figure}[t]
\begin{center}
%\begin{tikzpicture}
%\node [ch] {}
% child{ node [ma] (i1_1) {}
% child{ node {1}}
% child{ node {0}}
% edge from parent node[left] {0.5}
% }
% child{ node [ma] (i1_2) {}
% child{ node [mi] (i2_1) {}
% child{ node {3}}
% child{ node {0}}
% }
% child{ node [mi] (i2_2) {}
% child{ node {0}}
% child{ node {3}}
% }
% edge from parent node[right] {0.5}
% };
%\end{tikzpicture}
%~~~
\begin{tikzpicture}
\node [ch] {}
child{ node [ma] (i1_1) {}
child{ node {1}}
child{ node {0}}
edge from parent node[left] {0.5}
}
child{ node [ma] (i1_2) {}
child{ node [mi] (i2_1) {}
child{ node {3}}
child{ node {0}}
}
child{ node [mi] (i2_2) {}
child{ node {0}}
child{ node {3}}
}
edge from parent node[right] {0.5}
};
\draw [dashed] (i1_1) -- (i1_2);
\draw [dashed] (i2_1) -- (i2_2) node[midway, above] {I};
\end{tikzpicture}
\end{center}
\vskip-6mm
\caption{An extensive-form game demonstrating the problem of non-locality with maximizing $\bigtriangleup$, minimizing $\bigtriangledown$ and chance $\bigcirc$ players.
\label{fig:coordGame}}
\vskip-5mm
\end{figure}
% Merge-Nov15; as far as I can tell I merged in your version with my small addition at the end.
We demonstrate the problem of non-locality on the game in Figure~\ref{fig:coordGame}. It starts with a decision of chance $\bigcirc$, which leads to two nodes that cannot be distinguished by the maximizing player $\bigtriangleup$. On the left, the game ends after his action. On the right, both his actions lead to information set $I$ of the minimizing player $\bigtriangledown$, whose utility is minus the depicted value.
The optimal (Nash equilibrium) strategy in $I$ depends on the value of the leftmost leaf, even though this outcome of the game cannot be reached once the game has entered information set $I$.
Because $\bigtriangleup$ does not know at the time of his move whether $\bigtriangledown$ will play, he needs to randomize his decision so that $\bigtriangledown$ cannot easily make the move that is more harmful to $\bigtriangleup$. However, he would prefer to play left in case $\bigtriangledown$ does not play. Player $\bigtriangledown$ knows that $\bigtriangleup$ faces this dilemma and tries to exploit it as much as possible. A uniform strategy at $I$ clearly motivates $\bigtriangleup$ to play left, leading to expected utility $0.5\cdot 1 + 0.5\cdot1.5 = 1.25$, instead of $0.5\cdot 0 + 0.5\cdot1.5 = 0.75$.
If $\bigtriangledown$ plays right a little more often, $\bigtriangleup$ is still motivated to play left and $\bigtriangledown$ improves his payoff.
This holds until $\bigtriangledown$ plays (left,right) with probabilities ($\frac{1}{3}$,$\frac{2}{3}$),
which results in an expected reward of 1 for both actions of $\bigtriangleup$.
If $\bigtriangleup$ plays $(\frac{1}{2},\frac{1}{2})$, $\bigtriangledown$ cannot lower the utility anymore and the pair of strategies is an equilibrium.
If the leftmost leaf was 2 instead of 1, the optimal strategy in $I$ would be ($\frac{1}{6}$,$\frac{5}{6}$).
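For completeness, the equilibrium at $I$ can be verified by the indifference condition for $\bigtriangleup$: if $\bigtriangledown$ plays (left,right) with probabilities $(q,1-q)$, then
\begin{equation*}
\underbrace{\tfrac{1}{2}\cdot 1 + \tfrac{1}{2}\cdot 3q}_{\bigtriangleup\text{ plays left}}
\;=\;
\underbrace{\tfrac{1}{2}\cdot 0 + \tfrac{1}{2}\cdot 3(1-q)}_{\bigtriangleup\text{ plays right}}
\quad\Longrightarrow\quad q = \tfrac{1}{3},
\end{equation*}
and replacing the leftmost leaf by 2 changes the left-hand side to $1 + \tfrac{3}{2}q$, giving $q = \tfrac{1}{6}$.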
%The optimal (Nash equilibrium) strategy for the game on the right is obtained by solving $\max_{\sigma_1 \in \Sigma_1} \min_{\sigma_2 \in \Sigma_2} u_1(\sigma_1, \sigma_2)$, which gives $\bigtriangleup$ to play $(\frac{1}{2},\frac{1}{2})$ and for $\bigtriangledown$ to play (left,right) with probabilities ($\frac{1}{3}$,$\frac{2}{3}$). The expected value of the game is $1$.
Current approaches, after reaching the information set $I$, will repeatedly sample and search one of the subtrees rooted in $I$, aggregating the information collected to make a final recommendation.
%When hidden information is revealed for search, Russel and Norvig refer to this technique as ``averaging over clairvoyance''~\cite{russellnorvig}.
However, even if the information structure is kept intact, such as in ISMCTS~\cite{Cowling12ISMCTS}, the problem still occurs. If subtrees of $I$ are sampled equally often, a searching player will not have any preference between left and right and will recommend $(\frac{1}{2},\frac{1}{2})$, which is suboptimal.
% However, mixing uniformly at $I$ is not part of an equilibrium in this game. The payoff to $\bigtriangleup$ for playing right would be $\frac{1}{2}\cdot 0 + \frac{1}{2} \cdot \frac{3}{2} = \frac{3}{4}$, which would give $\bigtriangleup$ incentive to switch to play left more often (since its expected value is $\frac{5}{4}$), in turn giving $\bigtriangledown$ incentive to deviate.
Moreover, the uniform probability of being in each state of $I$ corresponds to the distribution over the states given the optimal play, so the problem occurs even if subtrees are sampled from the correct belief distribution.
% this is mentioned later (after the proof)
% and no search algorithm starting only from the current match history $\ttm$ without some extra information cannot converge to the optimal strategy.
Note that this is a simple example; in larger games, this problem could occur over much longer paths or many times in different parts of the game.
The analysis in~\cite{Long10Understanding} suggests that the effect may be critical in games with low {\it disambiguation factor}, where private information is slowly (or never) revealed throughout a match.
To the best of our knowledge, OOS is the first online search algorithm that solves this problem.
It does so by starting each sample from the root of the game. If the computed strategy in $I$ tends toward the uniform strategy, the updates in the maximizing player's information set will modify his strategy to choose left more often. Subsequent samples will then reach $I$ more often through the left state and consequently shift the strategy in $I$ in the right direction.
\subsection{Offline Equilibrium Approximation}
There are multiple algorithms for computing approximate equilibrium strategies offline~\cite{Sandholm10The}. We focus on counterfactual regret minimization, a popular choice among Poker researchers due to its sampling variants.
Counterfactual Regret (CFR) is a notion of regret at the information set level for extensive-form games~\cite{CFR}.
The CFR algorithm iteratively learns strategies in self-play, converging to an equilibrium.
The \defword{counterfactual value} of reaching information set $I$ is the expected payoff given that player $i$ played to reach $I$, the opponents played
$\sigma_{-i}$ and both players played $\sigma$ after $I$ was reached:
\begin{equation}
\label{eq:cfv}
v_i(I,\sigma) = \sum_{(h,z) \in Z_I} \pi^{\sigma}_{-i}(h) \pi^{\sigma}(h,z) u_i(z),
\end{equation}
where $Z_I = \{ (h,z)~|~z \in Z, h \in I, h \sqsubset z \}$.
Suppose, at time $t$, player $i$ plays with strategy $\sigma^t_i$.
Define $\sigma^t_{I \rightarrow a}$ as identical to $\sigma^t$ except at $I$ action $a$ is taken with probability $1$.
The counterfactual regret of not taking $a \in A(I)$ at time $t$ is $r^t(I,a) = v_i(I,\sigma^t_{I \rightarrow a}) - v_i(I,\sigma^t)$.
The algorithm maintains the cumulative regret $R^T(I,a) = \sum_{t=1}^T r^t(I,a)$, for every action at every information set.
Then, the distribution at each information set for the next iteration $\sigma^{T+1}(I)$ is obtained using
regret-matching~\cite{Hart00}. The distribution is proportional to the positive portion of the individual actions' regret:
\begin{equation*}
\label{eq:rm}
\sigma^{T+1}(I,a) = \left\{
\begin{array}{ll}
R^{T,+}(I,a) / R^{T,+}_{sum}(I) & \mbox{if } R^{T,+}_{sum}(I) > 0 \\
1 / |A(I)| & \mbox{otherwise,}
\end{array} \right.
\end{equation*}
where $x^+ = \max(0,x)$ for any term $x$, and $R^{T,+}_{sum}(I) = \sum_{a' \in A(I)} R^{T,+}(I,a')$. Furthermore, the algorithm maintains for each information set the average strategy profile
\begin{equation}
%\bar{\sigma}^T(I,a) = \frac{1}{T}\sum_{t=1}^T \sigma^t(I,a).
\bar{\sigma}^T(I,a) = \frac{\sum_{t=1}^T \pi^{\sigma^t}_i(I) \sigma^t(I,a)}{\sum_{t=1}^T \pi^{\sigma^t}_i(I)},
\end{equation}
where $\pi^{\sigma^t}_i(I) = \sum_{h \in I}\pi^{\sigma^t}_i(h)$.
The combination of the counterfactual regret minimizers in individual information sets also minimizes the overall average regret \cite{CFR}, and hence the average profile is a $2\epsilon$-equilibrium, with $\epsilon \rightarrow 0$
as $T \rightarrow \infty$.
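For concreteness, the regret-matching update and the accumulation of the average strategy can be sketched as follows (an illustrative Python sketch operating on dictionaries keyed by actions; it is not the implementation used in the experiments):
\begin{verbatim}
def regret_matching(cum_regret):
    # sigma^{T+1}(I): proportional to the positive part of the
    # cumulative regrets, uniform if no regret is positive
    positive = {a: max(0.0, r) for a, r in cum_regret.items()}
    total = sum(positive.values())
    if total > 0:
        return {a: r / total for a, r in positive.items()}
    return {a: 1.0 / len(cum_regret) for a in cum_regret}

def accumulate_average(avg_table, reach_i, sigma_I):
    # accumulate pi_i^{sigma^t}(I) * sigma^t(I,a); normalizing the
    # table afterwards yields the average strategy bar(sigma)^T(I)
    for a, p in sigma_I.items():
        avg_table[a] = avg_table.get(a, 0.0) + reach_i * p
\end{verbatim}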
Monte Carlo Counterfactual Regret Minimization (MCCFR) applies CFR to sampled portions of the games~\cite{Lanctot09Sampling}.
In the \defword{outcome sampling} (OS) variant of the algorithm, a single terminal history $z\in Z$ is sampled in each iteration.
The algorithm updates the regret in the information sets visited along $z$ using the
\defword{sampled counterfactual value},
\begin{equation*}
\tilde{v}_i(I,\sigma) = \left\{
\begin{array}{ll}
\frac{1}{q(z)} \pi^{\sigma}_{-i}(h) \pi^{\sigma}(h,z) u_i(z) & \mbox{if } (h,z) \in Z_I\\
0 & \mbox{otherwise,}
\end{array} \right.
\end{equation*}
where $q(z)$ is the probability of sampling $z$.
If every $z \in Z$ has a non-zero probability of being sampled, $\tilde{v}_i(I,\sigma)$ is an unbiased estimate of $v_i(I,\sigma)$
due to the importance sampling correction ($1/q(z)$). For this reason, applying CFR updates using these sampled counterfactual values
at the sampled information sets also eventually converges to an approximate equilibrium of the game with high probability.
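The unbiasedness claim can be verified in one line: taking the expectation over the sampled terminal history $z \sim q$,
\begin{equation*}
\bE_{z \sim q}\left[\tilde{v}_i(I,\sigma)\right]
= \sum_{(h,z) \in Z_I} q(z) \frac{1}{q(z)} \pi^{\sigma}_{-i}(h) \pi^{\sigma}(h,z) u_i(z)
= v_i(I,\sigma).
\end{equation*}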
%The required number of iterations for convergence is much larger, but each iteration is much faster.
%In Poker, CFR and MCCFR have been used with much success as offline methods
%for pre-computing approximate equilibria in abstract games~\cite{CFR,Johanson12CFRBR}; the same general
%approach has also been used in Liar's Dice~\cite{Neller11,Lanctot12IR}.
\section{Online Outcome Sampling}
When outcome sampling is used in the offline setting, data structures for all information sets are allocated before the first iteration. In each iteration, all sampled information sets get updated.
We make two essential modifications to adapt outcome sampling to the online setting.
{\bf Incremental Game Tree Building.} Before the match begins, only the very first (root) information set is added to memory.
In each iteration, a single information set (at most) is added to memory.
In particular, when an information set is reached that is not in memory, it is added to memory and then a default
playout policy (\eg, uniform random) takes over for the remainder of the simulation.
Along the playout portion (tail) of the simulation, information sets are not added to memory nor updated.
Along the tree portion (head), information sets are updated as normal.
This way, only the relevant information sets are stored in the memory.
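As an illustrative sketch (the interface \texttt{game}, \texttt{new\_tables}, \texttt{select\_action}, and \texttt{update\_tables} consists of hypothetical placeholders, not part of the pseudo-code given later), a single simulation with incremental tree building can be organized as follows:
\begin{verbatim}
import random

def simulate(game, h, memory, in_playout=False):
    # At most one new information set is added per simulation; below
    # it, a default uniform playout takes over and nothing is stored
    # or updated.
    if game.is_terminal(h):
        return game.payoff(h)
    if in_playout:
        a = random.choice(game.actions(h))
        return simulate(game, game.apply(h, a), memory, True)
    I = game.infoset_key(h)
    if I not in memory:
        memory[I] = new_tables(game.actions(h))  # add exactly one set
        a = random.choice(game.actions(h))       # then switch to playout
        v = simulate(game, game.apply(h, a), memory, True)
    else:
        a = select_action(memory[I], game.actions(h))  # tree (head) part
        v = simulate(game, game.apply(h, a), memory, False)
    update_tables(memory[I], a, v)               # head sets are updated
    return v
\end{verbatim}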
{\bf In-Match Search Targeting.}
Suppose several moves have been played since the start of the match leading to $\ttm$.
Plain outcome sampling would continue to sample from the root of the game (not the current match history $\ttm$), rarely reaching the region of the game space that the match has headed toward.
Hence, the second modification we propose is directing the search towards the histories that are more likely to occur in the current match.
Note that the complete history is typically unknown to the players, who know only their information sets.
Unlike ISMCTS, OOS always runs its samples from the root of the game tree, even with a non-empty match history.
We now describe two specific targeting methods demonstrated in Figure~\ref{fig:targeting}.
\begin{figure}[t]
\begin{center}
\includegraphics[scale=1]{fig/targeting}
\end{center}
\vskip-5mm
\caption{Example subgames targeted by IST (left) and PST (right). All histories $\{~h~|~h \in I' \cup I'' \cup I(\ttm_2)~\}$
share a common public action sequence ($p(\ttm_2)$). \label{fig:targeting}}
\end{figure}
\subsection{Information Set Targeting (IST)}
Suppose the match history is $\ttm$. IST samples histories reaching the current information set ($I(\ttm)$),
i.e., $(h,z) \in Z_{I(\ttm)}$, with higher probability than other histories.
The intuition is that these histories are particularly
relevant since the searching player {\it knows} that one of these $z$ will describe the match at its completion.
However, focusing {\it only} on these histories may cause problems because of non-locality, and the convergence guarantees would be lost.
Consider again the game in Figure~\ref{fig:coordGame}.
If the minimizing player knows she is in the information set $I$ and focuses all her search on this information set, she computes the suboptimal uniform strategy.
Any fixed non-zero probability of sampling the left chance action
eventually solves the problem. The regrets are multiplied by the reciprocal of the sampling probability; hence, they influence the strategy
in the information set proportionally more strongly when the samples are rare.
Note that previous methods, such as PIMC and ISMCTS, {\it never} update any information sets that are not reachable from $I(\ttm)$. In contrast, with IST {\it all} information sets in memory have a positive probability of being reached and updated.
\subsection{Public Subgame Targeting (PST)}
IST may ``reveal'' information about the current information set that is available to only one of the players, affecting the computed strategy (the effect vanishes in the limit due to consistency, but could matter in the short term). To prevent this effect, PST targets the search using only information available to both players.
A \defword{public action} is an action in the ``public tree'' defined in \cite{12aamas-pcs} and commonly used in offline CFR. Informally, an action is said to be public if it is observable by all players (e.g., bids in Liar's Dice and bets in Poker are public). Formally, an action $a$ is public iff
$\forall i, \forall I \in \cI_i, \forall h_1,h_2\in I: a\in h_1 \Leftrightarrow a\in h_2$.
For example, the extensive-form version of Rock, Paper, Scissors has two information
sets $I_1 = \emptyset$ and $I_2 = \{ r, p, s \}$; it has no public actions, because each history in
$I_2$ contains a single unique action (the unobserved ones taken by the first player).
Given a history $h$, let $p(h)$ be the sequence of public actions along $h$ in the same order that they were taken in $h$.
Define the \defword{public subgame} induced by $I$ to be the one whose terminal history set is
\begin{equation*}
Z_{p,I(h)} = \{(h',z)~|~z \in Z, h' \in H, p(h') = p(h), h' \sqsubset z \}.
\end{equation*}
With match history $\ttm$, PST samples $z \in Z_{p,I(\ttm)}$ with higher probability than terminal histories outside this set.
In a game of poker, suppose chance deals the private cards of the players,
the first player bets one chip, and the second player calls. At this point, the public actions are: bet one chip and call.
The public subgame $Z_{p,I(h)}$ contains every terminal history (including every combination of private chance
outcomes for all players) whose first two public actions are: bet one chip and call.
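As a concrete illustration (a minimal sketch in which histories are represented as lists of actions and \texttt{is\_public} is an assumed predicate), membership of a history in the targeted public subgame reduces to comparing public action sequences:
\begin{verbatim}
def public_seq(history, is_public):
    # p(h): the public actions along h, in the order they were taken
    return [a for a in history if is_public(a)]

def in_public_subgame(h, match_history, is_public):
    # h lies in the public subgame induced by the match history iff
    # it shares the same public action sequence
    return (public_seq(h, is_public)
            == public_seq(match_history, is_public))
\end{verbatim}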
\subsection{Algorithm}
\begin{algorithm2e}[t]
OOS$(h, \pi_i, \pi_{-i}, s_1, s_2, i)$: \;
\pushline
\If{$h \in Z$}{
\breturn $(1, \delta s_1 + (1-\delta)s_2, u_i(h))$\; \label{alg:terminal}
}
\ElsIf{$P(h) = c$}{
Sample an outcome $a$ and let $\rho_1,\rho_2$ be its probability in targeted and untargeted setting \; \label{alg:chancesample}
$(x, l, u) \gets $ OOS$(ha, \pi_i, \rho_2 \pi_{-i}, \rho_1 s_1 , \rho_2 s_2, i)$ \;
\breturn $(\rho_2 x, l, u)$ \;
}
$I \gets $ getInfoset$(h, P(h))$ \;
Let $(a,s_1',s_2') \leftarrow $ Sample$(h, I, i, \epsilon)$ \; \label{alg:sample}
\If{$I$ is not in memory}{
Add $I$ to memory \;
$\sigma(I) \leftarrow \mbox{Unif}(A(I))$ \;
$(x, l, u) \gets $ Playout$(ha, \frac{\delta s_1 + (1-\delta)s_2}{|A(I)|})$ \; \label{alg:playout}
}
\Else{
$\sigma(I) \gets $ RegretMatching$(r_I)$ \;
$\pi_{P(h)}' \gets \sigma(I,a)\pi_{P(h)}$ \; \label{alg:newreaches1}
$\pi_{-P(h)}' \gets \pi_{-P(h)}$ \; \label{alg:newreaches2}
$(x, l, u) \gets$ OOS$(ha, \pi_i', \pi_{-i}', s_1', s_2', i)$ \;
}
$c \gets x$ \; \label{alg:suffix1}
$x \gets x \sigma(I,a)$ \; \label{alg:suffix2}
\For{$a' \in A(I)$}{
\If{$P(h) = i$}{
$W \gets u \pi_{-i}~/~l$ \;
\If{$a' = a$}{
$r_I[a'] \gets r_I[a'] + (c - x)W$ \;
}
\Else{
$r_I[a'] \gets r_I[a'] - xW$ \;
}
}
\Else{
$s_I[a'] \gets s_I[a'] + \frac{1}{\delta s_1 + (1-\delta)s_2} \pi_{-i} \sigma(I,a')$ \; \label{alg:avgstrat}
}
}
\breturn $(x, l, u)$ \; \label{alg:returnend}
\popline
\vspace{0.1cm}
\caption{Online Outcome Sampling. \label{alg}}
\end{algorithm2e}
The algorithm iteratively samples a single trajectory from the root $\emptyset$ to a
terminal history. At each information set in memory, $I$, it maintains two tables: $r_I$ stores cumulative
regret for each action $a \in A(I)$, and $s_I$ stores the cumulative average strategy probability of each
action.
Depending on the targeting method (IST or PST), $Z_{sub}$ is one of $Z_{I(\ttm)}$ or $Z_{p,I(\ttm)}$.
The pseudo-code is presented as Algorithm~\ref{alg}.
Each iteration is represented by two calls of OOS where the update player $i \in \{1,2\}$ is alternated.
Before each iteration, a {\it scenario} is decided:
with probability $\delta$ the iteration targets the subgame and chooses $z \in Z_{sub}$
and with probability $(1-\delta)$ the usual OS sampling determines $z \in Z$.
As a result, OOS with $\delta=0$ becomes MCCFR with incremental tree building.
The first parameter of OOS is the current history.
The next two are the current strategy's reach probabilities for the update player $i$ and the opponent.
The following two parameters ($s_1$ and $s_2$) are initially the overall probabilities that the current sample is generated, one for each scenario: first the targeted and then the untargeted.
With non-empty match history, these two parameters include an additional weighting factor $w_T$ explained later.
The last parameter is the update player. Initial calls have the form OOS$(\emptyset, 1, 1, 1/w_T, 1/w_T, i)$.
For the return values, $x$ is a suffix/tail reach probability for both players,
$l$ is the root-to-leaf sample probability, and $u$ is the payoff of the trajectory in view
of the update player.
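The outer loop that wraps these calls can be sketched as follows (an illustrative sketch: \texttt{oos} stands for the recursive procedure of Algorithm~\ref{alg}, \texttt{root\_history} is a placeholder, and the chosen scenario is passed explicitly, whereas the pseudo-code treats it as decided before the call):
\begin{verbatim}
import random

def run_search(oos, root_history, w_T, delta, iterations):
    for _ in range(iterations):
        # decide the scenario: target Z_sub with probability delta
        targeted = random.random() < delta
        for i in (1, 2):  # alternate the update player
            # initial call OOS(empty history, 1, 1, 1/w_T, 1/w_T, i)
            oos(root_history, 1.0, 1.0, 1.0 / w_T, 1.0 / w_T, i,
                targeted=targeted)
\end{verbatim}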
The $\epsilon$-on-policy sampling distribution used at each information set
is defined as
\begin{equation*}
\label{eq:ossample}
\Phi(I,i) = \left\{
\begin{array}{ll}
\epsilon \cdot \mbox{Unif}(A(I)) + (1-\epsilon)\sigma_i(I) & \mbox{if } P(I) = i\\
\sigma_i(I) & \mbox{otherwise,}
\end{array} \right.
\end{equation*}
and we denote by $\Phi(I,i,a)$ the probability of sampling $a \in A(I)$.
The sampling at chance's choices on line~\ref{alg:chancesample} depends on the method and the scenario being used. For example, when using information set targeting, the outcome that is sampled must be consistent with match history.
A critical part of the algorithm is the choice of action and the sample reach updates on line~\ref{alg:sample}. If $I$ is not in memory, an action is sampled uniformly. Otherwise, in the targeted scenario, the current history $h$ is always in the targeted part of the game and an action from $\{a~|~\exists z\in Z \; (ha,z)\in Z_{sub}\}$ is selected using the distribution $\Phi(I(h),i)$ normalized to one on this subset of actions. If we define $sum=\sum_{a:\,\exists z\, (ha,z)\in Z_{sub}}\Phi(I,i,a)$, then $s_1' = s_1\Phi(I,i,a)/sum$. In the untargeted scenario, any action $a \sim \Phi(I,i)$ can be sampled. If the action does not leave the targeted part of the game (i.e., $\exists z\,(ha,z)\in Z_{sub}$), then $s_1' = s_1\Phi(I,i,a)/sum$; otherwise $s_1'=0$. In all cases, $s_2' = \Phi(I,i,a) s_2$.
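For clarity, this sampling step can be sketched as follows (an illustrative sketch: strategies and the distribution $\Phi$ are dictionaries over actions, and \texttt{in\_target} is an assumed indicator of whether an action keeps the history inside the targeted part of the game):
\begin{verbatim}
import random

def phi(sigma_I, acting_player, update_player, epsilon):
    # epsilon-on-policy sampling distribution Phi(I, i)
    n = len(sigma_I)
    if acting_player == update_player:
        return {a: epsilon / n + (1 - epsilon) * p
                for a, p in sigma_I.items()}
    return dict(sigma_I)

def sample_action(sigma_I, in_target, targeted, acting_player,
                  update_player, epsilon, s1, s2):
    # returns (a, s1', s2') as described above
    dist = phi(sigma_I, acting_player, update_player, epsilon)
    t_sum = sum(p for a, p in dist.items() if in_target[a])
    if targeted:
        # only actions staying in the targeted part, renormalized
        acts = [a for a in dist if in_target[a]]
        a = random.choices(acts, [dist[b] / t_sum for b in acts])[0]
        s1_new = s1 * dist[a] / t_sum
    else:
        acts = list(dist)
        a = random.choices(acts, [dist[b] for b in acts])[0]
        s1_new = s1 * dist[a] / t_sum if in_target[a] else 0.0
    return a, s1_new, dist[a] * s2
\end{verbatim}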
These sample reach probabilities are combined into one at a terminal history on line~\ref{alg:terminal}, at the
start of the playout on line~\ref{alg:playout}, and when updating the average
strategy on line~\ref{alg:avgstrat}.
The playout on line~\ref{alg:playout} samples to the end of the game with some playout policy at each step; we use uniform random,
but in general one could use an informed policy based on domain knowledge as well.
Unlike MCTS, the playout policy in OOS must compute $l$ when reaching a terminal and update the tail probability $x$ when returning
as done on line~\ref{alg:suffix2}. Lines~\ref{alg:newreaches1} and \ref{alg:newreaches2} modify $P(h)$'s reach probability
by multiplying it by $\sigma(I,a)$, keeping the other value the same.
Lines~\ref{alg:suffix1} to \ref{alg:avgstrat}
contain the usual outcome sampling updates. Note that regrets are updated at the
update player's histories, while the average strategy tables are updated at the opponent's histories. \\
% I am fine with not giving this a title, just a single line of whitespace
Now we explain the role of the weighting factor $w_T$. Note that the regret updates (through $W$) and the average strategy updates on line~\ref{alg:avgstrat} are multiplied by the reciprocal of the probability of sampling the sampled leaf and the current state, respectively.
Consider the updates in the current information set of the game, $I(\ttm)$. In the initial samples of the algorithm with an empty match history, this information set was on average sampled with a very low probability $s_0$. Say that out of $10^6$ samples started from the root, 1000 samples reached this particular information set. In that case, $s_0=0.001$ and the regret updates caused by each of these samples were multiplied by $\frac{1}{s_0}=1000$.
Now the game has actually reached the history $\ttm$ and, due to targeting, half of the next $10^6$ samples reach $I(\ttm)$. This means that $s_\ttm=0.5$ and the regrets will be multiplied only by 2.
As a result, the updates of the first (generally less precise) 1000 samples are together weighted the same as the $5\times 10^5$ later samples, which makes it almost impossible to compensate for the initial errors.
In order to prevent this effect, we add the weighting factor $w_T$ to compensate for the change in targeting and make each of the samples have a similar weight at $I(\ttm)$.
In our example, $w_T=\frac{s_\ttm}{s_0}$. More formally, when running the iterations from match history $\ttm$, we define the weighting factor as the probability of reaching $I(\ttm)$ without any targeting divided by the probability of reaching $I(\ttm)$ with the current targeting, assuming the players play according to the current mean strategy profile $\bar{\pi}$:
\vskip-6mm
\begin{equation}
\frac{1}{w_T(\ttm)} = (1-\delta) + \delta\frac{\sum_{h\in I(\ttm)} \bar{\pi}(h)}{\sum_{z\in Z_{sub(\ttm)}} \bar{\pi}(z)}.
\end{equation}
OOS would quickly stop improving a strategy in information sets that are below an irrational move of the opponent ($\pi_{-i}=0$). This cannot be avoided even if many samples are targeted to the information set. Therefore, we use a more explorative regret matching,
$\sigma^{T+1}_\gamma(I,a) = \gamma/|A(I)| + (1-\gamma) \sigma^{T+1}(I,a)$, with $\gamma = 0.01$.
This affects the worst-case regret bound at most by $\gamma$ (due to the linearity of the expected utility), but allows for much better play.
\subsubsection{Consistency}
\begin{theorem}
Let $\bar{\sigma}^t_m(\delta,\ttm)$ be a strategy produced by OOS with targeting scheme IST or PST using $\delta < 1$, started from $\ttm$ and run for $t$ iterations with exploration $\epsilon > 0$.
For any $p \in (0, 1]$ and $\varepsilon > 0$ there exists $t < \infty$ such that with
probability $1-p$ the strategy $\bar{\sigma}^t_m(\delta,\ttm)$ is an $\varepsilon$-equilibrium strategy.
\label{thm:consistency}
\end{theorem}
\begin{proof}(Sketch) Since each terminal history has a nonzero probability of being sampled, eventually every information
set will be contained in memory. The algorithm then becomes MCCFR with a non-uniform sampling scheme.
By \cite[Theorem 5]{Lanctot09Sampling}, OOS minimizes regret with high probability. The additional weighting of all updates of the algorithm by the same constant has no influence on the computed average strategy or on the strategies computed by regret matching, because regret matching uses only the relative proportions of the regrets.
\end{proof}
%Note that due to non-locality, this consistency property cannot hold generally for any search algorithm that does not modify $\sigma(I)$ at previous $I(h)$ such that $h \sqsubset \ttm$.
%However, it is an open question as to whether any of the previous algorithms could be modified to ensure consistency.
\section{Empirical Evaluation}
We now compare the exploitability and head-to-head
performance
of OOS and ISMCTS. % on three fundamentally different imperfect games. with different sources of imperfect information.
\subsection{Information Set Monte Carlo Tree Search}
We have implemented ISMCTS in the same framework with similar data representations as OOS. ISMCTS runs MCTS samples as in a perfect information game, but in the selection phase it uses common statistics computed for the whole information set rather than for individual states. When initiated from a non-empty match history, it starts its samples uniformly from the states in the current information set. We use two selection functions in our experiments. The first is the commonly used UCT, as suggested in \cite{Cowling12ISMCTS}. We use two times the maximal game outcome as the exploration parameter $C$. In matches, we select the action with the highest number of iterations. For evaluating convergence, this selection rule would often have the maximum possible exploitability in the evaluated games; therefore, for this part of the evaluation we use the distribution given by the empirical frequencies. The second is Regret Matching (RM), because it has been shown to empirically perform better in many games \cite{Lisy14selection} and because it is based on the same principles as CFR. For RM, we use exploration $0.2$, the mean strategy for evaluating exploitability, and samples from this strategy in head-to-head matches. In the following, we refer to ISMCTS with the corresponding selection function simply as UCT or RM.
\subsection{Evaluating Performance}\label{sec:evaluating}
In games like Poker and Liar's Dice, it is often critical to play in such a way that the opponent
cannot easily infer the private information. This partly explains
the success of CFR-based methods in the offline setting.
In the online setting, since the tree is built incrementally, only partial strategies
are produced.
We are unaware of any methods for assessing the worst-case exploitability of strategies
produced by an online search algorithm.
%We therefore propose two new methods to approximate the exploitability of the produced strategies.
% "full stitching"
%\begin{table}
%{\small
%\begin{center}
%\begin{tabular}{ccccc}
%Algorithm & Time & $\epsilon_1$ & $\epsilon_2$ & $\epsilon_\sigma$ \\
%\hline
%ISMCTS & 1s & 0.235 & 0.574 & 0.808 \\
%IST & 1s & 0.337 & 0.311 & 0.648 \\
%PST & 1s & 0.203 & 0.211 & 0.414 \\
%\hline
%ISMCTS & 5s & 0.251 & 0.548 & 0.799 \\
%IST & 5s & 0.229 & 0.295 & 0.524 \\
%PST & 5s & 0.148 & 0.125 & 0.273 \\
%\hline
%\end{tabular}
%\caption{LD(1,1) exploitability using full-stitching, $\delta = 0.9$.}
%\label{tbl:fullstitching}
%\end{center}
%}
%\end{table}
% I modified the text here; also included the results in here, but hard to shorten it further.
% On second thought: I am not sure we can include this in the end. The numbers look odd because
% they don't match with the rest. Leaving it out for now to save space. This might hurt us,
% we should think about what to do here.
%There is a brute-force method to compute exploitability of an online search algorithm.
%Enumerate every information set in the game in a topological order from the root, performing a $t$-second search
%for each one (sampling histories using belief distributions from previous searches closer to the root), making sure
%not to unfairly use any saved statistics other than the ones computed at searches from parent information sets.
%This simulates every possible way the game can be played, and takes $O(t |I|)$ seconds.
%The resulting exploitabilities in LD(6,1,1), with $\gamma = 0.9$, 1 second search time are
%$0.808$, $0.648$, and $0.414$ for ISMCTS, IST, and PST, respectively. We repeated the experiment with 5 seconds search time,
%and obtained $0.799$, $0.524$, and $0.273$, respectively, confirming that OOS results in lower exploitability as search time increases.
%However, since the it requires constantly saving and reloading memory states and running both sides separately to avoid
%bias from the opponent's algorithm, the experiment took several days and is impractical for larger games.
There is a brute-force approach that runs the search at each information set and computes the exploitability of a strategy composed of the strategies computed for these information sets. However, it requires $O(t\,|\cI|)$ seconds for a single run, where $|\cI|$ is the number of information sets in the game, and multiple runs are required for statistically significant results.
Instead, we propose an approximate multi-match \defword{aggregate method}.
It creates a global (accumulating) strategy data structure for each player.
Then, it runs a fixed number of matches (500 in our experiments) of the search algorithm against a random opponent. In each information set reached in the match, the information computed (visit counts in ISMCTS, $s_I$ in OOS) is added into the global strategy of the player who searched.
Besides the information in the current information set, the information computed in all successor information sets is also added, with a weight proportional to the number of visits of each successor information set.
If an information set is never reached directly in any match, this gives at least some estimate of how the strategy would behave there. If it is reached directly, the number of samples collected in the information set outweighs the less precise estimates from the cases where it is not. In information sets never added to the global strategy, a fixed random action is selected. The reported result is the exploitability of the global strategy.
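The following sketch illustrates one plausible implementation of the accumulation step described above (the data-structure and function names, as well as the exact normalization of the weights, are our assumptions for illustration and not the framework's API):
\begin{verbatim}
from collections import defaultdict

# information-set key -> action -> accumulated weight
global_strategy = defaultdict(lambda: defaultdict(float))

def accumulate(search_data, current_infoset):
    # search_data maps infoset keys to a visit count and per-action
    # statistics (visit counts for ISMCTS, s_I for OOS) computed by
    # one search invoked in current_infoset.
    root_visits = search_data[current_infoset]["visits"]
    for key, node in search_data.items():
        # The searched infoset is added with full weight; successor
        # infosets are down-weighted by how rarely the search visited
        # them, so a later search invoked directly in them dominates
        # these coarser estimates.
        weight = node["visits"] / root_visits
        for action, stat in node["stats"].items():
            global_strategy[key][action] += weight * stat
\end{verbatim}
Information sets that never receive any accumulated weight keep the fixed random action when the exploitability of the global strategy is computed.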
\subsection{Domains}
\textbf{Imperfect Information Goofspiel} In II-GS($N$), each player is
given a private hand of bid cards with values $0$ to $N-1$. A different
deck of $N$ point cards is placed face up in a stack.
In each round, both players bid for the top point card by secretly choosing a single card from their hand.
The highest bidder wins the point card and adds its value to their score; in the case of a tie, the point card is discarded.
This is repeated $N$ times and the player with the highest score wins.
In II-Goofspiel, the players only discover who won or lost a bid, but not which bid card the opponent played.
Also, we assume the point-card stack is strictly increasing: $0, 1, \ldots, N-1$.
This way, the game has no non-trivial chance nodes, all actions are private, and information sets have various sizes.
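As an illustration of the limited feedback in II-GS($N$), a single bidding round can be sketched as follows (names are ours; only the sign of the comparison is revealed to the players):
\begin{verbatim}
def resolve_bid(bid1, bid2, point_card, scores):
    # Both bid cards are chosen secretly; the point card goes to the
    # higher bidder and is discarded on a tie.
    if bid1 > bid2:
        scores[0] += point_card
    elif bid2 > bid1:
        scores[1] += point_card
    # The players observe only who won the bid (+1 / 0 / -1),
    # never the opponent's bid card.
    return (bid1 > bid2) - (bid1 < bid2)
\end{verbatim}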
\textbf{Liar's Dice} LD($D_1$,$D_2$), also known as Dudo, Perudo, and Bluff, is a dice-bidding game.
Each die has faces \epsdice{1} to \epsdice{5} and a star $\star$.
Each player $i$ rolls $D_i$ of these dice without showing them to their opponent.
Each round, players alternate by bidding on the outcome of all dice in play until one player ``calls liar'',
\ie claims that their opponent's latest bid does not hold.
If the bid holds, the calling player loses; otherwise, she wins.
A bid consists of a quantity of dice and a face value.
A face of $\star$ is considered wild and counts as matching any other face.
To bid, the player must increase either the quantity or face value of the current
bid (or both).
All actions in this game are public. The only hidden information comes from the chance rolls at the beginning of the game. Therefore, all information sets have the same size.
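For clarity, the resolution of a ``calls liar'' action can be sketched as follows (the dice encoding is our assumption; the treatment of bids on the wild face follows the matching rule stated above):
\begin{verbatim}
STAR = 0  # our encoding of the wild face; regular faces are 1-5

def bid_holds(quantity, face, all_dice):
    # A bid (quantity, face) holds if at least `quantity` dice show
    # `face`, where the wild star matches any face.
    if face == STAR:
        matching = sum(1 for d in all_dice if d == STAR)
    else:
        matching = sum(1 for d in all_dice if d in (face, STAR))
    return matching >= quantity
\end{verbatim}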
\textbf{Generic Poker} GP($T,C,R,B$) is a simplified poker game inspired by Leduc Hold'em. First, both players are required to put one chip in the pot.
Next, chance deals a single private card to each player, and the betting round begins.
A player can either \emph{fold} (the opponent wins the pot), \emph{check} (let the opponent make the next move), \emph{bet} (add chips to the pot as the first wager of the round), \emph{call} (add chips to the pot to match the opponent's last bet), or \emph{raise} (match and increase the opponent's bet).
If no further raise is made by any of the players, the betting round ends, chance deals one public card on the table, and a second betting round with the same rules begins.
After the second betting round ends, the outcome of the game is determined --- a player wins if: (1) her private card matches the table card and the opponent's card does not match, or (2) none of the players' cards matches the table card and her private card is higher than the private card of the opponent. If no player wins, the game is a draw and the pot is split.
The parameters of the game are the number of types of cards ($T$), the number of cards of each type ($C$), the maximum length of a sequence of raises in a betting round ($R$), and the number of different bet sizes ($B$), i.e., the amounts of chips added to the pot by \emph{bet}/\emph{raise} actions.
This game is similar to Liar's Dice in having only public actions. However, it includes additional chance nodes later in the game, which reveal part of the previously hidden information. Moreover, its outcomes are integer-valued rather than just win/draw/loss.
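A sketch of the showdown rule after the second betting round (function and variable names are illustrative; cards are compared by type only):
\begin{verbatim}
def showdown(private1, private2, table_card, pot):
    # Returns player 1's share of the pot according to rules (1)-(2).
    if private1 == table_card and private2 != table_card:
        return pot              # rule (1): player 1 wins
    if private2 == table_card and private1 != table_card:
        return 0                # rule (1) for player 2
    if private1 != table_card and private2 != table_card:
        if private1 > private2:
            return pot          # rule (2): higher private card wins
        if private2 > private1:
            return 0
    return pot / 2              # otherwise a draw: split the pot
\end{verbatim}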
\begin{figure*}[t]
\hskip0.3cm
\hfill
\includegraphics[height=3.5mm]{fig/legend_algs}
\hfill
\includegraphics[height=3.5mm]{fig/legend_oos}
\hspace{0.8cm}
\rotatebox{90}{\hskip-1.5cm II Goofspiel (6)}
\subfigure[Root]{\label{fig:GS-root}
\begin{minipage}{0.235\textwidth}
\includegraphics[width=\textwidth]{fig/convergence-IIGS_1_6}
\end{minipage}}
\subfigure[Aggregated]{\label{fig:GS-agg}
\begin{minipage}{0.235\textwidth}
\includegraphics[width=\textwidth]{fig/agg_convergence-IIGS_1_6}
\end{minipage}}
\subfigure[OOS-IST Params.]{\label{fig:GS-oos}
\begin{minipage}{0.235\textwidth}
\includegraphics[width=\textwidth]{fig/agg_OSS-IST-WC_convergence-IIGS_1_6}
\end{minipage}}
\begin{minipage}{0.235\textwidth}
\center
No public subgame
\end{minipage}
\rotatebox{90}{\hskip-1.5cm Liar's Dice (6,1,1)}
\subfigure[Root]{\label{fig:LD-root}
\begin{minipage}{0.235\textwidth}
\includegraphics[width=\textwidth]{fig/convergence-LD_6_1_1}
\end{minipage}}
\subfigure[Aggregated]{\label{fig:LD-agg}
\begin{minipage}{0.235\textwidth}
\includegraphics[width=\textwidth]{fig/agg_convergence-LD_6_1_1}
\end{minipage}}
\subfigure[OOS-IST Params.]{\label{fig:LD-IST-oos}
\begin{minipage}{0.235\textwidth}
\includegraphics[width=\textwidth]{fig/agg_OSS-IST-WC_convergence-LD_6_1_1}
\end{minipage}}
\subfigure[OOS-PST Params.]{\label{fig:LD-PST-oos}
\begin{minipage}{0.235\textwidth}
\includegraphics[width=\textwidth]{fig/agg_OSS-PST-WC_convergence-LD_6_1_1}
\end{minipage}}
\rotatebox{90}{\hskip-1.5cm Gen. Poker (3,3,2,2)}
\subfigure[Root]{\label{fig:GP-root}
\begin{minipage}{0.235\textwidth}
\includegraphics[width=\textwidth]{fig/convergence-GP_2_2_3_3}
\end{minipage}}
\subfigure[Aggregated]{\label{fig:GP-agg}
\begin{minipage}{0.235\textwidth}
\includegraphics[width=\textwidth]{fig/agg_convergence-GP_2_2_3_3}
\end{minipage}}
\subfigure[OOS-IST Params.]{\label{fig:GP-oos-ist}
\begin{minipage}{0.235\textwidth}
\includegraphics[width=\textwidth]{fig/agg_OSS-IST-WC_convergence-GP_2_2_3_3}
\end{minipage}}
\subfigure[OOS-PST Params.]{\label{fig:GP-oos-pst}
\begin{minipage}{0.235\textwidth}
\includegraphics[width=\textwidth]{fig/agg_OSS-PST-WC_convergence-GP_2_2_3_3}
\end{minipage}}
\caption{Exploitability of ISMCTS with various selection functions and of OOS with various parameter settings in the smaller game variants (rows).}\label{fig:small}
\end{figure*}
\subsection{Results}
We first focus on LD(1,1), II-GS(6), and GP(3,3,2,2). These games are small by the standards of search algorithms, which makes it possible to compute exact worst-case counterstrategies (i.e., best responses) and thus measure exploitability. After analyzing convergence, we evaluate head-to-head match performance, also in larger variants of the games.
\subsubsection{Convergence at the root}
First, we evaluate the speed of convergence of the proposed algorithm when run from the root of the games. The strategy in information sets not yet included in the tree is assumed to be a fixed random action, identical for all algorithms. The impact of the exploration parameter is not very strong here; we set it to $0.6$ for OOS. The results are shown in Figures~\ref{fig:small}(a,d,h). OOS clearly converges the fastest in all games and eventually converges to the exact equilibrium. The exact convergence is not surprising, because when run from the root, the algorithm is MCCFR with incremental tree building.
In all three games, the empirical action frequencies in UCT first converge quite quickly towards less exploitable strategies, but they always start to become more exploitable at some point. This does not happen with RM, but its initial convergence is slower.
\begin{figure*}[t]
\subfigure[II Goofspiel (6)]{\label{fig:GS-matches-small}
\begin{scriptsize}
\begin{tabular}{@{}|@{}r@{}|@{~}c@{~}c@{~}c@{~}c@{}|}\hline
0.1s&OOS&UCT&RM&RND\\\hline
OOS&49.3(2.7)&50.0(1.3)&50.5(1.3)&83.1(1.5)\\
UCT&51.2(2.2)&62.9(2.6)&62.7(2.5)&84.0(2.1)\\
RM&52.3(1.4)&70.6(1.7)&73.1(2.1)&87.8(1.4)\\
RND&17.1(2.2)&15.7(2.2)&10.3(1.8)&49.7(3.0)\\
\hline
\end{tabular}
\end{scriptsize}}
\subfigure[Liar's Dice (1,1)]{\label{fig:LD-matches-small}
\begin{scriptsize}
\begin{tabular}{@{}|@{}r@{}|@{~}c@{~}c@{~}c@{~}c@{}|}\hline
0.1s&OOS&UCT&RM&RND\\\hline
OOS&48.5(2.0)&51.7(2.0)&56.8(2.8)&83.3(1.5)\\
UCT&45.7(2.0)&49.7(2.0)&52.9(2.8)&87.2(1.3)\\
RM&47.9(2.8)&49.0(2.8)&54.3(2.8)&84.8(2.0)\\
RND&16.9(1.5)&19.5(1.6)&18.1(2.2)&47.7(2.0)\\
\hline
\end{tabular}
\end{scriptsize}}
\subfigure[Generic Poker (3,3,2,2)]{\label{fig:GP-matches-small}
\begin{scriptsize}
\begin{tabular}{@{}|@{}r@{}|@{\hspace{1pt}}c@{~}c@{~}c@{~}c@{}|}\hline
0.1s&OOS&UCT&RM&RND\\\hline
OOS&0.25(0.53)& -0.45(0.10)& -0.40(0.10)& 1.85(0.16)\\
UCT& 0.53(0.10)&0.21(0.15)&0.20(0.15)&1.35(0.36)\\
RM& 0.39(0.13)&0.21(0.40)&-0.12(0.32)&1.75(0.44)\\
RND& -2.17(0.20)&-2.22(0.54)&-3.24(0.55)&1.07(0.61)\\
\hline
\end{tabular}
\end{scriptsize}}
\subfigure[II Goofspiel (13)]{\label{fig:GS-matches-large}
\begin{scriptsize}
\begin{tabular}{@{}|@{}r@{}|@{~}c@{~}c@{~}c@{~}c@{}|}\hline
1s&OOS&UCT&RM&RND\\\hline
OOS&49.2(2.5)&28.3(3.9)&35.1(4.2)&83.0(3.2)\\
UCT&69.2(4.0)&70.0(2.8)&61.0(3.0)&90.6(1.8)\\
RM&67.5(4.0)&73.8(2.7)&67.5(2.9)&92.8(1.5)\\
RND&19.6(3.4)&6.2(1.5)&4.9(1.3)&49.0(3.0)\\
\hline
\end{tabular}
\end{scriptsize}}
\subfigure[Liar's Dice (2,2)]{\label{fig:LD-matches-large}
\begin{scriptsize}
\begin{tabular}{@{}|@{}r@{}|@{~}c@{~}c@{~}c@{~}c@{}|}\hline
5s&OOS&UCT&RM&RND\\\hline
OOS&49.9(3.7)&55.4(2.3)&53.3(3.7)&91.3(1.7)\\
UCT&49.8(2.2)&56.2(3.1)&49.8(3.1)&93.7(1.5)\\
RM&53.3(3.5)&51.7(4.0)&51.0(4.0)&88.7(2.0)\\