Commit 40c1c03

Refactor into a UCT base class and two child classes
The UCT base class contains the common logic for running the Monte Carlo Tree Search, while the subclasses customize which values are tracked (wins vs. points) and how they are displayed.
1 parent 01374f2 commit 40c1c03
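For context, a hedged usage sketch of the surface this commit leaves in place. `HighCard` below is an invented toy board, not part of this repository; its methods (`current_player`, `legal_actions`, `next_state`, `is_ended`, `win_values`) are simply the ones the diff below calls on the board object, and `update()`/`get_action()` are the player entry points kept by the base class.

# Hypothetical usage sketch -- HighCard is invented for illustration.
# Each player picks a number; the higher pick wins.  State is a tuple
# (player_to_move, pick1, pick2).
from mcts.uct import UCTWins

class HighCard(object):
    def starting_state(self):
        return (1, None, None)

    def current_player(self, state):
        return state[0]

    def legal_actions(self, history):
        return [] if self.is_ended(history) else [1, 2, 3]

    def next_state(self, state, action):
        to_move, p1, p2 = state
        return (2, action, p2) if to_move == 1 else (1, p1, action)

    def is_ended(self, history):
        _, p1, p2 = history[-1]
        return p1 is not None and p2 is not None

    def win_values(self, history):
        # Per the diff below: return a dict mapping each player to an
        # end-of-game value that gets back-propagated into the stats.
        _, p1, p2 = history[-1]
        if p1 == p2:
            return {1: 0.5, 2: 0.5}
        winner = 1 if p1 > p2 else 2
        return {winner: 1.0, 3 - winner: 0.0}

board = HighCard()
player = UCTWins(board, time=1)         # one second of simulations
player.update(board.starting_state())   # seed the search history
print player.get_action()               # should settle on the highest pick, 3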

File tree: 2 files changed, +36 −154 lines


mcts/uct.py (+34 −152)
@@ -13,7 +13,7 @@ def __init__(self, value=0, visits=0):
         self.visits = visits
 
 
-class MonteCarlo(object):
+class UCT(object):
     def __init__(self, board, **kwargs):
         self.board = board
         self.history = []
@@ -69,30 +69,13 @@ def get_action(self):
         print self.data['games'], self.data['time']
         print "Maximum depth searched:", self.max_depth
 
-        actions_states = [(p, self.board.next_state(state, p)) for p in legal]
-
-        # Display the stats for each possible action.
-        self.data['actions'] = sorted(
-            ({'action': p,
-              'percent': 100 * self.stats[(player, S)].value / self.stats[(player, S)].visits,
-              'wins': self.stats[(player, S)].value,
-              'plays': self.stats[(player, S)].visits}
-             for p, S in actions_states),
-            key=lambda x: (x['percent'], x['plays']),
-            reverse=True
-        )
+        # Store and display the stats for each possible action.
+        self.data['actions'] = self.calculate_action_values(state, player, legal)
         for m in self.data['actions']:
-            print "{action}: {percent:.2f}% ({wins} / {plays})".format(**m)
-
-        # Pick the action with the highest percentage of wins.
-        percent_wins, num_actions, action = max(
-            (self.stats[(player, S)].value / self.stats[(player, S)].visits,
-             self.stats[(player, S)].visits,
-             p)
-            for p, S in actions_states
-        )
+            print self.action_template.format(**m)
 
-        return action
+        # Pick the action with the highest average value.
+        return self.data['actions'][0]['action']
 
     def run_simulation(self):
         # Plays out a "random" game from the current position,
@@ -138,78 +121,49 @@ def run_simulation(self):
             visited_states.add((player, state))
 
             player = self.board.current_player(state)
-            winner = self.board.winner(history_copy)
-            if winner:
+            if self.board.is_ended(history_copy):
                 break
 
+        # Back-propagation
+        end_values = self.end_values(history_copy)
         for player, state in visited_states:
             if (player, state) not in stats:
                 continue
             S = stats[(player, state)]
             S.visits += 1
-            if player == winner:
-                S.value += 1
-
-
-class ValueMonteCarlo(object):
-    def __init__(self, board, **kwargs):
-        self.board = board
-        self.history = []
-        self.stats = {}
-
-        self.max_depth = 0
-        self.data = {}
-
-        self.calculation_time = float(kwargs.get('time', 30))
-        self.max_actions = int(kwargs.get('max_actions', 1000))
-
-        # Exploration constant, increase for more exploratory actions,
-        # decrease to prefer actions with known higher win rates.
-        self.C = float(kwargs.get('C', 1.4))
+            S.value += end_values[player]
 
-    def update(self, state):
-        self.history.append(state)
 
-    def display(self, state, action):
-        return self.board.display(state, action)
+class UCTWins(UCT):
+    action_template = "{action}: {percent:.2f}% ({wins} / {plays})"
 
-    def winner_message(self, msg):
-        return self.board.winner_message(msg)
-
-    def get_action(self):
-        # Causes the AI to calculate the best action from the
-        # current game state and return it.
-
-        self.max_depth = 0
-        self.data = {}
+    def __init__(self, board, **kwargs):
+        super(UCTWins, self).__init__(board, **kwargs)
+        self.end_values = board.win_values
 
-        state = self.history[-1]
-        player = self.board.current_player(state)
-        legal = self.board.legal_actions(self.history[:])
+    def calculate_action_values(self, state, player, legal):
+        actions_states = ((p, self.board.next_state(state, p)) for p in legal)
+        return sorted(
+            ({'action': p,
+              'percent': 100 * self.stats[(player, S)].value / self.stats[(player, S)].visits,
+              'wins': self.stats[(player, S)].value,
+              'plays': self.stats[(player, S)].visits}
+             for p, S in actions_states),
+            key=lambda x: (x['percent'], x['plays']),
+            reverse=True
+        )
 
-        # Bail out early if there is no real choice to be made.
-        if not legal:
-            return
-        if len(legal) == 1:
-            return legal[0]
 
-        games = 0
-        begin = time.time()
-        while time.time() - begin < self.calculation_time:
-            self.run_simulation()
-            games += 1
+class UCTValues(UCT):
+    action_template = "{action}: {average:.1f} ({sum} / {plays})"
 
-        # Display the number of calls of `run_simulation` and the
-        # time elapsed.
-        self.data.update(games=games, max_depth=self.max_depth,
-                         time=str(time.time() - begin))
-        print self.data['games'], self.data['time']
-        print "Maximum depth searched:", self.max_depth
-
-        actions_states = [(p, self.board.next_state(state, p)) for p in legal]
+    def __init__(self, board, **kwargs):
+        super(UCTValues, self).__init__(board, **kwargs)
+        self.end_values = board.points_values
 
-        # Display the stats for each possible action.
-        self.data['actions'] = sorted(
+    def calculate_action_values(self, state, player, legal):
+        actions_states = ((p, self.board.next_state(state, p)) for p in legal)
+        return sorted(
             ({'action': p,
              'average': self.stats[(player, S)].value / self.stats[(player, S)].visits,
              'sum': self.stats[(player, S)].value,
@@ -218,75 +172,3 @@ def get_action(self):
             key=lambda x: (x['average'], x['plays']),
             reverse=True
         )
-        for m in self.data['actions']:
-            print "{action}: {average:.1f} ({sum} / {plays})".format(**m)
-
-        # Pick the action with the highest average value.
-        average, num_actions, action = max(
-            (self.stats[(player, S)].value / self.stats[(player, S)].visits,
-             self.stats[(player, S)].visits,
-             p)
-            for p, S in actions_states
-        )
-
-        return action
-
-    def run_simulation(self):
-        # Plays out a "random" game from the current position,
-        # then updates the statistics tables with the result.
-
-        # A bit of an optimization here, so we have a local
-        # variable lookup instead of an attribute access each loop.
-        stats = self.stats
-
-        visited_states = set()
-        history_copy = self.history[:]
-        state = history_copy[-1]
-        player = self.board.current_player(state)
-
-        expand = True
-        for t in xrange(1, self.max_actions + 1):
-            legal = self.board.legal_actions(history_copy)
-            actions_states = [(p, self.board.next_state(state, p)) for p in legal]
-
-            if all((player, S) in stats for p, S in actions_states):
-                # If we have stats on all of the legal actions here, use UCB1.
-                log_total = log(
-                    sum(stats[(player, S)].visits for p, S in actions_states))
-                value, action, state = max(
-                    ((stats[(player, S)].value / stats[(player, S)].visits) +
-                     self.C * sqrt(log_total / stats[(player, S)].visits), p, S)
-                    for p, S in actions_states
-                )
-            else:
-                # Otherwise, just make an arbitrary decision.
-                action, state = choice(actions_states)
-
-            history_copy.append(state)
-
-            # `player` here and below refers to the player
-            # who moved into that particular state.
-            if expand and (player, state) not in stats:
-                expand = False
-                stats[(player, state)] = Stat()
-            if t > self.max_depth:
-                self.max_depth = t
-
-            visited_states.add((player, state))
-
-            player = self.board.current_player(state)
-            winner = self.board.winner(history_copy)
-            if winner:
-                break
-
-        player_values = {}
-        for player, state in visited_states:
-            if (player, state) not in stats:
-                continue
-            if player not in player_values:
-                player_values[player] = self.board.end_value(history_copy, player)
-
-            S = stats[(player, state)]
-            S.visits += 1
-            if player_values[player] is not None:
-                S.value += player_values[player]
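After this change the back-propagation step is simply S.value += end_values[player], with end_values bound to board.win_values for UCTWins and board.points_values for UCTValues; both therefore appear to return a dict keyed by player. A hedged sketch of the points-based variant, with an invented scoring helper:

# Hypothetical board method for the UCTValues variant; _final_scores is
# an invented helper returning {player: points}.  Handing back raw final
# scores makes the search maximize average points rather than win rate.
def points_values(self, history):
    scores = self._final_scores(history)
    return dict((p, float(s)) for p, s in scores.items())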

setup.py (+2 −2)
@@ -7,8 +7,8 @@
     author_email='[email protected]',
     packages=['mcts'],
     entry_points={
-        'jrb_board.players': ['jrb.mcts.uct = mcts.uct:MonteCarlo',
-                              'jrb.mcts.uctv = mcts.uct:ValueMonteCarlo'],
+        'jrb_board.players': ['jrb.mcts.uct = mcts.uct:UCTWins',
+                              'jrb.mcts.uctv = mcts.uct:UCTValues'],
     },
     license='LICENSE',
     description="An implementation of UCT Monte Carlo Tree Search.",
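The entry points register both classes as jrb_board.players plugins under their old names, so existing consumers keep working. A hedged sketch of how a host application could discover them (standard pkg_resources usage; the surrounding application and its board object are hypothetical):

# Standard pkg_resources lookup against the entry point group declared
# above; the host application and its board object are hypothetical.
import pkg_resources

players = {}
for ep in pkg_resources.iter_entry_points('jrb_board.players'):
    players[ep.name] = ep.load()    # e.g. 'jrb.mcts.uct' -> UCTWins

player = players['jrb.mcts.uct'](board, time=30)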
