diff --git a/.github/workflows/action.yaml b/.github/workflows/action.yaml
index 4b511d92..94c8785a 100644
--- a/.github/workflows/action.yaml
+++ b/.github/workflows/action.yaml
@@ -12,7 +12,7 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-latest, macos-latest, windows-latest]
-        python-version: [3.7, 3.8, 3.9]
+        python-version: ['3.8', '3.9', '3.10']
     steps:
       - name: Checkout
         uses: actions/checkout@v2
@@ -24,7 +24,6 @@ jobs:
         run: |
           python -m pip install --upgrade pip
           pip install -r requirements.txt
-          pip install -r handyrl/envs/kaggle/requirements.txt
       - name: pytest
         run: |
           python -m pytest tests
diff --git a/handyrl/agent.py b/handyrl/agent.py
index 86d2c08e..44f503ab 100755
--- a/handyrl/agent.py
+++ b/handyrl/agent.py
@@ -24,7 +24,7 @@ def observe(self, env, player, show=False):

 class RuleBasedAgent(RandomAgent):
     def __init__(self, key=None):
-        self.key = None
+        self.key = key

     def action(self, env, player, show=False):
         if hasattr(env, 'rule_based_action'):
diff --git a/handyrl/connection.py b/handyrl/connection.py
index 49a176ee..8b01d555 100755
--- a/handyrl/connection.py
+++ b/handyrl/connection.py
@@ -29,9 +29,6 @@ def close(self):
         self.conn.close()
         self.conn = None

-    def fileno(self):
-        return self.conn.fileno()
-
     def _recv(self, size):
         buf = io.BytesIO()
         while size > 0:
diff --git a/handyrl/envs/geister.py b/handyrl/envs/geister.py
index a82bd6af..f66d9741 100755
--- a/handyrl/envs/geister.py
+++ b/handyrl/envs/geister.py
@@ -131,7 +131,8 @@ class GeisterNet(nn.Module):
     def __init__(self):
         super().__init__()

-        layers, filters, p_filters = 3, 32, 8
+        layers, filters = 3, 32
+        p_filters, v_filters = 8, 2
         input_channels = 7 + 18  # board channels + scalar inputs
         self.input_size = (input_channels, 6, 6)

@@ -139,10 +140,10 @@ def __init__(self):
         self.bn1 = nn.BatchNorm2d(filters)
         self.body = DRC(layers, filters, filters)

-        self.head_p_move = Conv2dHead((filters * 2, 6, 6), p_filters, 4)
+        self.head_p_move = Conv2dHead((filters, 6, 6), p_filters, 4)
         self.head_p_set = nn.Linear(1, 70, bias=True)
-        self.head_v = ScalarHead((filters * 2, 6, 6), 1, 1)
-        self.head_r = ScalarHead((filters * 2, 6, 6), 1, 1)
+        self.head_v = ScalarHead((filters, 6, 6), v_filters, 1)
+        self.head_r = ScalarHead((filters, 6, 6), v_filters, 1)

     def init_hidden(self, batch_size=[]):
         return self.body.init_hidden(self.input_size[1:], batch_size)
@@ -154,7 +155,6 @@ def forward(self, x, hidden):

         h_e = F.relu(self.bn1(self.conv1(h)))
         h, hidden = self.body(h_e, hidden, num_repeats=3)
-        h = torch.cat([h_e, h], -3)

         h_p_move = self.head_p_move(h)
         turn_color = s[:, :1]
@@ -189,10 +189,11 @@ class Environment(BaseEnvironment):

     def __init__(self, args=None):
         super().__init__()
+        self.args = args if args is not None else {}
         self.reset()

-    def reset(self, args={}):
-        self.args = args
+    def reset(self, args=None):
+        self.game_args = args if args is not None else {}
         self.board = -np.ones((6, 6), dtype=np.int32)  # (x, y) -1 is empty
         self.color = self.BLACK
         self.turn_count = -2  # before setting original positions
@@ -343,8 +344,9 @@ def _piece(p):
         s = ' ' + ' '.join(self.Y) + '\n'
         for i in range(6):
             s += self.X[i] + ' ' + ' '.join([self.P[_piece(self.board[i, j])] for j in range(6)]) + '\n'
-        s += 'color = ' + self.C[self.color] + '\n'
-        s += 'record = ' + self.record_string()
+        s += 'remained = B:%d R:%d b:%d r:%d' % tuple(self.piece_cnt) + '\n'
+        s += 'turn = ' + str(self.turn_count).ljust(3) + ' color = ' + self.C[self.color]
+        # s += 'record = ' + self.record_string()
         return s

     def _set(self, layout):
@@ -409,7 +411,7 @@ def diff_info(self, player):

     def update(self, info, reset):
         if reset:
-            self.args = {**self.args, **info}
+            self.game_args = {**self.game_args, **info}
             self.reset(info)
         elif 'set' in info:
             self._set(info['set'])
diff --git a/handyrl/evaluation.py b/handyrl/evaluation.py
index 248f5b6c..45c4d225 100755
--- a/handyrl/evaluation.py
+++ b/handyrl/evaluation.py
@@ -58,7 +58,8 @@ def run(self):
                 reset = args[1]
                 if reset:
                     self.agent.reset(self.env, show=True)
-                view_transition(self.env)
+                else:
+                    view_transition(self.env)
             self.conn.send(ret)
@@ -105,7 +106,7 @@ def exec_match(env, agents, critic=None, show=False, game_args={}):
     outcome = env.outcome()
     if show:
         print('final outcome = %s' % outcome)
-    return outcome
+    return {'result': outcome}


 def exec_network_match(env, network_agents, critic=None, show=False, game_args={}):
@@ -137,7 +138,7 @@ def exec_network_match(env, network_agents, critic=None, show=False, game_args={
     outcome = env.outcome()
     for p, agent in network_agents.items():
         agent.outcome(outcome[p])
-    return outcome
+    return {'result': outcome}


 def build_agent(raw, env=None):
@@ -169,11 +170,11 @@ def execute(self, models, args):
             else:
                 agents[p] = Agent(model)

-        outcome = exec_match(self.env, agents)
-        if outcome is None:
+        results = exec_match(self.env, agents)
+        if results is None:
             print('None episode in evaluation!')
             return None
-        return {'args': args, 'result': outcome, 'opponent': opponent}
+        return {'args': args, 'opponent': opponent, **results}


 def wp_func(results):
@@ -195,10 +196,10 @@ def eval_process_mp_child(agents, critic, env_args, index, in_queue, out_queue,
             print('*** Game %d ***' % g)
             agent_map = {env.players()[p]: agents[ai] for p, ai in enumerate(agent_ids)}
             if isinstance(list(agent_map.values())[0], NetworkAgent):
-                outcome = exec_network_match(env, agent_map, critic, show=show, game_args=game_args)
+                results = exec_network_match(env, agent_map, critic, show=show, game_args=game_args)
             else:
-                outcome = exec_match(env, agent_map, critic, show=show, game_args=game_args)
-            out_queue.put((pat_idx, agent_ids, outcome))
+                results = exec_match(env, agent_map, critic, show=show, game_args=game_args)
+            out_queue.put((pat_idx, agent_ids, results))

     out_queue.put(None)
@@ -245,7 +246,8 @@ def evaluate_mp(env, agents, critic, env_args, args_patterns, num_process, num_g
             if ret is None:
                 finished_cnt += 1
                 continue
-            pat_idx, agent_ids, outcome = ret
+            pat_idx, agent_ids, results = ret
+            outcome = results.get('result')
             if outcome is not None:
                 for idx, p in enumerate(env.players()):
                     agent_id = agent_ids[idx]
@@ -378,14 +380,18 @@ def eval_main(args, argv):
     prepare_env(env_args)
     env = make_env(env_args)

-    model_path = argv[0] if len(argv) >= 1 else 'models/latest.pth'
+    model_paths = argv[0].split(':') if len(argv) >= 1 else ['models/latest.pth']
     num_games = int(argv[1]) if len(argv) >= 2 else 100
     num_process = int(argv[2]) if len(argv) >= 3 else 1

-    agent1 = build_agent(model_path, env)
-    if agent1 is None:
-        model = load_model(model_path, env.net())
-        agent1 = Agent(model)
+    def resolve_agent(model_path):
+        agent = build_agent(model_path, env)
+        if agent is None:
+            model = load_model(model_path, env.net())
+            agent = Agent(model)
+        return agent
+
+    main_agent = resolve_agent(model_paths[0])
     critic = None

     print('%d process, %d games' % (num_process, num_games))
@@ -393,7 +399,8 @@ def eval_main(args, argv):
     seed = random.randrange(1e8)
     print('seed = %d' % seed)

-    agents = [agent1] + [RandomAgent() for _ in range(len(env.players()) - 1)]
+    opponent = model_paths[1] if len(model_paths) > 1 else 'random'
+    agents = [main_agent] + [resolve_agent(opponent) for _ in range(len(env.players()) - 1)]

     evaluate_mp(env, agents, critic, env_args, {'default': {}}, num_process, num_games, seed)
diff --git a/handyrl/losses.py b/handyrl/losses.py
index 2e2f0da3..16e3d69c 100755
--- a/handyrl/losses.py
+++ b/handyrl/losses.py
@@ -17,30 +17,32 @@ def monte_carlo(values, returns):
     return returns, returns - values


-def temporal_difference(values, returns, rewards, lmb, gamma):
+def temporal_difference(values, returns, rewards, lambda_, gamma):
     target_values = deque([returns[:, -1]])
     for i in range(values.size(1) - 2, -1, -1):
         reward = rewards[:, i] if rewards is not None else 0
-        target_values.appendleft(reward + gamma * ((1 - lmb) * values[:, i + 1] + lmb * target_values[0]))
+        lamb = lambda_[:, i + 1]
+        target_values.appendleft(reward + gamma * ((1 - lamb) * values[:, i + 1] + lamb * target_values[0]))

     target_values = torch.stack(tuple(target_values), dim=1)

     return target_values, target_values - values


-def upgo(values, returns, rewards, lmb, gamma):
+def upgo(values, returns, rewards, lambda_, gamma):
     target_values = deque([returns[:, -1]])
     for i in range(values.size(1) - 2, -1, -1):
         value = values[:, i + 1]
         reward = rewards[:, i] if rewards is not None else 0
-        target_values.appendleft(reward + gamma * torch.max(value, (1 - lmb) * value + lmb * target_values[0]))
+        lamb = lambda_[:, i + 1]
+        target_values.appendleft(reward + gamma * torch.max(value, (1 - lamb) * value + lamb * target_values[0]))

     target_values = torch.stack(tuple(target_values), dim=1)

     return target_values, target_values - values


-def vtrace(values, returns, rewards, lmb, gamma, rhos, cs):
+def vtrace(values, returns, rewards, lambda_, gamma, rhos, cs):
     rewards = rewards if rewards is not None else 0
     values_t_plus_1 = torch.cat([values[:, 1:], returns[:, -1:]], dim=1)
     deltas = rhos * (rewards + gamma * values_t_plus_1 - values)
@@ -48,7 +50,7 @@ def vtrace(values, returns, rewards, lmb, gamma, rhos, cs):
     # compute Vtrace value target recursively
     vs_minus_v_xs = deque([deltas[:, -1]])
     for i in range(values.size(1) - 2, -1, -1):
-        vs_minus_v_xs.appendleft(deltas[:, i] + gamma * lmb * cs[:, i] * vs_minus_v_xs[0])
+        vs_minus_v_xs.appendleft(deltas[:, i] + gamma * lambda_[:, i + 1] * cs[:, i] * vs_minus_v_xs[0])

     vs_minus_v_xs = torch.stack(tuple(vs_minus_v_xs), dim=1)
     vs = vs_minus_v_xs + values
@@ -58,18 +60,21 @@ def vtrace(values, returns, rewards, lmb, gamma, rhos, cs):
     return vs, advantages


-def compute_target(algorithm, values, returns, rewards, lmb, gamma, rhos, cs):
+def compute_target(algorithm, values, returns, rewards, lmb, gamma, rhos, cs, masks):
     if values is None:
         # In the absence of a baseline, Monte Carlo returns are used.
         return returns, returns

     if algorithm == 'MC':
         return monte_carlo(values, returns)
-    elif algorithm == 'TD':
-        return temporal_difference(values, returns, rewards, lmb, gamma)
+
+    lambda_ = lmb + (1 - lmb) * (1 - masks)
+
+    if algorithm == 'TD':
+        return temporal_difference(values, returns, rewards, lambda_, gamma)
     elif algorithm == 'UPGO':
-        return upgo(values, returns, rewards, lmb, gamma)
+        return upgo(values, returns, rewards, lambda_, gamma)
     elif algorithm == 'VTRACE':
-        return vtrace(values, returns, rewards, lmb, gamma, rhos, cs)
+        return vtrace(values, returns, rewards, lambda_, gamma, rhos, cs)
     else:
         print('No algorithm named %s' % algorithm)
diff --git a/handyrl/model.py b/handyrl/model.py
index b4dd2a7a..d59bde82 100755
--- a/handyrl/model.py
+++ b/handyrl/model.py
@@ -70,5 +70,5 @@ def __init__(self, model, x):
         outputs = wrapped_model.inference(x, hidden)
         self.output_dict = {key: np.zeros_like(value) for key, value in outputs.items() if key != 'hidden'}

-    def inference(self, *args):
+    def inference(self, *args, **kwargs):
         return self.output_dict
diff --git a/handyrl/train.py b/handyrl/train.py
index 05fb2b9d..7dad705f 100755
--- a/handyrl/train.py
+++ b/handyrl/train.py
@@ -63,24 +63,23 @@ def replace_none(a, b):

     # data that is changed by training configuration
     if args['turn_based_training'] and not args['observation']:
-        obs = [[m['observation'][m['turn'][0]]] for m in moments]
-        prob = np.array([[[m['selected_prob'][m['turn'][0]]]] for m in moments])
-        act = np.array([[m['action'][m['turn'][0]]] for m in moments], dtype=np.int64)[..., np.newaxis]
-        amask = np.array([[m['action_mask'][m['turn'][0]]] for m in moments])
+        players_list = [[m['turn'][0]] for m in moments]
     else:
-        obs = [[replace_none(m['observation'][player], obs_zeros) for player in players] for m in moments]
-        prob = np.array([[[replace_none(m['selected_prob'][player], 1.0)] for player in players] for m in moments])
-        act = np.array([[replace_none(m['action'][player], 0) for player in players] for m in moments], dtype=np.int64)[..., np.newaxis]
-        amask = np.array([[replace_none(m['action_mask'][player], amask_zeros + 1e32) for player in players] for m in moments])
+        players_list = [players for m in moments]
+
+    obs = [[replace_none(m['observation'][player], obs_zeros) for player in players_] for m, players_ in zip(moments, players_list)]
+    prob = np.array([[[replace_none(m['selected_prob'][player], 1.0)] for player in players_] for m, players_ in zip(moments, players_list)])
+    act = np.array([[replace_none(m['action'][player], 0) for player in players_] for m, players_ in zip(moments, players_list)], dtype=np.int64)[..., np.newaxis]
+    amask = np.array([[replace_none(m['action_mask'][player], amask_zeros + 1e32) for player in players_] for m, players_ in zip(moments, players_list)])

     # reshape observation
     obs = rotate(rotate(obs))  # (T, P, ..., ...) -> (P, ..., T, ...) -> (..., T, P, ...)
     obs = bimap_r(obs_zeros, obs, lambda _, o: np.array(o))

     # datum that is not changed by training configuration
-    v = np.array([[replace_none(m['value'][player], [0]) for player in players] for m in moments], dtype=np.float32).reshape(len(moments), len(players), -1)
-    rew = np.array([[replace_none(m['reward'][player], [0]) for player in players] for m in moments], dtype=np.float32).reshape(len(moments), len(players), -1)
-    ret = np.array([[replace_none(m['return'][player], [0]) for player in players] for m in moments], dtype=np.float32).reshape(len(moments), len(players), -1)
+    v = np.array([[replace_none(m['value'][player], 0) for player in players] for m in moments], dtype=np.float32).reshape(len(moments), len(players), -1)
+    rew = np.array([[replace_none(m['reward'][player], 0) for player in players] for m in moments], dtype=np.float32).reshape(len(moments), len(players), -1)
+    ret = np.array([[replace_none(m['return'][player], 0) for player in players] for m in moments], dtype=np.float32).reshape(len(moments), len(players), -1)
     oc = np.array([ep['outcome'][player] for player in players], dtype=np.float32).reshape(1, len(players), -1)

     emask = np.ones((len(moments), 1, 1), dtype=np.float32)  # episode mask
@@ -224,6 +223,9 @@ def compute_loss(batch, model, hidden, args):
     actions = batch['action']
     emasks = batch['episode_mask']
+    omasks = batch['observation_mask']
+    value_target_masks, return_target_masks = omasks, omasks
+
     clip_rho_threshold, clip_c_threshold = 1.0, 1.0

     log_selected_b_policies = torch.log(torch.clamp(batch['selected_prob'], 1e-16, 1)) * emasks
@@ -239,16 +241,18 @@ def compute_loss(batch, model, hidden, args):

     if 'value' in outputs_nograd:
         values_nograd = outputs_nograd['value']
         if args['turn_based_training'] and values_nograd.size(2) == 2:  # two player zerosum game
-            values_nograd_opponent = -torch.stack([values_nograd[:, :, 1], values_nograd[:, :, 0]], dim=2)
-            values_nograd = (values_nograd + values_nograd_opponent) / (batch['observation_mask'].sum(dim=2, keepdim=True) + 1e-8)
+            values_nograd_opponent = -torch.flip(values_nograd, dims=[2])
+            omasks_opponent = torch.flip(omasks, dims=[2])
+            values_nograd = (values_nograd * omasks + values_nograd_opponent * omasks_opponent) / (omasks + omasks_opponent + 1e-8)
+            value_target_masks = torch.clamp(omasks + omasks_opponent, 0, 1)
         outputs_nograd['value'] = values_nograd * emasks + batch['outcome'] * (1 - emasks)

     # compute targets and advantage
     targets = {}
     advantages = {}
-    value_args = outputs_nograd.get('value', None), batch['outcome'], None, args['lambda'], 1, clipped_rhos, cs
-    return_args = outputs_nograd.get('return', None), batch['return'], batch['reward'], args['lambda'], args['gamma'], clipped_rhos, cs
+    value_args = outputs_nograd.get('value', None), batch['outcome'], None, args['lambda'], 1, clipped_rhos, cs, value_target_masks
+    return_args = outputs_nograd.get('return', None), batch['return'], batch['reward'], args['lambda'], args['gamma'], clipped_rhos, cs, return_target_masks

     targets['value'], advantages['value'] = compute_target(args['value_target'], *value_args)
     targets['return'], advantages['return'] = compute_target(args['value_target'], *return_args)
@@ -286,11 +290,16 @@ def run(self):

     def select_episode(self):
         while True:
-            ep_idx = random.randrange(min(len(self.episodes), self.args['maximum_episodes']))
-            accept_rate = 1 - (len(self.episodes) - 1 - ep_idx) / self.args['maximum_episodes']
-            if random.random() < accept_rate:
+            ep_count = min(len(self.episodes), self.args['maximum_episodes'])
+            ep_idx = random.randrange(ep_count)
+            accept_rate = 1 - (ep_count - 1 - ep_idx) / ep_count
+            if random.random() >= accept_rate:
+                continue
+            try:
+                ep = self.episodes[ep_idx]
                 break
-        ep = self.episodes[ep_idx]
+            except IndexError:
+                continue
         turn_candidates = 1 + max(0, ep['steps'] - self.args['forward_steps'])  # change start turn by sequence length
         train_st = random.randrange(turn_candidates)
         st = max(0, train_st - self.args['burn_in_steps'])
@@ -427,7 +436,7 @@ def __init__(self, args, net=None, remote=False):
         self.worker = WorkerServer(args) if remote else WorkerCluster(args)

         # thread connection
-        self.trainer = Trainer(args, self.model)
+        self.trainer = Trainer(args, copy.deepcopy(self.model))

     def model_path(self, model_id):
         return os.path.join('models', str(model_id) + '.pth')
diff --git a/scripts/win_rate_plot.py b/scripts/win_rate_plot.py
index cf18878e..8f7e4e8e 100644
--- a/scripts/win_rate_plot.py
+++ b/scripts/win_rate_plot.py
@@ -87,8 +87,9 @@ def get_wp_list(path):
 for opponent in opponents:
     wp_list = averaged_wp_lists[opponent]
     start = start_epoch[opponent]
-    # ax.plot(clipped_epoch_list[start:], wp_list[start:], label=opponent)
-    ax.plot(clipped_game_list[start:], wp_list[start:], label=opponent)
+    end = min(min(len(clipped_epoch_list), len(clipped_game_list)), len(wp_list))
+    # ax.plot(clipped_epoch_list[start:end], wp_list[start:end], label=opponent)
+    ax.plot(clipped_game_list[start:end], wp_list[start:end], label=opponent)
     last_win_rate[opponent] = wp_list[-1]

 ax.set_xlabel('Games', size=14)
diff --git a/tests/test_environment.py b/tests/test_environment.py
index 14ff9f64..5f0e9c23 100644
--- a/tests/test_environment.py
+++ b/tests/test_environment.py
@@ -8,7 +8,7 @@
     'tictactoe',
     'geister',
     'parallel_tictactoe',
-    'kaggle.hungry_geese',
+    # 'kaggle.hungry_geese',
 ]
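
For reference, the losses.py change threads a per-player observation mask into compute_target and folds it into a per-step lambda, lambda_ = lmb + (1 - lmb) * (1 - masks), so steps the player did not observe get an effective lambda of 1 and their value estimates are bootstrapped past rather than trusted. Below is a minimal standalone sketch of that masked TD(lambda) target, assuming (batch, time, 1) tensors and hypothetical toy data; it mirrors the patched temporal_difference helper but is an illustration, not part of the patch.

# sketch.py -- illustration only; names mirror the patched handyrl/losses.py
from collections import deque

import torch


def temporal_difference(values, returns, rewards, lambda_, gamma):
    # Same recursion as the patched helper: lambda_ is per step, not a scalar.
    target_values = deque([returns[:, -1]])
    for i in range(values.size(1) - 2, -1, -1):
        reward = rewards[:, i] if rewards is not None else 0
        lamb = lambda_[:, i + 1]
        target_values.appendleft(reward + gamma * ((1 - lamb) * values[:, i + 1] + lamb * target_values[0]))
    target_values = torch.stack(tuple(target_values), dim=1)
    return target_values, target_values - values


B, T = 2, 4
values = torch.rand(B, T, 1)
returns = torch.rand(B, T, 1)
# observation mask: 1 where the player observed the step, 0 elsewhere (hypothetical toy data)
masks = torch.tensor([[[1.], [0.], [1.], [1.]],
                      [[1.], [1.], [0.], [1.]]])
lmb = 0.7

# Where masks == 0 the effective lambda becomes 1, so the target skips that
# step's value estimate and keeps bootstrapping from later steps.
lambda_ = lmb + (1 - lmb) * (1 - masks)
targets, advantages = temporal_difference(values, returns, None, lambda_, gamma=1)
print(targets.shape, advantages.shape)  # torch.Size([2, 4, 1]) twice

Separately, with the evaluation.py change the first positional argument to eval_main may hold two model paths joined by ':' (e.g. models/1.pth:models/2.pth); when only one path is given, the opponent falls back to the 'random' agent, as in the patch.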