From 80b54fea2f0404a2358eac6d3a59828f8a529475 Mon Sep 17 00:00:00 2001
From: YuriCat
Date: Mon, 31 Jan 2022 09:06:31 +0900
Subject: [PATCH 01/26] feature: apply lambda=1 in timesteps where there is no value output

---
 handyrl/losses.py | 27 ++++++++++++++++-----------
 handyrl/train.py  |  7 +++++--
 2 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/handyrl/losses.py b/handyrl/losses.py
index 2e2f0da3..326d6639 100755
--- a/handyrl/losses.py
+++ b/handyrl/losses.py
@@ -17,30 +17,32 @@ def monte_carlo(values, returns):
     return returns, returns - values


-def temporal_difference(values, returns, rewards, lmb, gamma):
+def temporal_difference(values, returns, rewards, lambda_, gamma):
     target_values = deque([returns[:, -1]])
     for i in range(values.size(1) - 2, -1, -1):
         reward = rewards[:, i] if rewards is not None else 0
-        target_values.appendleft(reward + gamma * ((1 - lmb) * values[:, i + 1] + lmb * target_values[0]))
+        lamb = lambda_[:, i + 1]
+        target_values.appendleft(reward + gamma * ((1 - lamb) * values[:, i + 1] + lamb * target_values[0]))

     target_values = torch.stack(tuple(target_values), dim=1)

     return target_values, target_values - values


-def upgo(values, returns, rewards, lmb, gamma):
+def upgo(values, returns, rewards, lambda_, gamma):
     target_values = deque([returns[:, -1]])
     for i in range(values.size(1) - 2, -1, -1):
         value = values[:, i + 1]
         reward = rewards[:, i] if rewards is not None else 0
-        target_values.appendleft(reward + gamma * torch.max(value, (1 - lmb) * value + lmb * target_values[0]))
+        lamb = lambda_[:, i + 1]
+        target_values.appendleft(reward + gamma * torch.max(value, (1 - lamb) * value + lamb * target_values[0]))

     target_values = torch.stack(tuple(target_values), dim=1)

     return target_values, target_values - values


-def vtrace(values, returns, rewards, lmb, gamma, rhos, cs):
+def vtrace(values, returns, rewards, lambda_, gamma, rhos, cs):
     rewards = rewards if rewards is not None else 0
     values_t_plus_1 = torch.cat([values[:, 1:], returns[:, -1:]], dim=1)
     deltas = rhos * (rewards + gamma * values_t_plus_1 - values)
@@ -48,7 +50,7 @@
     # compute Vtrace value target recursively
     vs_minus_v_xs = deque([deltas[:, -1]])
     for i in range(values.size(1) - 2, -1, -1):
-        vs_minus_v_xs.appendleft(deltas[:, i] + gamma * lmb * cs[:, i] * vs_minus_v_xs[0])
+        vs_minus_v_xs.appendleft(deltas[:, i] + gamma * lambda_[:, i + 1] * cs[:, i] * vs_minus_v_xs[0])

     vs_minus_v_xs = torch.stack(tuple(vs_minus_v_xs), dim=1)
     vs = vs_minus_v_xs + values
@@ -58,18 +60,21 @@
     return vs, advantages


-def compute_target(algorithm, values, returns, rewards, lmb, gamma, rhos, cs):
+def compute_target(algorithm, values, returns, rewards, lmb, gamma, rhos, cs, masks):
     if values is None:
         # In the absence of a baseline, Monte Carlo returns are used.
         return returns, returns

     if algorithm == 'MC':
         return monte_carlo(values, returns)
-    elif algorithm == 'TD':
-        return temporal_difference(values, returns, rewards, lmb, gamma)
+
+    lambda_ = lmb + (1 - lmb) * masks
+
+    if algorithm == 'TD':
+        return temporal_difference(values, returns, rewards, lambda_, gamma)
     elif algorithm == 'UPGO':
-        return upgo(values, returns, rewards, lmb, gamma)
+        return upgo(values, returns, rewards, lambda_, gamma)
     elif algorithm == 'VTRACE':
-        return vtrace(values, returns, rewards, lmb, gamma, rhos, cs)
+        return vtrace(values, returns, rewards, lambda_, gamma, rhos, cs)
     else:
         print('No algorithm named %s' % algorithm)
diff --git a/handyrl/train.py b/handyrl/train.py
index 031baa35..b3e4e061 100755
--- a/handyrl/train.py
+++ b/handyrl/train.py
@@ -219,6 +219,7 @@ def compute_loss(batch, model, hidden, args):

     actions = batch['action']
     emasks = batch['episode_mask']
+    target_masks = batch['observation_mask']
     clip_rho_threshold, clip_c_threshold = 1.0, 1.0

     log_selected_b_policies = torch.log(torch.clamp(batch['selected_prob'], 1e-16, 1)) * emasks
@@ -236,14 +237,16 @@ def compute_loss(batch, model, hidden, args):
         if args['turn_based_training'] and values_nograd.size(2) == 2:  # two player zerosum game
             values_nograd_opponent = -torch.stack([values_nograd[:, :, 1], values_nograd[:, :, 0]], dim=2)
             values_nograd = (values_nograd + values_nograd_opponent) / (batch['observation_mask'].sum(dim=2, keepdim=True) + 1e-8)
+            target_masks_inv = torch.stack([batch['observation_mask'][:, :, 1], values_nograd[:, :, 0]], dim=2)
+            target_masks = torch.clamp(target_masks + target_masks_inv, 0, 1)
         outputs_nograd['value'] = values_nograd * emasks + batch['outcome'] * (1 - emasks)

     # compute targets and advantage
     targets = {}
     advantages = {}
-    value_args = outputs_nograd.get('value', None), batch['outcome'], None, args['lambda'], 1, clipped_rhos, cs
-    return_args = outputs_nograd.get('return', None), batch['return'], batch['reward'], args['lambda'], args['gamma'], clipped_rhos, cs
+    value_args = outputs_nograd.get('value', None), batch['outcome'], None, args['lambda'], 1, clipped_rhos, cs, target_masks
+    return_args = outputs_nograd.get('return', None), batch['return'], batch['reward'], args['lambda'], args['gamma'], clipped_rhos, cs, target_masks
     targets['value'], advantages['value'] = compute_target(args['value_target'], *value_args)
     targets['return'], advantages['return'] = compute_target(args['value_target'], *return_args)

From 31b8fb9b486ee6db93bef125da8197fcb1e328d1 Mon Sep 17 00:00:00 2001
From: YuriCat
Date: Sat, 26 Feb 2022 03:45:40 +0900
Subject: [PATCH 02/26] chore: remove fileno() interface from PickledConnection

---
 handyrl/connection.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/handyrl/connection.py b/handyrl/connection.py
index 3550f153..8880470b 100755
--- a/handyrl/connection.py
+++ b/handyrl/connection.py
@@ -29,9 +29,6 @@ def close(self):
         self.conn.close()
         self.conn = None

-    def fileno(self):
-        return self.conn.fileno()
-
     def _recv(self, size):
         buf = io.BytesIO()
         while size > 0:

From bcf7c4bb799be03c204e5263a7c0c14834a07ecf Mon Sep 17 00:00:00 2001
From: YuriCat
Date: Thu, 10 Mar 2022 20:29:32 +0900
Subject: [PATCH 03/26] fix: reversed target mask

---
 handyrl/losses.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/handyrl/losses.py b/handyrl/losses.py
index 326d6639..16e3d69c 100755
--- a/handyrl/losses.py
+++ b/handyrl/losses.py
@@ -68,7 +68,7 @@ def compute_target(algorithm, values, returns, rewards, lmb, gamma, rhos, cs, ma
     if algorithm == 'MC':
         return monte_carlo(values, returns)

-    lambda_ = lmb + (1 - lmb) * masks
+    lambda_ = lmb + (1 - lmb) * (1 - masks)

     if algorithm == 'TD':
         return temporal_difference(values, returns, rewards, lambda_, gamma)

From 5da6f59871853048b943c3067a4640a685f05bcf Mon Sep 17 00:00:00 2001
From: YuriCat
Date: Thu, 10 Mar 2022 20:30:40 +0900
Subject: [PATCH 04/26] fix: value_target_masks and return_target_masks should be different

---
 handyrl/train.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/handyrl/train.py b/handyrl/train.py
index b3e4e061..aecd6528 100755
--- a/handyrl/train.py
+++ b/handyrl/train.py
@@ -219,7 +219,7 @@ def compute_loss(batch, model, hidden, args):

     actions = batch['action']
     emasks = batch['episode_mask']
-    target_masks = batch['observation_mask']
+    value_target_masks, return_target_masks = batch['observation_mask'], batch['observation_mask']
     clip_rho_threshold, clip_c_threshold = 1.0, 1.0

     log_selected_b_policies = torch.log(torch.clamp(batch['selected_prob'], 1e-16, 1)) * emasks
@@ -237,16 +237,16 @@ def compute_loss(batch, model, hidden, args):
         if args['turn_based_training'] and values_nograd.size(2) == 2:  # two player zerosum game
             values_nograd_opponent = -torch.stack([values_nograd[:, :, 1], values_nograd[:, :, 0]], dim=2)
             values_nograd = (values_nograd + values_nograd_opponent) / (batch['observation_mask'].sum(dim=2, keepdim=True) + 1e-8)
-            target_masks_inv = torch.stack([batch['observation_mask'][:, :, 1], values_nograd[:, :, 0]], dim=2)
-            target_masks = torch.clamp(target_masks + target_masks_inv, 0, 1)
+            value_target_masks_inv = torch.stack([value_target_masks[:, :, 1], value_target_masks[:, :, 0]], dim=2)
+            value_target_masks = torch.clamp(value_target_masks + value_target_masks_inv, 0, 1)
         outputs_nograd['value'] = values_nograd * emasks + batch['outcome'] * (1 - emasks)

     # compute targets and advantage
     targets = {}
     advantages = {}
-    value_args = outputs_nograd.get('value', None), batch['outcome'], None, args['lambda'], 1, clipped_rhos, cs, target_masks
-    return_args = outputs_nograd.get('return', None), batch['return'], batch['reward'], args['lambda'], args['gamma'], clipped_rhos, cs, target_masks
+    value_args = outputs_nograd.get('value', None), batch['outcome'], None, args['lambda'], 1, clipped_rhos, cs, value_target_masks
+    return_args = outputs_nograd.get('return', None), batch['return'], batch['reward'], args['lambda'], args['gamma'], clipped_rhos, cs, return_target_masks
     targets['value'], advantages['value'] = compute_target(args['value_target'], *value_args)
     targets['return'], advantages['return'] = compute_target(args['value_target'], *return_args)

From f6aee6c6ea4a3a6d51efed50bf2c9e5b4bbbab0f Mon Sep 17 00:00:00 2001
From: YuriCat
Date: Wed, 23 Mar 2022 08:15:27 +0900
Subject: [PATCH 05/26] feature: update geister board view

---
 handyrl/envs/geister.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/handyrl/envs/geister.py b/handyrl/envs/geister.py
index a82bd6af..255a41a3 100755
--- a/handyrl/envs/geister.py
+++ b/handyrl/envs/geister.py
@@ -343,8 +343,9 @@ def _piece(p):
         s = ' ' + ' '.join(self.Y) + '\n'
         for i in range(6):
             s += self.X[i] + ' ' + ' '.join([self.P[_piece(self.board[i, j])] for j in range(6)]) + '\n'
-        s += 'color = ' + self.C[self.color] + '\n'
-        s += 'record = ' + self.record_string()
+        s += 'remained = B:%d R:%d b:%d r:%d' % tuple(self.piece_cnt) + '\n'
+        s += 'turn = ' + str(self.turn_count).ljust(3) + ' color = ' + self.C[self.color]
+        # s += 'record = ' + self.record_string()
         return s

     def _set(self, layout):

From 400afaced20fe0b807ccaea58d50e9084d1bf47b Mon Sep 17 00:00:00 2001
From: YuriCat
Date: Thu, 24 Mar 2022 20:50:45 +0900
Subject: [PATCH 06/26] feature: divide args and game_args in Geister environment

---
 handyrl/envs/geister.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/handyrl/envs/geister.py b/handyrl/envs/geister.py
index 255a41a3..69a7f88e 100755
--- a/handyrl/envs/geister.py
+++ b/handyrl/envs/geister.py
@@ -189,10 +189,11 @@ class Environment(BaseEnvironment):

     def __init__(self, args=None):
         super().__init__()
+        self.args = args if args is not None else {}
         self.reset()

-    def reset(self, args={}):
-        self.args = args
+    def reset(self, args=None):
+        self.game_args = args if args is not None else {}
         self.board = -np.ones((6, 6), dtype=np.int32)  # (x, y) -1 is empty
         self.color = self.BLACK
         self.turn_count = -2  # before setting original positions
@@ -410,7 +411,7 @@ def diff_info(self, player):
     def update(self, info, reset):
         if reset:
-            self.args = {**self.args, **info}
+            self.game_args = {**self.game_args, **info}
             self.reset(info)
         elif 'set' in info:
             self._set(info['set'])

From 834a8fa41000091c94e384cb13293510ae038102 Mon Sep 17 00:00:00 2001
From: YuriCat
Date: Wed, 27 Apr 2022 18:40:43 +0900
Subject: [PATCH 07/26] fix: set key for rule-based agents

---
 handyrl/agent.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/handyrl/agent.py b/handyrl/agent.py
index 86d2c08e..44f503ab 100755
--- a/handyrl/agent.py
+++ b/handyrl/agent.py
@@ -24,7 +24,7 @@ def observe(self, env, player, show=False):

 class RuleBasedAgent(RandomAgent):
     def __init__(self, key=None):
-        self.key = None
+        self.key = key

     def action(self, env, player, show=False):
         if hasattr(env, 'rule_based_action'):

From 6aadc8cde516151da31e9f713af3c7e279ece81b Mon Sep 17 00:00:00 2001
From: YuriCat
Date: Sat, 30 Apr 2022 22:17:22 +0900
Subject: [PATCH 08/26] feature: update geister net v_filters 1 -> 2

---
 handyrl/envs/geister.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/handyrl/envs/geister.py b/handyrl/envs/geister.py
index 69a7f88e..fb8254ec 100755
--- a/handyrl/envs/geister.py
+++ b/handyrl/envs/geister.py
@@ -131,7 +131,8 @@ class GeisterNet(nn.Module):
     def __init__(self):
         super().__init__()

-        layers, filters, p_filters = 3, 32, 8
+        layers, filters = 3, 32
+        p_filters, v_filters = 8, 2
         input_channels = 7 + 18  # board channels + scalar inputs
         self.input_size = (input_channels, 6, 6)
@@ -141,8 +142,8 @@ def __init__(self):

         self.head_p_move = Conv2dHead((filters * 2, 6, 6), p_filters, 4)
         self.head_p_set = nn.Linear(1, 70, bias=True)
-        self.head_v = ScalarHead((filters * 2, 6, 6), 1, 1)
-        self.head_r = ScalarHead((filters * 2, 6, 6), 1, 1)
+        self.head_v = ScalarHead((filters * 2, 6, 6), v_filters, 1)
+        self.head_r = ScalarHead((filters * 2, 6, 6), v_filters, 1)

     def init_hidden(self, batch_size=[]):
         return self.body.init_hidden(self.input_size[1:], batch_size)

From 13f9b12ed3ef21c7deeb085b6c5e98b92d788018 Mon Sep 17 00:00:00 2001
From: YuriCat
Date: Mon, 2 May 2022 01:30:59 +0900
Subject: [PATCH 09/26] feature: remove skip connection to heads in geister net

---
 handyrl/envs/geister.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/handyrl/envs/geister.py b/handyrl/envs/geister.py
index fb8254ec..f66d9741 100755
--- a/handyrl/envs/geister.py
+++ b/handyrl/envs/geister.py
@@ -140,10 +140,10 @@ def __init__(self):
         self.bn1 = nn.BatchNorm2d(filters)
         self.body = DRC(layers, filters, filters)

-        self.head_p_move = Conv2dHead((filters * 2, 6, 6), p_filters, 4)
+        self.head_p_move = Conv2dHead((filters, 6, 6), p_filters, 4)
         self.head_p_set = nn.Linear(1, 70, bias=True)
-        self.head_v = ScalarHead((filters * 2, 6, 6), v_filters, 1)
-        self.head_r = ScalarHead((filters * 2, 6, 6), v_filters, 1)
+        self.head_v = ScalarHead((filters, 6, 6), v_filters, 1)
+        self.head_r = ScalarHead((filters, 6, 6), v_filters, 1)

     def init_hidden(self, batch_size=[]):
         return self.body.init_hidden(self.input_size[1:], batch_size)
@@ -155,7 +155,6 @@ def forward(self, x, hidden):
         h_e = F.relu(self.bn1(self.conv1(h)))
         h, hidden = self.body(h_e, hidden, num_repeats=3)
-        h = torch.cat([h_e, h], -3)
         h_p_move = self.head_p_move(h)

         turn_color = s[:, :1]

From 4160315be47caa78845b7c7d685489f61cd9b239 Mon Sep 17 00:00:00 2001
From: YuriCat
Date: Thu, 12 May 2022 20:12:12 +0900
Subject: [PATCH 10/26] feature: opponent selection by ':'

---
 handyrl/evaluation.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/handyrl/evaluation.py b/handyrl/evaluation.py
index 248f5b6c..18a39b21 100755
--- a/handyrl/evaluation.py
+++ b/handyrl/evaluation.py
@@ -378,14 +378,18 @@ def eval_main(args, argv):
     prepare_env(env_args)
     env = make_env(env_args)

-    model_path = argv[0] if len(argv) >= 1 else 'models/latest.pth'
+    model_paths = argv[0].split(':') if len(argv) >= 1 else ['models/latest.pth']
     num_games = int(argv[1]) if len(argv) >= 2 else 100
     num_process = int(argv[2]) if len(argv) >= 3 else 1

-    agent1 = build_agent(model_path, env)
-    if agent1 is None:
-        model = load_model(model_path, env.net())
-        agent1 = Agent(model)
+    def resolve_agent(model_path):
+        agent = build_agent(model_path, env)
+        if agent is None:
+            model = load_model(model_path, env.net())
+            agent = Agent(model)
+        return agent
+
+    main_agent = resolve_agent(model_paths[0])
     critic = None

     print('%d process, %d games' % (num_process, num_games))
@@ -393,7 +397,8 @@ def eval_main(args, argv):
     seed = random.randrange(1e8)
     print('seed = %d' % seed)

-    agents = [agent1] + [RandomAgent() for _ in range(len(env.players()) - 1)]
+    opponent = model_paths[1] if len(model_paths) > 1 else 'random'
+    agents = [main_agent] + [resolve_agent(opponent) for _ in range(len(env.players()) - 1)]

     evaluate_mp(env, agents, critic, env_args, {'default': {}}, num_process, num_games, seed)

From 30ca00cd79c74c35c02d6afe7b2a37ddcdb360cd Mon Sep 17 00:00:00 2001
From: YuriCat
Date: Thu, 26 May 2022 23:47:14 +0900
Subject: [PATCH 11/26] feature: divide ep count variable

---
 handyrl/train.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/handyrl/train.py b/handyrl/train.py
index 05fb2b9d..75c0a31b 100755
--- a/handyrl/train.py
+++ b/handyrl/train.py
@@ -286,8 +286,9 @@ def run(self):

     def select_episode(self):
         while True:
-            ep_idx = random.randrange(min(len(self.episodes), self.args['maximum_episodes']))
-            accept_rate = 1 - (len(self.episodes) - 1 - ep_idx) / self.args['maximum_episodes']
+            ep_count = min(len(self.episodes), self.args['maximum_episodes'])
+            ep_idx = random.randrange(ep_count)
+            accept_rate = 1 - (ep_count - 1 - ep_idx) / self.args['maximum_episodes']
             if random.random() < accept_rate:
                 break
         ep = self.episodes[ep_idx]

From bcc4d7af846280cbb04369d542a353d164d11c42 Mon Sep 17 00:00:00 2001
From: YuriCat
Date: Sat, 28 May 2022 07:36:58 +0900
Subject: [PATCH 12/26] fix: stop calling view_transition() in reset phase

---
 handyrl/evaluation.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/handyrl/evaluation.py b/handyrl/evaluation.py
index 248f5b6c..8783ce86 100755
--- a/handyrl/evaluation.py
+++ b/handyrl/evaluation.py
@@ -58,7 +58,8 @@ def run(self):
             reset = args[1]
             if reset:
                 self.agent.reset(self.env, show=True)
-            view_transition(self.env)
+            else:
+                view_transition(self.env)
         self.conn.send(ret)

From 1f8aa05d7327113cdb74d1889dc98a5222d474a4 Mon Sep 17 00:00:00 2001
From: YuriCat
Date: Mon, 13 Jun 2022 23:38:11 +0900
Subject: [PATCH 13/26] chore: add kwargs to random model

---
 handyrl/model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/handyrl/model.py b/handyrl/model.py
index b4dd2a7a..d59bde82 100755
--- a/handyrl/model.py
+++ b/handyrl/model.py
@@ -70,5 +70,5 @@ def __init__(self, model, x):
         outputs = wrapped_model.inference(x, hidden)
         self.output_dict = {key: np.zeros_like(value) for key, value in outputs.items() if key != 'hidden'}

-    def inference(self, *args):
+    def inference(self, *args, **kwargs):
         return self.output_dict

From 58737c61f325340ef610822d39a9bb06e92929d9 Mon Sep 17 00:00:00 2001
From: YuriCat
Date: Wed, 6 Jul 2022 23:21:14 +0900
Subject: [PATCH 14/26] feature: add python3.10 test

---
 .github/workflows/action.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/action.yaml b/.github/workflows/action.yaml
index 4b511d92..698e729f 100644
--- a/.github/workflows/action.yaml
+++ b/.github/workflows/action.yaml
@@ -12,7 +12,7 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-latest, macos-latest, windows-latest]
-        python-version: [3.7, 3.8, 3.9]
+        python-version: ['3.7', '3.8', '3.9', '3.10']
     steps:
     - name: Checkout
       uses: actions/checkout@v2

From e959c6166909bb6a0a1d7ae4855dc740326fda45 Mon Sep 17 00:00:00 2001
From: YuriCat
Date: Tue, 26 Jul 2022 16:47:01 +0900
Subject: [PATCH 15/26] feature: proportional accept rate during all phases

---
 handyrl/train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/handyrl/train.py b/handyrl/train.py
index 75c0a31b..fe8ac0d4 100755
--- a/handyrl/train.py
+++ b/handyrl/train.py
@@ -288,7 +288,7 @@ def select_episode(self):
         while True:
             ep_count = min(len(self.episodes), self.args['maximum_episodes'])
             ep_idx = random.randrange(ep_count)
-            accept_rate = 1 - (ep_count - 1 - ep_idx) / self.args['maximum_episodes']
+            accept_rate = 1 - (ep_count - 1 - ep_idx) / ep_count
             if random.random() < accept_rate:
                 break
         ep = self.episodes[ep_idx]

From 9909c4138d8ba2101997b670f5f941b2859cee69 Mon Sep 17 00:00:00 2001
From: YuriCat
Date: Thu, 24 Nov 2022 16:37:42 +0900
Subject: [PATCH 16/26] fix: episode selection index error

---
 handyrl/train.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/handyrl/train.py b/handyrl/train.py
index fe8ac0d4..dd0b257c 100755
--- a/handyrl/train.py
+++ b/handyrl/train.py
@@ -289,9 +289,13 @@ def select_episode(self):
             ep_count = min(len(self.episodes), self.args['maximum_episodes'])
             ep_idx = random.randrange(ep_count)
             accept_rate = 1 - (ep_count - 1 - ep_idx) / ep_count
-            if random.random() < accept_rate:
+            if random.random() >= accept_rate:
+                continue
+            try:
+                ep = self.episodes[ep_idx]
                 break
-        ep = self.episodes[ep_idx]
+            except IndexError:
+                continue
         turn_candidates = 1 + max(0, ep['steps'] - self.args['forward_steps'])  # change start turn by sequence length
         train_st = random.randrange(turn_candidates)
         st = max(0, train_st - self.args['burn_in_steps'])

From db9bc20f871afb0652ef4308a7032c067d614d0c Mon Sep 17 00:00:00 2001
From: YuriCat
Date: Tue, 13 Dec 2022 21:03:20 +0900
Subject: [PATCH 17/26] fix: prevent array length error in win_rate_plot.py

---
 scripts/win_rate_plot.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/scripts/win_rate_plot.py b/scripts/win_rate_plot.py
index cf18878e..8f7e4e8e 100644
--- a/scripts/win_rate_plot.py
+++ b/scripts/win_rate_plot.py
@@ -87,8 +87,9 @@ def get_wp_list(path):
     for opponent in opponents:
         wp_list = averaged_wp_lists[opponent]
         start = start_epoch[opponent]
-        # ax.plot(clipped_epoch_list[start:], wp_list[start:], label=opponent)
-        ax.plot(clipped_game_list[start:], wp_list[start:], label=opponent)
+        end = min(min(len(clipped_epoch_list), len(clipped_game_list)), len(wp_list))
+        # ax.plot(clipped_epoch_list[start:end], wp_list[start:end], label=opponent)
+        ax.plot(clipped_game_list[start:end], wp_list[start:end], label=opponent)
         last_win_rate[opponent] = wp_list[-1]

 ax.set_xlabel('Games', size=14)

From 875313dea181bbf563c292acac7ae893b781fa2f Mon Sep 17 00:00:00 2001
From: YuriCat
Date: Thu, 22 Dec 2022 22:56:53 +0900
Subject: [PATCH 18/26] feature: return dict from evaluation function (same key)

---
 handyrl/evaluation.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/handyrl/evaluation.py b/handyrl/evaluation.py
index 2d391bce..4ffe0192 100755
--- a/handyrl/evaluation.py
+++ b/handyrl/evaluation.py
@@ -106,7 +106,7 @@ def exec_match(env, agents, critic=None, show=False, game_args={}):
     outcome = env.outcome()
     if show:
         print('final outcome = %s' % outcome)
-    return outcome
+    return {'result': outcome}


 def exec_network_match(env, network_agents, critic=None, show=False, game_args={}):
@@ -138,7 +138,7 @@ def exec_network_match(env, network_agents, critic=None, show=False, game_args={
     outcome = env.outcome()
     for p, agent in network_agents.items():
         agent.outcome(outcome[p])
-    return outcome
+    return {'result': outcome}


 def build_agent(raw, env=None):
@@ -170,11 +170,11 @@ def execute(self, models, args):
             else:
                 agents[p] = Agent(model)

-        outcome = exec_match(self.env, agents)
-        if outcome is None:
+        results = exec_match(self.env, agents)
+        if results is None:
             print('None episode in evaluation!')
             return None
-        return {'args': args, 'result': outcome, 'opponent': opponent}
+        return {'args': args, 'opponent': opponent, **results}


 def wp_func(results):
@@ -196,10 +196,10 @@ def eval_process_mp_child(agents, critic, env_args, index, in_queue, out_queue,
             print('*** Game %d ***' % g)
         agent_map = {env.players()[p]: agents[ai] for p, ai in enumerate(agent_ids)}
         if isinstance(list(agent_map.values())[0], NetworkAgent):
-            outcome = exec_network_match(env, agent_map, critic, show=show, game_args=game_args)
+            results = exec_network_match(env, agent_map, critic, show=show, game_args=game_args)
         else:
-            outcome = exec_match(env, agent_map, critic, show=show, game_args=game_args)
-        out_queue.put((pat_idx, agent_ids, outcome))
+            results = exec_match(env, agent_map, critic, show=show, game_args=game_args)
+        out_queue.put((pat_idx, agent_ids, results))
     out_queue.put(None)


@@ -246,7 +246,8 @@ def evaluate_mp(env, agents, critic, env_args, args_patterns, num_process, num_g
             if ret is None:
                 finished_cnt += 1
                 continue
-            pat_idx, agent_ids, outcome = ret
+            pat_idx, agent_ids, results = ret
+            outcome = results.get('outcome')
             if outcome is not None:
                 for idx, p in enumerate(env.players()):
                     agent_id = agent_ids[idx]

From 50dcefdb020f607a3e1ec5a128ed98a21294ad65 Mon Sep 17 00:00:00 2001
From: YuriCat
Date: Thu, 22 Dec 2022 23:46:02 +0900
Subject: [PATCH 19/26] fix: output dict key outcome -> result

---
 handyrl/evaluation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/handyrl/evaluation.py b/handyrl/evaluation.py
index 4ffe0192..45c4d225 100755
--- a/handyrl/evaluation.py
+++ b/handyrl/evaluation.py
@@ -247,7 +247,7 @@ def evaluate_mp(env, agents, critic, env_args, args_patterns, num_process, num_g
                 finished_cnt += 1
                 continue
             pat_idx, agent_ids, results = ret
-            outcome = results.get('outcome')
+            outcome = results.get('result')
             if outcome is not None:
                 for idx, p in enumerate(env.players()):
                     agent_id = agent_ids[idx]

From d63dfe40b2b41d5d3cfb0e149722adc4ebe5f3e5 Mon Sep 17 00:00:00 2001
From: YuriCat
Date: Thu, 5 Jan 2023 08:48:05 +0900
Subject: [PATCH 20/26] fix: remove map_location=cpu error after starting training

---
 handyrl/train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/handyrl/train.py b/handyrl/train.py
index dd0b257c..93b93a97 100755
--- a/handyrl/train.py
+++ b/handyrl/train.py
@@ -432,7 +432,7 @@ def __init__(self, args, net=None, remote=False):
         self.worker = WorkerServer(args) if remote else WorkerCluster(args)

         # thread connection
-        self.trainer = Trainer(args, self.model)
+        self.trainer = Trainer(args, copy.deepcopy(self.model))

     def model_path(self, model_id):
         return os.path.join('models', str(model_id) + '.pth')

From c64b6b6d78c696d261afe4dd84b8aec77762455e Mon Sep 17 00:00:00 2001
From: YuriCat
Date: Thu, 5 Jan 2023 17:32:20 +0900
Subject: [PATCH 21/26] feature: apply omask for two-player value averaging for solo-play episodes

---
 handyrl/train.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/handyrl/train.py b/handyrl/train.py
index dd0b257c..1e0b9500 100755
--- a/handyrl/train.py
+++ b/handyrl/train.py
@@ -224,6 +224,7 @@ def compute_loss(batch, model, hidden, args):

     actions = batch['action']
     emasks = batch['episode_mask']
+    omasks = batch['observation_mask']
     clip_rho_threshold, clip_c_threshold = 1.0, 1.0

     log_selected_b_policies = torch.log(torch.clamp(batch['selected_prob'], 1e-16, 1)) * emasks
@@ -239,8 +240,9 @@ def compute_loss(batch, model, hidden, args):
     if 'value' in outputs_nograd:
         values_nograd = outputs_nograd['value']
         if args['turn_based_training'] and values_nograd.size(2) == 2:  # two player zerosum game
-            values_nograd_opponent = -torch.stack([values_nograd[:, :, 1], values_nograd[:, :, 0]], dim=2)
-            values_nograd = (values_nograd + values_nograd_opponent) / (batch['observation_mask'].sum(dim=2, keepdim=True) + 1e-8)
+            values_nograd_opponent = -torch.flip(values_nograd, dims=[2])
+            omasks_opponent = torch.flip(omasks, dims=[2])
+            values_nograd = (values_nograd * omasks + values_nograd_opponent * omasks_opponent) / (omasks + omasks_opponent + 1e-8)
         outputs_nograd['value'] = values_nograd * emasks + batch['outcome'] * (1 - emasks)

     # compute targets and advantage

From 3b375e1c1845701a84216eaf888701053c7d2710 Mon Sep 17 00:00:00 2001
From: YuriCat
Date: Thu, 5 Jan 2023 18:27:15 +0900
Subject: [PATCH 22/26] feature: data gathering code for both solo/multi player training

---
 handyrl/train.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/handyrl/train.py b/handyrl/train.py
index dd0b257c..f4c75ade 100755
--- a/handyrl/train.py
+++ b/handyrl/train.py
@@ -63,15 +63,14 @@ def replace_none(a, b):

     # data that is changed by training configuration
     if args['turn_based_training'] and not args['observation']:
-        obs = [[m['observation'][m['turn'][0]]] for m in moments]
-        prob = np.array([[[m['selected_prob'][m['turn'][0]]]] for m in moments])
-        act = np.array([[m['action'][m['turn'][0]]] for m in moments], dtype=np.int64)[..., np.newaxis]
-        amask = np.array([[m['action_mask'][m['turn'][0]]] for m in moments])
+        players_list = [[m['turn'][0]] for m in moments]
     else:
-        obs = [[replace_none(m['observation'][player], obs_zeros) for player in players] for m in moments]
-        prob = np.array([[[replace_none(m['selected_prob'][player], 1.0)] for player in players] for m in moments])
-        act = np.array([[replace_none(m['action'][player], 0) for player in players] for m in moments], dtype=np.int64)[..., np.newaxis]
-        amask = np.array([[replace_none(m['action_mask'][player], amask_zeros + 1e32) for player in players] for m in moments])
+        players_list = [players for m in moments]
+
+    obs = [[replace_none(m['observation'][player], obs_zeros) for player in players_] for m, players_ in zip(moments, players_list)]
+    prob = np.array([[[replace_none(m['selected_prob'][player], 1.0)] for player in players_] for m, players_ in zip(moments, players_list)])
+    act = np.array([[replace_none(m['action'][player], 0) for player in players_] for m, players_ in zip(moments, players_list)], dtype=np.int64)[..., np.newaxis]
+    amask = np.array([[replace_none(m['action_mask'][player], amask_zeros + 1e32) for player in players_] for m, players_ in zip(moments, players_list)])

     # reshape observation
     obs = rotate(rotate(obs))  # (T, P, ..., ...) -> (P, ..., T, ...) -> (..., T, P, ...)

From 98b595d7f7d7777fc8fad25f77cab07bf27c0fc6 Mon Sep 17 00:00:00 2001
From: YuriCat
Date: Tue, 18 Apr 2023 19:42:59 +0900
Subject: [PATCH 23/26] fix: fill 0 for reward, return, value in make_batch()

---
 handyrl/train.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/handyrl/train.py b/handyrl/train.py
index 5f8c43ae..7dad705f 100755
--- a/handyrl/train.py
+++ b/handyrl/train.py
@@ -77,9 +77,9 @@ def replace_none(a, b):
     obs = bimap_r(obs_zeros, obs, lambda _, o: np.array(o))

     # datum that is not changed by training configuration
-    v = np.array([[replace_none(m['value'][player], [0]) for player in players] for m in moments], dtype=np.float32).reshape(len(moments), len(players), -1)
-    rew = np.array([[replace_none(m['reward'][player], [0]) for player in players] for m in moments], dtype=np.float32).reshape(len(moments), len(players), -1)
-    ret = np.array([[replace_none(m['return'][player], [0]) for player in players] for m in moments], dtype=np.float32).reshape(len(moments), len(players), -1)
+    v = np.array([[replace_none(m['value'][player], 0) for player in players] for m in moments], dtype=np.float32).reshape(len(moments), len(players), -1)
+    rew = np.array([[replace_none(m['reward'][player], 0) for player in players] for m in moments], dtype=np.float32).reshape(len(moments), len(players), -1)
+    ret = np.array([[replace_none(m['return'][player], 0) for player in players] for m in moments], dtype=np.float32).reshape(len(moments), len(players), -1)
     oc = np.array([ep['outcome'][player] for player in players], dtype=np.float32).reshape(1, len(players), -1)

     emask = np.ones((len(moments), 1, 1), dtype=np.float32)  # episode mask

From c5472e2e60315a7d045e5f82f8bcf7af3a1a0f1e Mon Sep 17 00:00:00 2001
From: YuriCat
Date: Tue, 18 Apr 2023 19:44:50 +0900
Subject: [PATCH 24/26] feature: remove installation of kaggle environments from github actions

---
 .github/workflows/action.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/action.yaml b/.github/workflows/action.yaml
index 698e729f..28cbe9ca 100644
--- a/.github/workflows/action.yaml
+++ b/.github/workflows/action.yaml
@@ -24,7 +24,6 @@ jobs:
       run: |
        python -m pip install --upgrade pip
        pip install -r requirements.txt
-       pip install -r handyrl/envs/kaggle/requirements.txt
    - name: pytest
      run: |
        python -m pytest tests

From 32018a564505c011abe597fb43a0668122bb1151 Mon Sep 17 00:00:00 2001
From: YuriCat
Date: Tue, 18 Apr 2023 19:50:14 +0900
Subject: [PATCH 25/26] feature: remove hungry_geese from environment test

---
 tests/test_environment.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_environment.py b/tests/test_environment.py
index 14ff9f64..5f0e9c23 100644
--- a/tests/test_environment.py
+++ b/tests/test_environment.py
@@ -8,7 +8,7 @@
     'tictactoe',
     'geister',
     'parallel_tictactoe',
-    'kaggle.hungry_geese',
+    # 'kaggle.hungry_geese',
 ]

From 2b4beeed758791e53b2e53c3b6811d79ced2f133 Mon Sep 17 00:00:00 2001
From: YuriCat
Date: Fri, 22 Sep 2023 22:08:26 +0900
Subject: [PATCH 26/26] feature: remove Python 3.7 from github CI actions

---
 .github/workflows/action.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/action.yaml b/.github/workflows/action.yaml
index 698e729f..a066a8fc 100644
--- a/.github/workflows/action.yaml
+++ b/.github/workflows/action.yaml
@@ -12,7 +12,7 @@ jobs:
     strategy:
      matrix:
        os: [ubuntu-latest, macos-latest, windows-latest]
-      python-version: ['3.7', '3.8', '3.9', '3.10']
+      python-version: ['3.8', '3.9', '3.10']
    steps:
    - name: Checkout
      uses: actions/checkout@v2
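For reference, patches 01 and 03 make the target lambda per-timestep: lambda stays at its configured value where the model produced a value output and is forced to 1 where it did not, so masked steps are bridged by bootstrapping further ahead instead of using a missing value. A minimal standalone sketch of that recursion follows; it uses plain NumPy rather than the HandyRL API, the function name is illustrative, and it assumes masks marks the steps that actually have a value output.

    import numpy as np

    def masked_lambda_td(values, returns, rewards, lmb, gamma, masks):
        # values, returns, masks: shape (T,); rewards: shape (T,), last entry unused.
        # masks[t] = 1 if a value output exists at step t, else 0.
        # lambda_ = lmb where a value exists, 1 where it does not (patch 03's orientation).
        lambda_ = lmb + (1 - lmb) * (1 - masks)
        target = returns[-1]
        targets = [target]
        for t in range(len(values) - 2, -1, -1):
            lam = lambda_[t + 1]
            # TD(lambda) backup mirroring temporal_difference() in losses.py.
            target = rewards[t] + gamma * ((1 - lam) * values[t + 1] + lam * target)
            targets.append(target)
        return np.array(targets[::-1])

With masks all ones this reduces to the ordinary TD(lambda) recursion in losses.py; with masks all zeros every target becomes the discounted return bootstrapped only from returns[-1], which is the lambda=1 behaviour named in the patch 01 subject.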