From 80b54fea2f0404a2358eac6d3a59828f8a529475 Mon Sep 17 00:00:00 2001
From: YuriCat
Date: Mon, 31 Jan 2022 09:06:31 +0900
Subject: [PATCH 1/7] feature: apply lambda=1 in the timestep that there is no value output

---
 handyrl/losses.py | 27 ++++++++++++++++-----------
 handyrl/train.py  |  7 +++++--
 2 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/handyrl/losses.py b/handyrl/losses.py
index 2e2f0da3..326d6639 100755
--- a/handyrl/losses.py
+++ b/handyrl/losses.py
@@ -17,30 +17,32 @@ def monte_carlo(values, returns):
     return returns, returns - values


-def temporal_difference(values, returns, rewards, lmb, gamma):
+def temporal_difference(values, returns, rewards, lambda_, gamma):
     target_values = deque([returns[:, -1]])
     for i in range(values.size(1) - 2, -1, -1):
         reward = rewards[:, i] if rewards is not None else 0
-        target_values.appendleft(reward + gamma * ((1 - lmb) * values[:, i + 1] + lmb * target_values[0]))
+        lamb = lambda_[:, i + 1]
+        target_values.appendleft(reward + gamma * ((1 - lamb) * values[:, i + 1] + lamb * target_values[0]))

     target_values = torch.stack(tuple(target_values), dim=1)

     return target_values, target_values - values


-def upgo(values, returns, rewards, lmb, gamma):
+def upgo(values, returns, rewards, lambda_, gamma):
     target_values = deque([returns[:, -1]])
     for i in range(values.size(1) - 2, -1, -1):
         value = values[:, i + 1]
         reward = rewards[:, i] if rewards is not None else 0
-        target_values.appendleft(reward + gamma * torch.max(value, (1 - lmb) * value + lmb * target_values[0]))
+        lamb = lambda_[:, i + 1]
+        target_values.appendleft(reward + gamma * torch.max(value, (1 - lamb) * value + lamb * target_values[0]))

     target_values = torch.stack(tuple(target_values), dim=1)

     return target_values, target_values - values


-def vtrace(values, returns, rewards, lmb, gamma, rhos, cs):
+def vtrace(values, returns, rewards, lambda_, gamma, rhos, cs):
     rewards = rewards if rewards is not None else 0
     values_t_plus_1 = torch.cat([values[:, 1:], returns[:, -1:]], dim=1)
     deltas = rhos * (rewards + gamma * values_t_plus_1 - values)
@@ -48,7 +50,7 @@ def vtrace(values, returns, rewards, lmb, gamma, rhos, cs):
     # compute Vtrace value target recursively
     vs_minus_v_xs = deque([deltas[:, -1]])
     for i in range(values.size(1) - 2, -1, -1):
-        vs_minus_v_xs.appendleft(deltas[:, i] + gamma * lmb * cs[:, i] * vs_minus_v_xs[0])
+        vs_minus_v_xs.appendleft(deltas[:, i] + gamma * lambda_[:, i + 1] * cs[:, i] * vs_minus_v_xs[0])

     vs_minus_v_xs = torch.stack(tuple(vs_minus_v_xs), dim=1)
     vs = vs_minus_v_xs + values
@@ -58,18 +60,21 @@ def vtrace(values, returns, rewards, lmb, gamma, rhos, cs):
     return vs, advantages


-def compute_target(algorithm, values, returns, rewards, lmb, gamma, rhos, cs):
+def compute_target(algorithm, values, returns, rewards, lmb, gamma, rhos, cs, masks):
     if values is None:
         # In the absence of a baseline, Monte Carlo returns are used.
         return returns, returns

     if algorithm == 'MC':
         return monte_carlo(values, returns)
-    elif algorithm == 'TD':
-        return temporal_difference(values, returns, rewards, lmb, gamma)
+
+    lambda_ = lmb + (1 - lmb) * masks
+
+    if algorithm == 'TD':
+        return temporal_difference(values, returns, rewards, lambda_, gamma)
     elif algorithm == 'UPGO':
-        return upgo(values, returns, rewards, lmb, gamma)
+        return upgo(values, returns, rewards, lambda_, gamma)
     elif algorithm == 'VTRACE':
-        return vtrace(values, returns, rewards, lmb, gamma, rhos, cs)
+        return vtrace(values, returns, rewards, lambda_, gamma, rhos, cs)
     else:
         print('No algorithm named %s' % algorithm)
diff --git a/handyrl/train.py b/handyrl/train.py
index 031baa35..b3e4e061 100755
--- a/handyrl/train.py
+++ b/handyrl/train.py
@@ -219,6 +219,7 @@ def compute_loss(batch, model, hidden, args):

     actions = batch['action']
     emasks = batch['episode_mask']
+    target_masks = batch['observation_mask']
     clip_rho_threshold, clip_c_threshold = 1.0, 1.0

     log_selected_b_policies = torch.log(torch.clamp(batch['selected_prob'], 1e-16, 1)) * emasks
@@ -236,14 +237,16 @@ def compute_loss(batch, model, hidden, args):
         if args['turn_based_training'] and values_nograd.size(2) == 2:  # two player zerosum game
             values_nograd_opponent = -torch.stack([values_nograd[:, :, 1], values_nograd[:, :, 0]], dim=2)
             values_nograd = (values_nograd + values_nograd_opponent) / (batch['observation_mask'].sum(dim=2, keepdim=True) + 1e-8)
+            target_masks_inv = torch.stack([batch['observation_mask'][:, :, 1], values_nograd[:, :, 0]], dim=2)
+            target_masks = torch.clamp(target_masks + target_masks_inv, 0, 1)
         outputs_nograd['value'] = values_nograd * emasks + batch['outcome'] * (1 - emasks)

     # compute targets and advantage
     targets = {}
     advantages = {}

-    value_args = outputs_nograd.get('value', None), batch['outcome'], None, args['lambda'], 1, clipped_rhos, cs
-    return_args = outputs_nograd.get('return', None), batch['return'], batch['reward'], args['lambda'], args['gamma'], clipped_rhos, cs
+    value_args = outputs_nograd.get('value', None), batch['outcome'], None, args['lambda'], 1, clipped_rhos, cs, target_masks
+    return_args = outputs_nograd.get('return', None), batch['return'], batch['reward'], args['lambda'], args['gamma'], clipped_rhos, cs, target_masks

     targets['value'], advantages['value'] = compute_target(args['value_target'], *value_args)
     targets['return'], advantages['return'] = compute_target(args['value_target'], *return_args)
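Note on PATCH 1/7: compute_target() now also receives a per-timestep mask marking where the model actually produced a value output. From that mask it builds a per-step lambda and passes it to temporal_difference(), upgo() and vtrace(), which read lambda at step i + 1 when deciding how much to bootstrap from values[:, i + 1]. The intent (see PATCH 2/7 below, which fixes the mask direction) is that a step with no value output gets lambda = 1, so the recursion passes the bootstrapped target straight through instead of mixing in a value estimate that does not exist. Below is a minimal self-contained sketch of that recursion, written with the corrected direction from PATCH 2/7; the function name and tensor shapes are illustrative, not part of HandyRL.

import torch

def td_lambda_targets(values, last_return, rewards, lmb, gamma, masks):
    """TD(lambda) targets with a per-timestep lambda.

    values, rewards, masks: [batch, time]; masks[:, t] = 1 iff a value was
    output at step t. Steps without a value output get lambda = 1, so the
    target is carried through them unchanged.
    """
    lambda_ = lmb + (1 - lmb) * (1 - masks)
    targets = [last_return]  # target at the final step
    for i in range(values.size(1) - 2, -1, -1):
        lam = lambda_[:, i + 1]
        targets.insert(0, rewards[:, i] + gamma * ((1 - lam) * values[:, i + 1] + lam * targets[0]))
    return torch.stack(targets, dim=1)

With masks identically 1 this reduces to the fixed-lambda recursion that losses.py used before this patch.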
From bcf7c4bb799be03c204e5263a7c0c14834a07ecf Mon Sep 17 00:00:00 2001
From: YuriCat
Date: Thu, 10 Mar 2022 20:29:32 +0900
Subject: [PATCH 2/7] fix: reversed target mask

---
 handyrl/losses.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/handyrl/losses.py b/handyrl/losses.py
index 326d6639..16e3d69c 100755
--- a/handyrl/losses.py
+++ b/handyrl/losses.py
@@ -68,7 +68,7 @@ def compute_target(algorithm, values, returns, rewards, lmb, gamma, rhos, cs, ma
     if algorithm == 'MC':
         return monte_carlo(values, returns)

-    lambda_ = lmb + (1 - lmb) * masks
+    lambda_ = lmb + (1 - lmb) * (1 - masks)

     if algorithm == 'TD':
         return temporal_difference(values, returns, rewards, lambda_, gamma)

From 5da6f59871853048b943c3067a4640a685f05bcf Mon Sep 17 00:00:00 2001
From: YuriCat
Date: Thu, 10 Mar 2022 20:30:40 +0900
Subject: [PATCH 3/7] fix: value_target_masks and return_target_masks should be different

---
 handyrl/train.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/handyrl/train.py b/handyrl/train.py
index b3e4e061..aecd6528 100755
--- a/handyrl/train.py
+++ b/handyrl/train.py
@@ -219,7 +219,7 @@ def compute_loss(batch, model, hidden, args):

     actions = batch['action']
     emasks = batch['episode_mask']
-    target_masks = batch['observation_mask']
+    value_target_masks, return_target_masks = batch['observation_mask'], batch['observation_mask']
     clip_rho_threshold, clip_c_threshold = 1.0, 1.0

     log_selected_b_policies = torch.log(torch.clamp(batch['selected_prob'], 1e-16, 1)) * emasks
@@ -237,16 +237,16 @@ def compute_loss(batch, model, hidden, args):
         if args['turn_based_training'] and values_nograd.size(2) == 2:  # two player zerosum game
             values_nograd_opponent = -torch.stack([values_nograd[:, :, 1], values_nograd[:, :, 0]], dim=2)
             values_nograd = (values_nograd + values_nograd_opponent) / (batch['observation_mask'].sum(dim=2, keepdim=True) + 1e-8)
-            target_masks_inv = torch.stack([batch['observation_mask'][:, :, 1], values_nograd[:, :, 0]], dim=2)
-            target_masks = torch.clamp(target_masks + target_masks_inv, 0, 1)
+            value_target_masks_inv = torch.stack([value_target_masks[:, :, 1], value_target_masks[:, :, 0]], dim=2)
+            value_target_masks = torch.clamp(value_target_masks + value_target_masks_inv, 0, 1)
         outputs_nograd['value'] = values_nograd * emasks + batch['outcome'] * (1 - emasks)

     # compute targets and advantage
     targets = {}
     advantages = {}

-    value_args = outputs_nograd.get('value', None), batch['outcome'], None, args['lambda'], 1, clipped_rhos, cs, target_masks
-    return_args = outputs_nograd.get('return', None), batch['return'], batch['reward'], args['lambda'], args['gamma'], clipped_rhos, cs, target_masks
+    value_args = outputs_nograd.get('value', None), batch['outcome'], None, args['lambda'], 1, clipped_rhos, cs, value_target_masks
+    return_args = outputs_nograd.get('return', None), batch['return'], batch['reward'], args['lambda'], args['gamma'], clipped_rhos, cs, return_target_masks

     targets['value'], advantages['value'] = compute_target(args['value_target'], *value_args)
     targets['return'], advantages['return'] = compute_target(args['value_target'], *return_args)
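Note on PATCH 2/7 and PATCH 3/7: with observation_mask equal to 1 where a value was output and 0 elsewhere, the original expression lmb + (1 - lmb) * masks did the opposite of what the feature intends; the fix gives lambda = lmb on observed steps and lambda = 1 on unobserved ones. PATCH 3/7 then keeps separate masks for the value head and the return head, and also replaces the stray values_nograd[:, :, 0] argument in the mask stack with the intended mask column. A quick sanity check of the corrected expression, with hypothetical numbers:

import torch

lmb = 0.7
masks = torch.tensor([1., 1., 0., 1.])   # 0 = no value output at that step
lambda_ = lmb + (1 - lmb) * (1 - masks)
print(lambda_)                           # tensor([0.7000, 0.7000, 1.0000, 0.7000])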
From 9909c4138d8ba2101997b670f5f941b2859cee69 Mon Sep 17 00:00:00 2001
From: YuriCat
Date: Thu, 24 Nov 2022 16:37:42 +0900
Subject: [PATCH 4/7] fix: episode selection index error

---
 handyrl/train.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/handyrl/train.py b/handyrl/train.py
index fe8ac0d4..dd0b257c 100755
--- a/handyrl/train.py
+++ b/handyrl/train.py
@@ -289,9 +289,13 @@ def select_episode(self):
             ep_count = min(len(self.episodes), self.args['maximum_episodes'])
             ep_idx = random.randrange(ep_count)
             accept_rate = 1 - (ep_count - 1 - ep_idx) / ep_count
-            if random.random() < accept_rate:
+            if random.random() >= accept_rate:
+                continue
+            try:
+                ep = self.episodes[ep_idx]
                 break
-        ep = self.episodes[ep_idx]
+            except IndexError:
+                continue
         turn_candidates = 1 + max(0, ep['steps'] - self.args['forward_steps'])  # change start turn by sequence length
         train_st = random.randrange(turn_candidates)
         st = max(0, train_st - self.args['burn_in_steps'])

From db9bc20f871afb0652ef4308a7032c067d614d0c Mon Sep 17 00:00:00 2001
From: YuriCat
Date: Tue, 13 Dec 2022 21:03:20 +0900
Subject: [PATCH 5/7] fix: prevent array length error in win_rate_plot.py

---
 scripts/win_rate_plot.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/scripts/win_rate_plot.py b/scripts/win_rate_plot.py
index cf18878e..8f7e4e8e 100644
--- a/scripts/win_rate_plot.py
+++ b/scripts/win_rate_plot.py
@@ -87,8 +87,9 @@ def get_wp_list(path):
 for opponent in opponents:
     wp_list = averaged_wp_lists[opponent]
     start = start_epoch[opponent]
-    # ax.plot(clipped_epoch_list[start:], wp_list[start:], label=opponent)
-    ax.plot(clipped_game_list[start:], wp_list[start:], label=opponent)
+    end = min(min(len(clipped_epoch_list), len(clipped_game_list)), len(wp_list))
+    # ax.plot(clipped_epoch_list[start:end], wp_list[start:end], label=opponent)
+    ax.plot(clipped_game_list[start:end], wp_list[start:end], label=opponent)
     last_win_rate[opponent] = wp_list[-1]

 ax.set_xlabel('Games', size=14)
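Note on PATCH 4/7: select_episode() draws an index uniformly and then accepts it with a probability that grows with the index, which biases sampling toward more recently appended episodes. The old code broke out of the loop and indexed self.episodes afterwards, which can raise an IndexError, presumably because the episode buffer may shrink between choosing the index and using it. The patch rejects with continue and wraps the access in try/except so a vanished slot is simply re-sampled. A standalone sketch of the resulting control flow; the function and argument names here are illustrative, not the Trainer API:

import random

def pick_episode(episodes, maximum_episodes):
    """Recency-biased sampling; retries if the chosen slot has disappeared."""
    while True:
        ep_count = min(len(episodes), maximum_episodes)
        ep_idx = random.randrange(ep_count)
        # larger indices (more recently appended episodes) are accepted more often
        accept_rate = 1 - (ep_count - 1 - ep_idx) / ep_count
        if random.random() >= accept_rate:
            continue
        try:
            return episodes[ep_idx]  # may raise IndexError if the buffer shrank meanwhile
        except IndexError:
            continue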
From d63dfe40b2b41d5d3cfb0e149722adc4ebe5f3e5 Mon Sep 17 00:00:00 2001
From: YuriCat
Date: Thu, 5 Jan 2023 08:48:05 +0900
Subject: [PATCH 6/7] fix: remove map_location=cpu error after starting training

---
 handyrl/train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/handyrl/train.py b/handyrl/train.py
index dd0b257c..93b93a97 100755
--- a/handyrl/train.py
+++ b/handyrl/train.py
@@ -432,7 +432,7 @@ def __init__(self, args, net=None, remote=False):
         self.worker = WorkerServer(args) if remote else WorkerCluster(args)

         # thread connection
-        self.trainer = Trainer(args, self.model)
+        self.trainer = Trainer(args, copy.deepcopy(self.model))

     def model_path(self, model_id):
         return os.path.join('models', str(model_id) + '.pth')

From c64b6b6d78c696d261afe4dd84b8aec77762455e Mon Sep 17 00:00:00 2001
From: YuriCat
Date: Thu, 5 Jan 2023 17:32:20 +0900
Subject: [PATCH 7/7] feature: apply omask for two-player value averaging for solo-play episodes

---
 handyrl/train.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/handyrl/train.py b/handyrl/train.py
index dd0b257c..1e0b9500 100755
--- a/handyrl/train.py
+++ b/handyrl/train.py
@@ -224,6 +224,7 @@ def compute_loss(batch, model, hidden, args):

     actions = batch['action']
     emasks = batch['episode_mask']
+    omasks = batch['observation_mask']
     clip_rho_threshold, clip_c_threshold = 1.0, 1.0

     log_selected_b_policies = torch.log(torch.clamp(batch['selected_prob'], 1e-16, 1)) * emasks
@@ -239,8 +240,9 @@ def compute_loss(batch, model, hidden, args):
     if 'value' in outputs_nograd:
         values_nograd = outputs_nograd['value']
         if args['turn_based_training'] and values_nograd.size(2) == 2:  # two player zerosum game
-            values_nograd_opponent = -torch.stack([values_nograd[:, :, 1], values_nograd[:, :, 0]], dim=2)
-            values_nograd = (values_nograd + values_nograd_opponent) / (batch['observation_mask'].sum(dim=2, keepdim=True) + 1e-8)
+            values_nograd_opponent = -torch.flip(values_nograd, dims=[2])
+            omasks_opponent = torch.flip(omasks, dims=[2])
+            values_nograd = (values_nograd * omasks + values_nograd_opponent * omasks_opponent) / (omasks + omasks_opponent + 1e-8)
         outputs_nograd['value'] = values_nograd * emasks + batch['outcome'] * (1 - emasks)

     # compute targets and advantage
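Note on PATCH 7/7: for two-player zero-sum turn-based training, the two seats' value estimates are reconciled by negating the player-flipped tensor (each seat's value is minus the other's) and averaging. Weighting that average by each seat's observation mask means that in a solo-play episode, where only one seat has observations, the unobserved seat contributes nothing and its slot is simply filled with the zero-sum negation of the observed seat's value. A small self-contained illustration with toy tensors shaped [batch, time, player]; the helper name is mine, not HandyRL's:

import torch

def masked_two_player_average(values, omasks):
    values_opponent = -torch.flip(values, dims=[2])  # zero-sum: seat 1's value is -seat 0's, and vice versa
    omasks_opponent = torch.flip(omasks, dims=[2])
    return (values * omasks + values_opponent * omasks_opponent) / (omasks + omasks_opponent + 1e-8)

v = torch.tensor([[[0.6, -0.2]]])   # raw value outputs for seat 0 and seat 1 at one step
m = torch.tensor([[[1.0, 0.0]]])    # solo-play step: only seat 0 actually observed
print(masked_two_player_average(v, m))  # tensor([[[ 0.6000, -0.6000]]]); seat 1's stale -0.2 is ignored

When both masks are 1 the denominator is 2 and the expression reduces to the previous symmetric average.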