Better weight clipping. Do it in the model instead of the optimizer.
Sopel97 committed May 18, 2021
1 parent e34c785 commit 4a966fc
Showing 2 changed files with 48 additions and 14 deletions.
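
The clamp that used to run inside the Ranger optimizer now lives in the model as _clip_weights(), called at the start of every training step. The bounds themselves come from the quantization scheme: a float weight w is stored as round(w * scale) in a fixed-width integer, so the largest float weight that survives quantization is int_max / scale. A minimal sketch of that arithmetic, using the bounds that appear in the diff below; the named scales are assumptions inferred from those bounds, not values taken from the repository:

# Illustration only: the bounds are the ones in the diff; the scale values
# are assumptions deduced from them.
INT8_MAX = 127
INT16_MAX = 2**15 - 1

HIDDEN_SCALE = 64              # assumed scale for the l1/l2 int8 weights
OUTPUT_SCALE = 9600 / 127      # assumed scale for the output int8 weight
FT_SCALE = 127                 # assumed scale for the feature-transformer int16 weights

print(INT8_MAX / HIDDEN_SCALE)   # 1.984375  == 127/64
print(INT8_MAX / OUTPUT_SCALE)   # ~1.6801   == 127*127/9600
print(INT16_MAX / FT_SCALE)      # ~258.0    == (2**15-1)/127, the bound the old optimizer group used for self.input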
52 changes: 46 additions & 6 deletions model.py
@@ -19,7 +19,7 @@ def coalesce_ft_weights(model, layer):
for i_real, is_virtual in enumerate(indices):
weight_coalesced[i_real, :] = sum(weight[i_virtual, :] for i_virtual in is_virtual)
return weight_coalesced

def get_parameters(layers):
return [p for layer in layers for p in layer.parameters()]

@@ -129,6 +129,12 @@ def __init__(self, feature_set, lambda_=1.0):
self.layer_stacks = LayerStacks(self.num_ls_buckets)
self.lambda_ = lambda_

self.weight_clipping = [
{'params' : [self.layer_stacks.l1.weight], 'min_weight' : -127/64, 'max_weight' : 127/64, 'virtual_params' : self.layer_stacks.l1_fact.weight },
{'params' : [self.layer_stacks.l2.weight], 'min_weight' : -127/64, 'max_weight' : 127/64 },
{'params' : [self.layer_stacks.output.weight], 'min_weight' : -127*127/9600, 'max_weight' : 127*127/9600 },
]

self._init_layers()

'''
@@ -168,6 +174,35 @@ def _init_psqt(self):
input_weights[:, L1 + i] = torch.FloatTensor(initial_values) * scale
self.input.weight = nn.Parameter(input_weights)

'''
Clips the weights of the model based on the min/max values allowed
by the quantization scheme.
'''
def _clip_weights(self):
for group in self.weight_clipping:
for p in group['params']:
if 'min_weight' in group or 'max_weight' in group:
p_data_fp32 = p.data
min_weight = group['min_weight']
max_weight = group['max_weight']
if 'virtual_params' in group:
virtual_params = group['virtual_params']
xs = p_data_fp32.shape[0] // virtual_params.shape[0]
ys = p_data_fp32.shape[1] // virtual_params.shape[1]
expanded_virtual_layer = virtual_params.repeat(xs, ys)
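# The factorized ('virtual') weight is shared across the layer-stack buckets,
# and the bound applies to the effective weight, real + virtual, so shift the
# min/max by the expanded virtual weight before clamping the real weight.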
if min_weight is not None:
min_weight_t = p_data_fp32.new_full(p_data_fp32.shape, min_weight) - expanded_virtual_layer
p_data_fp32 = torch.max(p_data_fp32, min_weight_t)
if max_weight is not None:
max_weight_t = p_data_fp32.new_full(p_data_fp32.shape, max_weight) - expanded_virtual_layer
p_data_fp32 = torch.min(p_data_fp32, max_weight_t)
else:
if min_weight is not None and max_weight is not None:
p_data_fp32.clamp_(min_weight, max_weight)
else:
raise Exception('Not supported.')
p.data.copy_(p_data_fp32)

'''
This method attempts to convert the model from using the self.feature_set
to new_feature_set.
@@ -223,6 +258,8 @@ def forward(us, them, white_indices, white_values, black_indices, black_va
return x

def step_(self, batch, batch_idx, loss_type):
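# Clamp the weights to their quantization-compatible ranges before the
# forward pass; this replaces the clamp that previously ran inside
# ranger.Ranger.step() (removed in ranger.py below).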
self._clip_weights()

us, them, white_indices, white_values, black_indices, black_values, outcome, score, psqt_indices, layer_stack_indices = batch

# 600 is the kPonanzaConstant scaling factor needed to convert the training net output to a score.
@@ -264,11 +301,14 @@ def configure_optimizers(self):
# Train with a lower LR on the output layer
LR = 1e-3
train_params = [
{'params' : get_parameters([self.input]), 'lr' : LR, 'min_weight' : -(2**15-1)/127, 'max_weight' : (2**15-1)/127 },
{'params' : get_parameters([self.layer_stacks.l1, self.layer_stacks.l2]), 'lr' : LR, 'min_weight' : -127/64, 'max_weight' : 127/64 },
# factorization kinda breaks the min/max weight clipping, but not sure we can do better
{'params' : get_parameters([self.layer_stacks.l1_fact]), 'lr' : LR, 'min_weight' : -127/64, 'max_weight' : 127/64 },
{'params' : get_parameters([self.layer_stacks.output]), 'lr' : LR / 10, 'min_weight' : -127*127/9600, 'max_weight' : 127*127/9600 },
{'params' : get_parameters([self.input]), 'lr' : LR, 'gc_dim' : 0 },
{'params' : [self.layer_stacks.l1_fact.weight], 'lr' : LR },
{'params' : [self.layer_stacks.l1.weight], 'lr' : LR },
{'params' : [self.layer_stacks.l1.bias], 'lr' : LR },
{'params' : [self.layer_stacks.l2.weight], 'lr' : LR },
{'params' : [self.layer_stacks.l2.bias], 'lr' : LR },
{'params' : [self.layer_stacks.output.weight], 'lr' : LR / 10 },
{'params' : [self.layer_stacks.output.bias], 'lr' : LR / 10 },
]
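# min_weight/max_weight no longer appear in the optimizer param groups;
# clipping is handled by _clip_weights() at the start of step_(). The
# 'gc_dim' : 0 entry presumably selects the dimension used for gradient
# centralization on the input weights (the Ranger defaults below set
# gc_dim=None).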
# increasing the eps leads to less saturated nets with a few dead neurons
optimizer = ranger.Ranger(train_params, betas=(.9, 0.999), eps=1.0e-7)
10 changes: 2 additions & 8 deletions ranger.py
@@ -43,8 +43,7 @@ def __init__(self, params, lr=1e-3, # lr
alpha=0.5, k=6, N_sma_threshhold=5, # Ranger options
betas=(.95, 0.999), eps=1e-5, weight_decay=0, # Adam options
# Gradient centralization on or off, applied to conv layers only or conv + fc layers
use_gc=True, gc_conv_only=False, gc_loc=True,
min_weight=-1000000.0, max_weight=1000000
use_gc=True, gc_conv_only=False, gc_loc=True
):

# parameter checks
@@ -65,7 +64,7 @@ def __init__(self, params, lr=1e-3, # lr
# prep defaults and init torch.optim base
defaults = dict(lr=lr, alpha=alpha, k=k, step_counter=0, betas=betas,
N_sma_threshhold=N_sma_threshhold, eps=eps, weight_decay=weight_decay,
min_weight=min_weight, max_weight=max_weight, gc_dim=None)
gc_dim=None)
super().__init__(params, defaults)

# adjustable threshold
@@ -191,11 +190,6 @@ def step(self, closure=None):

p_data_fp32.add_(G_grad, alpha=-step_size * group['lr'])

# constrain weights
min_weight = group['min_weight']
max_weight = group['max_weight']
p_data_fp32.clamp_(min_weight, max_weight)

p.data.copy_(p_data_fp32)

# integrated look ahead...
