
Commit

Added ALBERT v2 quantization with INC example (#1591)
* Add quantization to QA scripts

* fix

* Remove quantize bool field

* Fix electra large accuracy

* Update mkldnn to onednn

* Accuracy fix

* Add sphinx to dev requirements

* remove print

* change quantize_mode to proper one

* fix round_to argument

* Albert example

Co-authored-by: Bartlomiej Gawrych <[email protected]>
Co-authored-by: Bartlomiej Gawrych <[email protected]>
3 people authored Dec 25, 2022
1 parent fecd3e1 commit 14553a0
Showing 6 changed files with 1,103 additions and 14 deletions.
15 changes: 15 additions & 0 deletions scripts/question_answering/albert_custom.yaml
@@ -0,0 +1,15 @@
version: 1.0

model:
name: albert_base_v2
framework: mxnet

tuning:
strategy:
name: mycustom
accuracy_criterion:
relative: 0.02
exit_policy:
timeout: 0
max_trials: 1000
random_seed: 9527
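
The strategy name under tuning.strategy.name ties this file to the MyCustomTuneStrategy class registered in custom_strategy.py below: INC derives the registry key from the class name, lower-cased, with the TuneStrategy suffix dropped, so MyCustomTuneStrategy becomes mycustom. A minimal driver sketch (not part of this commit), assuming the INC 1.x experimental API and hypothetical backbone, calib_dataloader, and eval_func objects from the QA scripts:

# Hypothetical driver (assumption, not in this commit), INC 1.x experimental API
from neural_compressor.experimental import Quantization, common

import custom_strategy  # noqa: F401 -- importing the module registers 'mycustom'

quantizer = Quantization('albert_custom.yaml')
quantizer.model = common.Model(backbone)       # assumed: the MXNet ALBERT backbone
quantizer.calib_dataloader = calib_dataloader  # assumed: calibration data loader
quantizer.eval_func = eval_func                # assumed: returns SQuAD accuracy as a float
quantized_model = quantizer.fit()              # tunes until accuracy_criterion or exit_policy is hit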
176 changes: 176 additions & 0 deletions scripts/question_answering/custom_strategy.py
@@ -0,0 +1,176 @@
import copy
import numpy as np
from collections import OrderedDict
from neural_compressor.strategy.strategy import TuneStrategy, strategy_registry

plot_operator_influence = True

def calc_approx_error(expected_tensor: np.ndarray, observed_tensor: np.ndarray) -> float:
'''
    Calculate the relative error of one observed (quantized) tensor against its expected (FP32) reference
'''
error = observed_tensor - expected_tensor
absolute_error = np.abs(error)
mean_absolute_error = absolute_error.mean()
mean_expected_value = np.abs(expected_tensor).mean()
error = mean_absolute_error / mean_expected_value
return error


def get_approx_errors(expected_tensors, observed_tensors):
'''
    Calculate relative errors for multiple tensors. Both arguments map tensor names to captured tensors: Dict[str, np.ndarray]
'''
errors = {}
for node_name in observed_tensors.keys():
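        # inspect_tensor returns one dict per inspected node, keyed by tensor name;
        # a node's output tensor is stored under the node's own name, hence the
        # double lookup below (assumption based on how the tensors are indexed here)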
expected_tensor = expected_tensors[node_name][node_name]
observed_tensor = observed_tensors[node_name][node_name]
errors[node_name] = calc_approx_error(expected_tensor, observed_tensor)
return errors


@strategy_registry
class MyCustomTuneStrategy(TuneStrategy):
'''INC Custom strategy definition'''
def __init__(self, model, conf, q_dataloader, q_func=None,
eval_dataloader=None, eval_func=None, dicts=None, q_hooks=None):
super().__init__(
model,
conf,
q_dataloader,
q_func,
eval_dataloader,
eval_func,
dicts,
q_hooks)


def get_qtensors(self, quant_cfg, node_list):
'''
        Generate a quantized model from the given configuration and capture its intermediate tensors
'''
qmodel = self.adaptor.quantize(quant_cfg, self.model, self.calib_dataloader)
        tensors = self.adaptor.inspect_tensor(qmodel, self.calib_dataloader, node_list, [1])  # [1] is the batch index
        # We select 'activation' (layer outputs) because INC also stores weight tensors;
        # [0] picks the first captured batch
        return tensors['activation'][0]

    def next_tune_cfg(self):
FALLBACK_DTYPE = 'fp32'

# creating base configuration - all nodes are quantized and calibrated with minmax algorithm
best_cfg = {}
best_cfg['calib_iteration'] = int(self.calib_iter[0]) # number of batches for calibration
best_cfg['calib_sampling_size'] = int(self.calib_sampling_size[0]) # number of samples for calibration (multiplicity of batch)
nodes_cfg = OrderedDict()
nodes_cfg_idx = {}
for node_key, cfgs in self.opwise_tune_cfgs.items():
for i, cfg in enumerate(cfgs):
if cfg['activation']['algorithm'] == 'minmax':
nodes_cfg_idx[node_key] = i
break
nodes_cfg[node_key] = cfg
best_cfg['op'] = nodes_cfg

yield best_cfg

# If fully quantized model does not meet the requirements, we proceed to exclude some nodes

# Collecting tensors from the original model - expected tensors
node_list = [op_name for (op_name, op_type) in best_cfg['op'].keys()]
f32_tensors = self.adaptor.inspect_tensor(self.model, self.calib_dataloader, node_list, [1])
f32_tensors = f32_tensors['activation'][0]

# Collecting tensors from the fully quantized model
q_tensors = self.get_qtensors(best_cfg, node_list)
approx_errors = get_approx_errors(f32_tensors, q_tensors)

        # best_cfg['op'] is an OrderedDict whose element order should match the nodes'
        # order in the computational graph
for node_key, cfg in best_cfg['op'].items():
# Node's key in INC is its name + its operator
node_name, node_op = node_key
# Checking what configuration options are available for this particular node
capabilities = self.opwise_tune_space[node_key]['activation']['dtype']
            # If a particular node can be excluded from quantization ('fp32' in capabilities)
            # and its current error exceeds the threshold, we check what accuracy improvement
            # the exclusion would bring
if FALLBACK_DTYPE in capabilities and approx_errors[node_name] > 0.06:
original_dtype = cfg['activation']['dtype']
cfg['activation']['dtype'] = FALLBACK_DTYPE # Exclude the node from quantization

# Collecting tensors for a new configuration with the current node excluded
q_tensors = self.get_qtensors(best_cfg, node_list)
# Calculating errors for the new configuration
new_approx_errors = get_approx_errors(f32_tensors, q_tensors)
# Calculating error differences for every node in a model
err_diffs = {}
for tensor_node_name in new_approx_errors.keys():
diff = approx_errors[tensor_node_name] - new_approx_errors[tensor_node_name]
err_diffs[tensor_node_name] = diff
err_diffs_arr = np.array(list(err_diffs.values()))

                # If the error reduction summed over the following layers exceeds the
                # threshold value, we keep the node excluded
threshold_sum_error_layers = err_diffs_arr.size * 0.01
if err_diffs_arr.sum() >= threshold_sum_error_layers:
before = approx_errors
after = approx_errors.copy()
after.update(new_approx_errors)
if plot_operator_influence:
import matplotlib.pyplot as plt
plt.figure()
                        plt.plot(list(before.values()), marker='o', markersize=2.5, label='Before')
                        plt.plot(list(after.values()), marker='o', markersize=2.5, label='After')
plt.ylabel('Relative error')
plt.xlabel('Layer')
plt.legend()
plt.savefig(f'{node_name}_error.png')

approx_errors.update(new_approx_errors)
nodes_cfg_idx.pop(node_key) # Mark node as not quantizable
else:
cfg['activation']['dtype'] = original_dtype

yield best_cfg

# Choosing calibration algorithm (kl or minmax) for every node which was not excluded from quantization
for cfg in self.bayesian_configurations(best_cfg, nodes_cfg_idx):
yield cfg

def bayesian_params_to_tune_configs(self, params):
'''
        Build node configurations from Bayesian params by mapping each parameter (a configuration index) back to a real configuration
'''
node_cfgs = {}
for node_key, configs in self.opwise_quant_cfgs.items():
if node_key in params:
value = int(params[node_key])
value = min(value, len(configs) - 1)
node_cfgs[node_key] = copy.deepcopy(configs[value])
return node_cfgs

def bayesian_configurations(self, cfg_base, params_base):
from neural_compressor.strategy.bayesian import BayesianOptimization

        # For each node we specify the possible range of values (treated as an index into its configurations)
pbounds = {}
for node_key, configs in self.opwise_quant_cfgs.items():
if node_key in params_base and len(configs) > 1:
pbounds[node_key] = (0, len(configs))

cfg = copy.deepcopy(cfg_base)
if len(pbounds) == 0: # if there is nothing to be optimized, we finish
cfg['op'].update(self.bayesian_params_to_tune_configs(params_base))
return

bayes_opt = BayesianOptimization(pbounds=pbounds, random_seed=self.cfg.tuning.random_seed)
bayes_opt._space.register(params_base, self.last_tune_result[0]) # registering the outcome of current configuration
while True:
# Generating next configuration
params = bayes_opt.gen_next_params()
cfg['op'].update(self.bayesian_params_to_tune_configs(params))
yield cfg
try:
# Registering the outcome
bayes_opt._space.register(params, self.last_tune_result[0])
except KeyError:
pass
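
The fallback decision above hinges on calc_approx_error: the mean absolute difference between quantized and FP32 activations, normalized by the mean magnitude of the FP32 reference. A quick illustrative check with made-up tensors (not from the commit):

# Illustrative check of the error metric (made-up numbers)
import numpy as np
from custom_strategy import calc_approx_error

expected = np.array([1.0, -2.0, 3.0])   # FP32 reference activations
observed = np.array([1.1, -1.9, 2.7])   # simulated quantized activations
# mean |observed - expected| = (0.1 + 0.1 + 0.3) / 3 ~= 0.167
# mean |expected|            = (1.0 + 2.0 + 3.0) / 3  = 2.0
print(calc_approx_error(expected, observed))  # ~0.083, above the 0.06 fallback threshold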
18 changes: 14 additions & 4 deletions scripts/question_answering/models.py
@@ -180,6 +180,7 @@ def __init__(self, backbone, units=768, layer_norm_eps=1E-12, dropout_prob=0.1,
         self.answerable_scores.add(nn.Dense(2, flatten=False,
                                             weight_initializer=weight_initializer,
                                             bias_initializer=bias_initializer))
+        self.quantized_backbone = None
 
     def get_start_logits(self, contextual_embedding, p_mask):
         """
@@ -287,10 +288,14 @@ def forward(self, tokens, token_types, valid_length, p_mask, start_position):
             Shape (batch_size, sequence_length)
         answerable_logits
         """
+        backbone_net = self.backbone
+        if self.quantized_backbone is not None:
+            backbone_net = self.quantized_backbone
+
         if self.use_segmentation:
-            contextual_embeddings = self.backbone(tokens, token_types, valid_length)
+            contextual_embeddings = backbone_net(tokens, token_types, valid_length)
         else:
-            contextual_embeddings = self.backbone(tokens, valid_length)
+            contextual_embeddings = backbone_net(tokens, valid_length)
         start_logits = self.get_start_logits(contextual_embeddings, p_mask)
         end_logits = self.get_end_logits(contextual_embeddings,
                                          np.expand_dims(start_position, axis=1),
@@ -337,11 +342,16 @@ def inference(self, tokens, token_types, valid_length, p_mask,
             The answerable logits. Here 0 --> answerable and 1 --> not answerable.
             Shape (batch_size, sequence_length, 2)
         """
+        backbone_net = self.backbone
+        if self.quantized_backbone is not None:
+            backbone_net = self.quantized_backbone
+
         # Shape (batch_size, sequence_length, C)
         if self.use_segmentation:
-            contextual_embeddings = self.backbone(tokens, token_types, valid_length)
+            contextual_embeddings = backbone_net(tokens, token_types, valid_length)
         else:
-            contextual_embeddings = self.backbone(tokens, valid_length)
+            contextual_embeddings = backbone_net(tokens, valid_length)
 
         start_logits = self.get_start_logits(contextual_embeddings, p_mask)
         # The shape of start_top_index will be (..., start_top_n)
         start_top_logits, start_top_index = mx.npx.topk(start_logits, k=start_top_n, axis=-1,
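
The models.py diff only adds the quantized_backbone hook; populating it is left to the calling script. A plausible wiring sketch (an assumption, not shown in this diff), reusing the hypothetical quantized_model from the driver sketch above and a QA network qa_net built by these scripts:

# Hypothetical wiring (assumption): route forward()/inference() through the
# INC-quantized backbone returned by quantizer.fit()
qa_net.quantized_backbone = quantized_model.model  # assumed: INC's wrapper exposes the framework model via .model
# From here on, qa_net.forward() and qa_net.inference() pick quantized_backbone
# instead of self.backbone, as per the diff above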
