
Commit 9a1dff2

Merge pull request #7 from anzr299/origin/nncf_compression

[NNCF] WC Support in OVQuantizer

2 parents: 1716834 + 198190e

File tree

2 files changed: +228 −56 lines
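For orientation, here is a minimal usage sketch (not part of the commit) of how the new weights-only compression modes are intended to be driven through the PT2E flow. get_model and the example input are placeholders, and the exact prepare/convert entry points may differ across torch/torchao versions.

    import torch
    from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
    from executorch.backends.openvino.quantizer.quantizer import OpenVINOQuantizer, QuantizationMode

    model = get_model().eval()              # placeholder for a real nn.Module
    example_input = (torch.randn(1, 128),)  # placeholder input

    exported = torch.export.export(model, example_input).module()
    quantizer = OpenVINOQuantizer(mode=QuantizationMode.INT4_SYM_WC)

    prepared = prepare_pt2e(exported, quantizer)  # attaches the weight observers added in this commit
    prepared(*example_input)                      # one forward pass populates min_val/max_val
    compressed = convert_pt2e(prepared)           # observers' convert() swaps in NNCF decompressors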
backends/openvino/quantizer/observers/nncf_observers.py (new file)

Lines changed: 114 additions & 0 deletions

# Copyright (c) Qualcomm Innovation Center, Inc.
# All rights reserved
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import torch
from torch.ao.quantization.observer import (
    MappingType,
    PerAxis,
    PerChannelMinMaxObserver,
    PerGroup,
    get_block_size,
)
from torch.ao.quantization.pt2e._affine_quantization import (
    _get_reduction_params,
    AffineQuantizedMinMaxObserver,
)

from nncf.experimental.torch.fx.node_utils import get_tensor_constant_from_node
from nncf.experimental.torch.fx.transformations import (
    constant_update_fn,
    module_insertion_transformation_builder,
)
from nncf.parameters import CompressWeightsMode
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
from nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_quantization
from nncf.tensor.tensor import Tensor
from nncf.torch.graph.transformations.commands import PTTargetPoint, TargetType
from nncf.torch.quantization.layers import (
    INT4AsymmetricWeightsDecompressor,
    INT4SymmetricWeightsDecompressor,
    INT8AsymmetricWeightsDecompressor,
    INT8SymmetricWeightsDecompressor,
)


class PTPerBlockParamObserver(AffineQuantizedMinMaxObserver):
    """Per-group (blockwise) INT4 weight observer backed by NNCF weight lowering."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        qmode = (
            CompressWeightsMode.INT4_ASYM
            if self.mapping_type == MappingType.ASYMMETRIC
            else CompressWeightsMode.INT4_SYM
        )
        assert isinstance(self.granularity, PerGroup), "Only PerGroup granularity is supported"
        self.wc_config = WeightCompressionConfig(mode=qmode, group_size=self.granularity.group_size)

    def calculate_qparams(self, weight):
        assert hasattr(self, "min_val") and hasattr(self, "max_val"), (
            "Expecting the observer has min_val and max_val, "
            "please run the observer before calling calculate_qparams"
        )
        _, reduction_dims = _get_reduction_params(self.block_size, weight.size())
        assert len(reduction_dims) == 1, "Only 1-D group size is supported"
        # The blocked view adds an extra dimension; shift back to the axis of
        # the original (unblocked) weight tensor expected by NNCF.
        reduction_dims = reduction_dims[0] - 1
        q_weight, scale, zp = do_integer_quantization(
            Tensor(weight), self.wc_config, reduction_axes=reduction_dims
        )
        zp = zp.data if zp is not None else None
        return q_weight.data, scale.data, zp

    def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node):
        assert self.original_dtype is not None, "Expecting original_dtype to be populated"
        weight_node = observer_node.args[0]
        original_weight = get_tensor_constant_from_node(weight_node, model)
        q_weight, scale, zero_point = self.calculate_qparams(original_weight)

        with model.graph.inserting_before(observer_node):
            # Asymmetric quantization produces a zero point; symmetric does not.
            if zero_point is not None:
                decompressor = INT4AsymmetricWeightsDecompressor(
                    scale, zero_point, q_weight.shape, original_weight.shape, original_weight.dtype
                )
            else:
                decompressor = INT4SymmetricWeightsDecompressor(
                    scale, q_weight.shape, original_weight.shape, original_weight.dtype
                )
            # Replace the fp weight constant with the packed INT4 payload and
            # insert the decompressor as a post-hook on the new constant.
            packed_q_weight = decompressor.pack_weight(q_weight)
            new_weight_node = constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0)
            decompressor_name = f"NNCFDecompressor_{new_weight_node.name}"
            module_insertion_transformation_builder(
                decompressor,
                [PTTargetPoint(TargetType.OPERATOR_POST_HOOK, target_node_name=new_weight_node.name)],
                decompressor_name,
            )(model)
        # The observer node is no longer needed: rewire its users to the
        # decompressor output and erase it.
        decomp_node = observer_node.args[0]
        observer_node.replace_all_uses_with(decomp_node)
        model.graph.erase_node(observer_node)


class NNCFInt8observer(PerChannelMinMaxObserver):
    """Per-channel INT8 weight observer backed by NNCF weight lowering."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        qmode = (
            CompressWeightsMode.INT8_SYM
            if self.qscheme == torch.per_channel_symmetric
            else CompressWeightsMode.INT8_ASYM
        )
        self.wc_config = WeightCompressionConfig(mode=qmode)

    def calculate_qparams(self, weight):
        assert hasattr(self, "min_val") and hasattr(self, "max_val"), (
            "Expecting the observer has min_val and max_val, "
            "please run the observer before calling calculate_qparams"
        )
        self.granularity = PerAxis(axis=self.ch_axis)
        self.block_size = get_block_size(weight.shape, self.granularity)
        _, reduction_dims = _get_reduction_params(self.block_size, weight.size())
        q_weight, scale, zp = do_integer_quantization(
            Tensor(weight), self.wc_config, reduction_axes=reduction_dims
        )
        zp = zp.data if zp is not None else None
        return q_weight.data, scale.data, zp

    def convert(self, model: torch.fx.GraphModule, observer_node: torch.fx.Node):
        weight_node = observer_node.args[0]
        original_weight = get_tensor_constant_from_node(weight_node, model)
        q_weight, scale, zero_point = self.calculate_qparams(original_weight)

        with model.graph.inserting_before(observer_node):
            if zero_point is not None:
                decompressor = INT8AsymmetricWeightsDecompressor(scale, zero_point, original_weight.dtype)
            else:
                decompressor = INT8SymmetricWeightsDecompressor(scale, original_weight.dtype)
            packed_q_weight = decompressor.pack_weight(q_weight)
            new_weight_node = constant_update_fn(model, observer_node, packed_q_weight, input_port_id=0)
            decompressor_name = f"NNCFDecompressor_{new_weight_node.name}"
            module_insertion_transformation_builder(
                decompressor,
                [PTTargetPoint(TargetType.OPERATOR_POST_HOOK, target_node_name=new_weight_node.name)],
                decompressor_name,
            )(model)
        decomp_node = observer_node.args[0]
        observer_node.replace_all_uses_with(decomp_node)
        model.graph.erase_node(observer_node)
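With this convert() hook, the converted graph stores packed INT4/INT8 constants and dequantizes them at runtime through the inserted decompressor modules. As a rough illustration (not the NNCF implementation), the per-group symmetric INT4 scheme that do_integer_quantization applies boils down to one scale per group:

    import torch

    # Illustrative only: symmetric INT4 over groups of 2 along the last axis.
    weight = torch.tensor([[0.5, -1.0, 2.0, 4.0],
                           [1.0, 3.0, -2.0, -4.0]])
    groups = weight.reshape(2, 2, 2)                       # (rows, n_groups, group_size)
    scale = groups.abs().amax(dim=-1, keepdim=True) / 7.0  # qmax = 7 for signed 4-bit
    q = torch.clamp(torch.round(groups / scale), -8, 7).to(torch.int8)
    dequant = (q * scale).reshape(2, 4)                    # what the decompressor reproduces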

backends/openvino/quantizer/quantizer.py

Lines changed: 114 additions & 56 deletions
@@ -21,6 +21,8 @@
     HistogramObserver,
     PerChannelMinMaxObserver,
     UniformQuantizationObserverBase,
+    PerGroup,
+    MappingType,
 )
 from torchao.quantization.pt2e.quantizer import (
     EdgeOrNode,
@@ -30,6 +32,9 @@
     Quantizer,
     SharedQuantizationSpec,
 )
+from nncf.quantization.quantize_model import get_weight_compression_configuration
+from nncf.common.quantization.structs import QuantizerConfig, QuantizationScheme
+from executorch.backends.openvino.quantizer.observers.nncf_observers import PTPerBlockParamObserver, NNCFInt8observer

 QUANT_ANNOTATION_KEY = "quantization_annotation"

@@ -46,6 +51,10 @@ class QuantizationMode(Enum):
     INT8_SYM = "int8_sym"
     INT8_MIXED = "int8_mixed"
     INT8_TRANSFORMER = "int8_transformer"
+    INT8_SYM_WC = "int8_sym_wc"
+    INT8_ASYM_WC = "int8_asym_wc"
+    INT4_SYM_WC = "int4_sym"
+    INT4_ASYM_WC = "int4_asym"


 class OpenVINOQuantizer(Quantizer):
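Note that the INT4 enum values are already the bare NNCF mode strings, while the INT8 ones carry a "_wc" suffix; the constructor below normalizes both with replace("_wc", ""). A quick check of that mapping:

    from executorch.backends.openvino.quantizer.quantizer import QuantizationMode

    print(QuantizationMode.INT8_SYM_WC.value.replace("_wc", ""))  # "int8_sym"
    print(QuantizationMode.INT4_SYM_WC.value.replace("_wc", ""))  # "int4_sym" (no suffix to strip)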
@@ -66,8 +75,12 @@ def __init__(
         - INT8_MIXED: INT8 asymmetric quantization for activations, symmetric for weights.
         - INT8_TRANSFORMER: Optimized INT8 quantization for transformer-based models
         Default value is INT8_SYM.
+        - INT4_SYM_WC: Symmetric INT4 weights-only compression
+        - INT4_ASYM_WC: Asymmetric INT4 weights-only compression
         :param kwargs: Arguments to pass to the NNCF MinMaxQuantization algorithm.
         """
+        self.mode = mode
+        self.wc_modes = [QuantizationMode.INT4_ASYM_WC, QuantizationMode.INT4_SYM_WC, QuantizationMode.INT8_ASYM_WC, QuantizationMode.INT8_SYM_WC]
         if mode == QuantizationMode.INT8_SYM:
             preset = quantization.structs.QuantizationPreset.PERFORMANCE
             model_type = None
@@ -77,11 +90,24 @@ def __init__(
         else:
             preset = None
             model_type = nncf.parameters.ModelType.TRANSFORMER
-        self._min_max_algo = (
-            nncf.quantization.algorithms.min_max.algorithm.MinMaxQuantization(
-                preset=preset, model_type=model_type, **kwargs
+        if self.mode not in self.wc_modes:
+            self._min_max_algo = (
+                nncf.quantization.algorithms.min_max.algorithm.MinMaxQuantization(
+                    preset=preset, model_type=model_type, **kwargs
+                )
             )
-        )
+            self._algo = self._min_max_algo
+        else:
+            weight_compression_configuration = get_weight_compression_configuration(
+                mode.value.replace("_wc", ""),  # mode value has to match an NNCF CompressWeightsMode
+                **kwargs
+            )
+            self._weight_compression_algo = nncf.quantization.algorithms.weight_compression.algorithm.WeightCompression(
+                subset_size=None,
+                **weight_compression_configuration
+            )
+            self._algo = self._weight_compression_algo

     def set_ignored_scope(
         self,
@@ -102,7 +128,7 @@ def set_ignored_scope(
         :param validate: If set to True, then a RuntimeError will be raised if any ignored scope does not match
             in the model graph.
         """
-        self._min_max_algo.set_ignored_scope(
+        self._algo.set_ignored_scope(
             nncf.IgnoredScope(
                 names=names or [],
                 patterns=patterns or [],
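Because set_ignored_scope now routes through self._algo, the same call works whether the quantizer was built for min-max quantization or weight compression. A sketch with a hypothetical node name:

    quantizer = OpenVINOQuantizer(mode=QuantizationMode.INT4_SYM_WC)
    quantizer.set_ignored_scope(names=["lm_head"])  # hypothetical node to keep uncompressed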
@@ -115,63 +141,80 @@ def set_ignored_scope(
     def get_nncf_quantization_setup(
         self, model: torch.fx.GraphModule, nncf_graph: NNCFGraph
     ) -> quantization.quantizer_setup.SingleConfigQuantizerSetup:
-        self._min_max_algo._set_backend_entity(model)
-        return self._min_max_algo.find_quantization_setup(model, nncf_graph)
+        self._algo._set_backend_entity(model)
+        return self._algo.find_quantization_setup(model, nncf_graph)

     def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
         nncf_graph = nncf_fx.nncf_graph_builder.GraphConverter.create_nncf_graph(model)
-        quantization_setup = self.get_nncf_quantization_setup(model, nncf_graph)
-
         graph = model.graph
         node_vs_torch_annotation: DefaultDict[torch.fx.Node, QuantizationAnnotation] = (
             defaultdict(QuantizationAnnotation)
         )
+        # Separate into annotation for quantize and compress
+        if self.mode in self.wc_modes:
+            self._algo.set_backend_entity(model)
+            nodes_to_compress = self._algo.get_nodes_to_compress(nncf_graph)
+            for node in nodes_to_compress:
+                quantization_insertion_point = quantization.quantizer_setup.WeightQuantizationInsertionPoint(target_node_name=node.node_name)
+                group_size = self._algo._group_size
+                num_bits = 4 if self.mode in [QuantizationMode.INT4_SYM_WC, QuantizationMode.INT4_ASYM_WC] else 8
+                qmode = QuantizationScheme.SYMMETRIC if self.mode in [QuantizationMode.INT4_SYM_WC, QuantizationMode.INT8_SYM_WC] else QuantizationScheme.ASYMMETRIC
+                nncf_qconfig = QuantizerConfig(num_bits=num_bits, mode=qmode)
+                qp = quantization.quantizer_setup.SingleConfigQuantizationPoint(qip=quantization_insertion_point, qconfig=nncf_qconfig, directly_quantized_operator_node_names=[node])
+                edge_or_node, annotation = self._get_edge_or_node_and_annotation(
+                    graph, nncf_graph, qp, node_vs_torch_annotation
+                )
+                qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(qp, group_size=group_size, weights_only=True)
+                self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
+        else:
+            quantization_setup = self.get_nncf_quantization_setup(model, nncf_graph)

-        for qp in quantization_setup.quantization_points.values():
-            edge_or_node, annotation = self._get_edge_or_node_and_annotation(
-                graph, nncf_graph, qp, node_vs_torch_annotation
-            )
-            qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_qp(qp)
-            self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
+            for qp in quantization_setup.quantization_points.values():
+                edge_or_node, annotation = self._get_edge_or_node_and_annotation(
+                    graph, nncf_graph, qp, node_vs_torch_annotation
+                )
+                qspec: QuantizationSpecBase = self._get_torch_ao_qspec_from_nncf_config(qp)
+                self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)

-        for quantizer_ids in quantization_setup.unified_scale_groups.values():
+            for quantizer_ids in quantization_setup.unified_scale_groups.values():

-            root_quantizer_id = self._get_unified_scales_root_quantizer_id(
-                nncf_graph, quantizer_ids, quantization_setup
-            )
-            root_qp = quantization_setup.quantization_points[root_quantizer_id]
+                root_quantizer_id = self._get_unified_scales_root_quantizer_id(
+                    nncf_graph, quantizer_ids, quantization_setup
+                )
+                root_qp = quantization_setup.quantization_points[root_quantizer_id]

-            if any(
-                root_qp.qconfig != quantization_setup.quantization_points[q_id].qconfig
-                for q_id in quantizer_ids
-            ):
-                qps = [
-                    quantization_setup.quantization_points[q_id]
-                    for q_id in quantizer_ids
-                ]
-                msg = (
-                    "Different quantization configs are set to one unified scale group:"
-                    f"{[(qp.insertion_point.__dict__, str(qp.qconfig)) for qp in qps]}"
-                )
-                raise nncf.InternalError(msg)
-
-            root_target_node = nncf_fx.node_utils.get_graph_node_by_name(
-                graph, root_qp.insertion_point.target_node_name
-            )
-            root_edge_or_node = self._get_edge_or_node(
-                root_target_node, root_qp, nncf_graph
-            )
+                if any(
+                    root_qp.qconfig != quantization_setup.quantization_points[q_id].qconfig
+                    for q_id in quantizer_ids
+                ):
+                    qps = [
+                        quantization_setup.quantization_points[q_id]
+                        for q_id in quantizer_ids
+                    ]
+                    msg = (
+                        "Different quantization configs are set to one unified scale group:"
+                        f"{[(qp.insertion_point.__dict__, str(qp.qconfig)) for qp in qps]}"
+                    )
+                    raise nncf.InternalError(msg)
+
+                root_target_node = nncf_fx.node_utils.get_graph_node_by_name(
+                    graph, root_qp.insertion_point.target_node_name
+                )
+                root_edge_or_node = self._get_edge_or_node(
+                    root_target_node, root_qp, nncf_graph
+                )

-            for quantizer_id in quantizer_ids:
-                if quantizer_id == root_quantizer_id:
-                    continue
+                for quantizer_id in quantizer_ids:
+                    if quantizer_id == root_quantizer_id:
+                        continue

-                qspec = SharedQuantizationSpec(root_edge_or_node)
-                qp = quantization_setup.quantization_points[quantizer_id]
-                edge_or_node, annotation = self._get_edge_or_node_and_annotation(
-                    graph, nncf_graph, qp, node_vs_torch_annotation
-                )
-                self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)
+                    qspec = SharedQuantizationSpec(root_edge_or_node)
+                    qp = quantization_setup.quantization_points[quantizer_id]
+                    edge_or_node, annotation = self._get_edge_or_node_and_annotation(
+                        graph, nncf_graph, qp, node_vs_torch_annotation
+                    )
+                    self._fill_torch_ao_annotation(edge_or_node, qspec, annotation)

         for node, annotation in node_vs_torch_annotation.items():
             assert QUANT_ANNOTATION_KEY not in node.meta
@@ -295,8 +338,8 @@ def _fill_torch_ao_annotation(
             annotation_to_update.input_qspec_map[edge_or_node[0]] = qspec

     @staticmethod
-    def _get_torch_ao_qspec_from_qp(
-        qp: quantization.quantizer_setup.QuantizationPointBase,
+    def _get_torch_ao_qspec_from_nncf_config(
+        qp: quantization.quantizer_setup.QuantizationPointBase, group_size=-1, weights_only=False
     ) -> QuantizationSpec:
         """
         Retrieves the quantization configuration from the given quantization point and
@@ -307,11 +350,10 @@ def _get_torch_ao_qspec_from_qp(
         """
         # Eps value is copied from nncf/torch/quantization/layers.py
        extra_args = {"eps": 1e-16}
-        qconfig = qp.qconfig
         is_weight = qp.is_weight_quantization_point()
+        qconfig = qp.qconfig

         observer: Type[UniformQuantizationObserverBase]
-
         if qconfig.per_channel:
             torch_qscheme = (
                 torch.per_channel_symmetric
@@ -325,11 +367,27 @@
                 else torch.per_tensor_affine
             )
         if is_weight:
-            observer = PerChannelMinMaxObserver
-            quant_min = -128
-            quant_max = 127
-            dtype = torch.int8
-            channel_axis = 0
+            mapping_type = MappingType.SYMMETRIC if qconfig.mode == QuantizationScheme.SYMMETRIC else MappingType.ASYMMETRIC
+            if qconfig.num_bits == 4:
+                extra_args["mapping_type"] = mapping_type
+                extra_args["target_dtype"] = torch.int8
+                extra_args["granularity"] = PerGroup(group_size=group_size)
+                observer = PTPerBlockParamObserver
+                quant_min = -8
+                quant_max = 7
+                dtype = torch.int8
+                channel_axis = 0
+            elif qconfig.num_bits == 8:
+                observer = NNCFInt8observer if weights_only else PerChannelMinMaxObserver
+                quant_min = -128
+                quant_max = 127
+                dtype = torch.int8
+                channel_axis = 0
+                torch_qscheme = (
+                    torch.per_channel_symmetric
+                    if qconfig.mode is quantization.structs.QuantizationScheme.SYMMETRIC
+                    else torch.per_channel_affine
+                )
         else:
             observer = (
                 HistogramObserver
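Putting the weight branch together: for an INT4 weight, the method ends up building a spec along these lines. This is a reconstruction from the hunk above, not code from the commit; group_size=128 is an arbitrary example value, and with_args is assumed to be inherited from the torch.ao observer base class.

    import torch
    from torch.ao.quantization.observer import MappingType, PerGroup
    from torchao.quantization.pt2e.quantizer import QuantizationSpec
    from executorch.backends.openvino.quantizer.observers.nncf_observers import PTPerBlockParamObserver

    qspec = QuantizationSpec(
        dtype=torch.int8,  # INT4 codes are carried in int8 storage
        quant_min=-8,
        quant_max=7,
        qscheme=torch.per_channel_symmetric,
        ch_axis=0,
        is_dynamic=False,
        observer_or_fake_quant_ctr=PTPerBlockParamObserver.with_args(
            eps=1e-16,
            mapping_type=MappingType.SYMMETRIC,
            target_dtype=torch.int8,
            granularity=PerGroup(group_size=128),  # example group size
        ),
    )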
