Commit ace645a

jerryzh168 authored and pytorchmergebot committed
Add support for prototype affine quantization in pt2e flow (pytorch#141421)
Summary:
Duplicates affine quantization functionality from torchao, including the observer (https://github.com/pytorch/ao/blob/main/torchao/quantization/observer.py) and some quant_primitive ops (https://github.com/pytorch/ao/blob/7c3c51fd0de33307e43a1769883a348861d6f7c9/torchao/quantization/quant_primitives.py#L26-L30), to allow a per-group quantization min/max observer in the pt2e flow.

Next: we can follow up to add a moving-average min/max observer.

Test Plan:
python test/test_quantization.py -k test_channel_group_quantization

Pull Request resolved: pytorch#141421
Approved by: https://github.com/cccclai
1 parent 60a0d53 commit ace645a
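For context, here is a minimal sketch (not part of this diff) of exercising the copied observer standalone. The constructor arguments mirror the with_args(...) call in the new test below; calculate_qparams() is assumed from the usual torch.ao observer API.

import torch
from torch.ao.quantization.observer import MappingType, PerGroup
from torch.ao.quantization.pt2e._affine_quantization import (
    AffineQuantizedMinMaxObserver,
)

# One (scale, zero_point) pair per group of 128 elements along the last
# dimension; symmetric mapping into uint8.
obs = AffineQuantizedMinMaxObserver(
    mapping_type=MappingType.SYMMETRIC,
    target_dtype=torch.uint8,
    granularity=PerGroup(group_size=128),
)
obs(torch.randn(5, 128))                     # record per-group min/max
scale, zero_point = obs.calculate_qparams()  # assumed observer API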

File tree

8 files changed (+1172 -7 lines)

docs/source/quantization-support.rst

+12
@@ -250,6 +250,18 @@ the values observed during calibration (PTQ) or training (QAT).
     default_per_channel_weight_observer
     default_dynamic_quant_observer
     default_float_qparams_observer
+    AffineQuantizedObserverBase
+    Granularity
+    MappingType
+    PerAxis
+    PerBlock
+    PerGroup
+    PerRow
+    PerTensor
+    PerToken
+    TorchAODType
+    ZeroPointDomain
+    get_block_size
 
 torch.ao.quantization.fake_quantize
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
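The names documented above come from torchao's affine quantization prototype. As a rough illustration, assuming get_block_size keeps its torchao semantics, a Granularity resolves to a per-block quantization shape like this:

import torch
from torch.ao.quantization.observer import (
    PerGroup,
    PerTensor,
    PerToken,
    get_block_size,
)

shape = (20, 128)  # e.g. the Linear weight in the new test
get_block_size(shape, PerTensor())               # (20, 128): one block for the whole tensor
get_block_size(shape, PerGroup(group_size=128))  # (1, 128): one group of 128 per row
get_block_size(shape, PerToken())                # (1, 128): one block per token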

mypy.ini

+3
@@ -79,6 +79,9 @@ ignore_missing_imports = True
 [mypy-torch.ao.quantization.experimental.fake_quantize]
 ignore_missing_imports = True
 
+[mypy-torch.ao.quantization.pt2e._affine_quantization]
+ignore_errors = True
+
 #
 # Files with various errors. Mostly real errors, possibly some false
 # positives as well.

test/quantization/pt2e/test_quantize_pt2e.py

+90 -6

@@ -42,7 +42,6 @@
 )
 from torch.export import export_for_training
 from torch.fx import Node
-from torch.testing._internal.common_device_type import instantiate_device_type_tests
 from torch.testing._internal.common_quantization import (
     NodeSpec as ns,
     PT2EQuantizationTestCase,

@@ -1865,6 +1864,10 @@ def _get_bn_train_eval_ops(self):
             torch.ops.aten.batch_norm.default,
         )
 
+    @parametrize(
+        "device",
+        ["cpu"] + (["cuda"] if TEST_CUDA else []) + (["hpu"] if TEST_HPU else []),
+    )
     def test_move_exported_model_bn(self, device):
         """
         Test switching batch_norm behavior between train and eval modes using

@@ -2477,9 +2480,90 @@ def check_nn_module(node):
                 check_nn_module(node)
 
 
-instantiate_parametrized_tests(TestQuantizePT2E)
+@skipIfNoQNNPACK
+class TestQuantizePT2EAffineQuantization(PT2EQuantizationTestCase):
+    def test_channel_group_quantization(self):
+        from torch.ao.quantization.observer import MappingType, PerGroup, PerToken
+        from torch.ao.quantization.pt2e._affine_quantization import (
+            AffineQuantizedMinMaxObserver,
+        )
+
+        class BackendAQuantizer(Quantizer):
+            def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
+                for node in model.graph.nodes:
+                    if (
+                        node.op == "call_function"
+                        and node.target == torch.ops.aten.linear.default
+                    ):
+                        input_act = node.args[0]
+                        assert isinstance(input_act, Node)
+                        weight = node.args[1]
+                        assert isinstance(weight, Node)
+
+                        act_qspec = QuantizationSpec(
+                            dtype=torch.uint8,
+                            quant_min=0,
+                            quant_max=255,
+                            qscheme=None,
+                            is_dynamic=False,
+                            observer_or_fake_quant_ctr=AffineQuantizedMinMaxObserver.with_args(
+                                # TODO: maybe align the arg name here
+                                target_dtype=torch.uint8,
+                                mapping_type=MappingType.SYMMETRIC,
+                                granularity=PerToken(),
+                            ),
+                        )
+
+                        weight_qspec = QuantizationSpec(
+                            dtype=torch.uint8,
+                            quant_min=0,
+                            quant_max=255,
+                            qscheme=None,
+                            is_dynamic=False,
+                            observer_or_fake_quant_ctr=AffineQuantizedMinMaxObserver.with_args(
+                                target_dtype=torch.uint8,
+                                mapping_type=MappingType.SYMMETRIC,
+                                granularity=PerGroup(group_size=128),
+                            ),
+                        )
+                        node.meta["quantization_annotation"] = QuantizationAnnotation(
+                            input_qspec_map={
+                                input_act: act_qspec,
+                                weight: weight_qspec,
+                            },
+                            _annotated=True,
+                        )
+
+            def validate(self, model: torch.fx.GraphModule) -> None:
+                pass
+
+        class M(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = torch.nn.Linear(128, 20)
+
+            def forward(self, x):
+                return self.linear(x)
 
-devices = ["cpu", "cuda"]
-if TEST_HPU:
-    devices.append("hpu")
-instantiate_device_type_tests(TestQuantizePT2E, globals(), only_for=devices)
+        node_occurrence = {
+            torch.ops.quant.quantize_affine: 2,
+            torch.ops.quant.dequantize_affine: 2,
+        }
+        node_list = [
+            torch.ops.quant.quantize_affine,
+            torch.ops.quant.dequantize_affine,
+            torch.ops.quant.quantize_affine,
+            torch.ops.quant.dequantize_affine,
+        ]
+        example_inputs = (torch.randn(5, 128),)
+        self._test_quantizer(
+            M().eval(),
+            example_inputs,
+            BackendAQuantizer(),
+            node_occurrence,
+            node_list,
+            is_debug_mode=True,
+        )
+
+
+instantiate_parametrized_tests(TestQuantizePT2E)
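For reference, the PT2EQuantizationTestCase._test_quantizer helper used above wraps the standard pt2e steps. Roughly, as a sketch assuming the usual export/prepare/convert flow from this file, the new test amounts to:

import torch
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
from torch.export import export_for_training

example_inputs = (torch.randn(5, 128),)
m = export_for_training(M().eval(), example_inputs).module()
m = prepare_pt2e(m, BackendAQuantizer())  # inserts the AffineQuantizedMinMaxObservers
m(*example_inputs)                        # calibration: observers record min/max
m = convert_pt2e(m)                       # lowers to quantize_affine / dequantize_affine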

test/test_quantization.py

+1
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@
8787
from quantization.pt2e.test_metadata_porting import TestMetaDataPorting # noqa: F401
8888
from quantization.pt2e.test_numeric_debugger import TestNumericDebugger # noqa: F401
8989
from quantization.pt2e.test_quantize_pt2e import TestQuantizePT2E # noqa: F401
90+
from quantization.pt2e.test_quantize_pt2e import TestQuantizePT2EAffineQuantization # noqa: F401
9091
from quantization.pt2e.test_representation import TestPT2ERepresentation # noqa: F401
9192
from quantization.pt2e.test_xnnpack_quantizer import TestXNNPACKQuantizer # noqa: F401
9293
from quantization.pt2e.test_xnnpack_quantizer import TestXNNPACKQuantizerModels # noqa: F401

torch/ao/quantization/__init__.py

+14
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,20 @@
168168
"prepare_for_propagation_comparison",
169169
"extract_results_from_loggers",
170170
"compare_results",
171+
# from torchao, should be merged with torchao
172+
# in the future
173+
"AffineQuantizedObserverBase",
174+
"Granularity",
175+
"MappingType",
176+
"PerAxis",
177+
"PerBlock",
178+
"PerGroup",
179+
"PerRow",
180+
"PerTensor",
181+
"PerToken",
182+
"TorchAODType",
183+
"ZeroPointDomain",
184+
"get_block_size",
171185
]
172186

173187
