Skip to content

Commit 0b65f38

Browse files
Martin Lindström
authored and committed
Revert "Arm backend: Move rescales from SUB visitor to pass"
This reverts commit f21cf7f.
1 parent f21cf7f commit 0b65f38

File tree

3 files changed

+108
-46
lines changed

3 files changed

+108
-46
lines changed

backends/arm/_passes/insert_rescales_pass.py

Lines changed: 3 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,6 @@ class InsertRescaleInt32Pass(ArmPass):
9393
exir_ops.edge.aten.lt.Tensor,
9494
exir_ops.edge.aten.maximum.default,
9595
exir_ops.edge.aten.minimum.default,
96-
exir_ops.edge.aten.sub.Tensor,
9796
]
9897

9998
def _int32_qargs(self, s):
@@ -134,33 +133,6 @@ def _get_inputs_rescaled_qparams(
134133
qparams = {
135134
i: self._int32_qargs(min_scale) for i in range(len(input_qparams))
136135
}
137-
elif target in [
138-
exir_ops.edge.aten.sub.Tensor,
139-
]:
140-
if input_qparams[0].dtype != input_qparams[1].dtype:
141-
raise ValueError(
142-
"Mismatch in dtype args: {input_qparams[0].dtype} != {input_qparams[1].dtype}"
143-
)
144-
145-
# We are handling two INT8 or two INT16 numbers. For INT8, if the
146-
# zero point is non-null, the result will be in the range [-255;
147-
# 255], therefore we need 9 bits for the result. We have a 32-bit
148-
# accumulator, so we can divide the scale by (1 << 20) which is
149-
# equivalent to shifting the INT8 operands 20 bits to the left
150-
# before rescaling them both to 2 * max(lhs, rhs).
151-
#
152-
# For INT16, similary logic can be applied, but we instead end up
153-
# with a left shift of 12.
154-
lhs_scale, rhs_scale = (
155-
qp.get_scale_per_tensor() for qp in input_qparams.values()
156-
)
157-
max_scale_2x = 2 * max(lhs_scale, rhs_scale)
158-
159-
# Select shift based on input dtype.
160-
shift_bits = 12 if input_qparams[0].dtype == torch.int16 else 20
161-
162-
scale = max_scale_2x / (1 << shift_bits)
163-
qparams = {i: self._int32_qargs(scale) for i in range(len(input_qparams))}
164136
else:
165137
raise ValueError(f"Not a valid target: {target}")
166138

@@ -176,7 +148,6 @@ def _get_output_qparams(
176148
exir_ops.edge.aten.abs.default,
177149
exir_ops.edge.aten.maximum.default,
178150
exir_ops.edge.aten.minimum.default,
179-
exir_ops.edge.aten.sub.Tensor,
180151
]:
181152
# The op has not altered the scale; the output scale is equal to
182153
# the operands' scales.
@@ -216,7 +187,7 @@ def _rescale_inputs(self, graph, node, rescale_qargs: Dict[int, QuantArgs]) -> b
216187
modified = False
217188
for i in qargs:
218189
qp = qargs[i]
219-
if qp.dtype not in (torch.int8, torch.int16):
190+
if qp.dtype != torch.int8:
220191
continue
221192

222193
arg_node = args_copy[i]
@@ -255,7 +226,7 @@ def _rescale_outputs(self, graph, node, rescale_qargs: Optional[QuantArgs]) -> b
255226
assert rescale_qargs is not None
256227

257228
qarg = qargs[0]
258-
if qarg.dtype not in (torch.int8, torch.int16):
229+
if qarg.dtype != torch.int8:
259230
return False
260231

261232
users_copy = list(node.users)
@@ -266,7 +237,7 @@ def _rescale_outputs(self, graph, node, rescale_qargs: Optional[QuantArgs]) -> b
266237
exir_ops.backend.tosa.RESCALE.default,
267238
(
268239
node,
269-
qarg.dtype,
240+
torch.int8,
270241
rescale_qargs.get_scale_per_tensor()
271242
/ qarg.get_scale_per_tensor(), # Old scale / new scale
272243
rescale_qargs.get_zp_per_tensor(), # Old zero point

backends/arm/operators/op_sub.py

Lines changed: 99 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77

88
from typing import Any, List
99

10+
import executorch.backends.arm.tosa.quant_utils as tqutils
11+
import executorch.backends.arm.tosa.utils as tutils
12+
1013
from executorch.backends.arm.operators.node_visitor import (
1114
NodeVisitor,
1215
register_node_visitor,
@@ -16,20 +19,22 @@
1619
validate_same_dtype,
1720
validate_valid_dtype,
1821
)
22+
from executorch.backends.arm.tosa import TosaSpecification
1923
from executorch.backends.arm.tosa.mapping import TosaArg
20-
from executorch.backends.arm.tosa.specification import TosaSpecification
2124
from torch.fx import Node
2225

2326

2427
@register_node_visitor
25-
class SubVisitor(NodeVisitor):
28+
class SubVisitor_INT(NodeVisitor):
2629
target = "aten.sub.Tensor"
2730

2831
tosa_specs = [
2932
TosaSpecification.create_from_string("TOSA-1.0+INT"),
30-
TosaSpecification.create_from_string("TOSA-1.0+FP"),
3133
]
3234

35+
def __init__(self, *args):
36+
super().__init__(*args)
37+
3338
def define_node(
3439
self,
3540
node: Node,
@@ -45,18 +50,105 @@ def define_node(
4550
validate_valid_dtype(
4651
self.target,
4752
[*inputs, output],
48-
[ts.DType.INT32, ts.DType.FP32],
53+
[ts.DType.INT8, ts.DType.INT16, ts.DType.INT32],
4954
output.tosa_spec,
5055
)
5156

57+
scale_back = 1.0
58+
if inputs[0].dtype == ts.DType.INT8:
59+
rescaled_inputs, scale_back = tqutils.insert_rescale_ops_to_int32_maxscale(
60+
tosa_graph, inputs, node, self.tosa_spec
61+
)
62+
elif inputs[0].dtype == ts.DType.INT16:
63+
rescaled_inputs, scale_back = (
64+
tqutils.insert_rescale_ops_int16_to_int32_maxscale(
65+
tosa_graph, inputs, node, self.tosa_spec
66+
)
67+
)
68+
else:
69+
# input[0].dtype == ts.DType.INT32
70+
# Non quantized input, natively support by TOSA.SUB
71+
rescaled_inputs = inputs
72+
73+
if output.dtype in [ts.DType.INT8, ts.DType.INT16]:
74+
broadcasted_shape = tutils.tosa_shape(output.shape, output.dim_order)
75+
sub_output = tosa_graph.addIntermediate(broadcasted_shape, ts.DType.INT32)
76+
else:
77+
# output.dtype == ts.DType.INT32
78+
sub_output = output
79+
80+
# Do the INT32 Sub
5281
self._serialize_operator(
5382
node,
5483
tosa_graph,
5584
ts.TosaOp.Op().SUB,
5685
[
57-
inputs[0].name,
58-
inputs[1].name,
86+
rescaled_inputs[0].name,
87+
rescaled_inputs[1].name,
5988
],
60-
[output.name],
89+
[sub_output.name],
6190
None,
6291
)
92+
93+
if output.dtype == ts.DType.INT8:
94+
# Scale output back to 8 bit
95+
# pyre-ignore
96+
tqutils.insert_rescale_op_to_int8(
97+
tosa_graph,
98+
sub_output,
99+
scale_back,
100+
node,
101+
compute_rescale=False,
102+
tosa_spec=self.tosa_spec,
103+
) # type: ignore[possibly-undefined]
104+
elif output.dtype == ts.DType.INT16:
105+
tqutils.insert_rescale_op_to_int16(
106+
tosa_graph,
107+
sub_output,
108+
scale_back,
109+
node,
110+
compute_rescale=False,
111+
tosa_spec=self.tosa_spec,
112+
) # type: ignore[possibly-undefined]
113+
114+
115+
@register_node_visitor
116+
class SubVisitor_FP(SubVisitor_INT):
117+
# inheriting 'target' from INT class
118+
119+
tosa_specs = [TosaSpecification.create_from_string("TOSA-1.0+FP")]
120+
121+
def __init__(self, *args):
122+
super().__init__(*args)
123+
124+
def define_node(
125+
self,
126+
node: Node,
127+
tosa_graph: Any,
128+
inputs: List[TosaArg],
129+
output: TosaArg,
130+
) -> None:
131+
132+
import serializer.tosa_serializer as ts # type: ignore
133+
134+
validate_num_inputs(self.target, inputs, 2)
135+
validate_same_dtype(self.target, [*inputs, output], ts)
136+
137+
if inputs[0].dtype in [ts.DType.INT8, ts.DType.INT32]:
138+
# Call the inherited define_node for handling integers
139+
super().define_node(node, tosa_graph, inputs, output)
140+
else:
141+
# FP32 Sub lowering
142+
validate_valid_dtype(
143+
self.target, [*inputs, output], ts.DType.FP32, output.tosa_spec
144+
)
145+
146+
# MI lowering
147+
self._serialize_operator(
148+
node,
149+
tosa_graph,
150+
ts.TosaOp.Op().SUB,
151+
[inputs[0].name, inputs[1].name],
152+
[output.name],
153+
None,
154+
)

backends/arm/test/passes/test_insert_rescale_i32_pass.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,10 @@ def __init__(self):
2222
super().__init__()
2323

2424
def forward(self, x, y):
25-
a = x - y
26-
c = torch.maximum(a, y)
27-
d = torch.abs(c)
28-
e = d > c
29-
return e
25+
a = torch.maximum(x, y)
26+
b = torch.abs(a)
27+
c = a > b
28+
return c
3029

3130
def get_inputs(self, dtype) -> input_t:
3231
if dtype == torch.float32:
@@ -46,8 +45,8 @@ def test_insert_rescales():
4645
ops_not_before = {"executorch_exir_dialects_backend__ops_tosa_RESCALE_default"}
4746
ops_after = {
4847
# "number of op nodes with i8 output" + "number of i8 node inputs"
49-
"executorch_exir_dialects_backend__ops_tosa_RESCALE_default": 3
50-
+ 7,
48+
"executorch_exir_dialects_backend__ops_tosa_RESCALE_default": 2
49+
+ 5,
5150
}
5251
pipeline = PassPipeline[input_t](
5352
module,

0 commit comments

Comments
 (0)