
Commit a2d0045 (parent: 2761917)

[subclasses] Use __slots__ for micro optim of flatten/unflatten

ghstack-source-id: 29e856540122dd6d0a8d3a522617234af70a6ca3
Pull Request resolved: #1211

2 files changed (+27, -0)

torchao/dtypes/nf4tensor.py (+11)

@@ -455,6 +455,17 @@ def get_block_absmax(input_tensor: torch.Tensor, block_size: int) -> torch.Tensor:
 class NF4Tensor(torch.Tensor):
     """NF4Tensor class for converting a weight to the QLoRA NF4 format"""
 
+    __slots__ = [
+        "quantized_data",
+        "scaler_mean",
+        "quantization_factor",
+        "quantized_scalers",
+        "nf4",
+        "block_size",
+        "n_blocks",
+        "scaler_block_size",
+    ]
+
     @torch._dynamo.disable
     def __new__(
         cls,
torchao/float8/fsdp_utils.py (+16)

@@ -128,6 +128,9 @@ def precompute_float8_dynamic_scale_for_fsdp(module: nn.Module) -> None:
     # | TP compute with torch.mm(input, weight)
 
 class WeightWithDynamicFloat8CastTensor(torch.Tensor):
+
+    __slots__ = "_tensor", "_precomputed_scale", "_linear_mm_config"
+
     @staticmethod
     def __new__(
         cls,
@@ -258,6 +261,16 @@ def fsdp_post_all_gather(
 
 
 class WeightWithDelayedFloat8CastTensor(torch.Tensor):
+
+    __slots__ = [
+        "_tensor",
+        "_amax_buffer",
+        "_amax_history_buffer",
+        "_scale_buffer",
+        "_linear_mm_config",
+        "is_amax_initialized"
+    ]
+
     @staticmethod
     def __new__(
         cls,
@@ -439,6 +452,9 @@ def fsdp_post_all_gather(
 
 
 class WeightWithStaticFloat8CastTensor(torch.Tensor):
+
+    __slots__ = "_tensor", "_static_scale", "_linear_mm_config"
+
     @staticmethod
     def __new__(
         cls,
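As a rough way to see the micro-optimization the commit message refers to, here is a timeit sketch (an illustration under assumptions, not code from this repository; both classes are hypothetical stand-ins) that reads the three fields of a dynamic-cast-style weight the way a flatten would:

import timeit

class DictWeight:  # attributes resolved through the per-instance __dict__
    def __init__(self):
        self._tensor = None
        self._precomputed_scale = None
        self._linear_mm_config = None

class SlotWeight:  # attributes resolved through fixed slot descriptors
    __slots__ = ("_tensor", "_precomputed_scale", "_linear_mm_config")

    def __init__(self):
        self._tensor = None
        self._precomputed_scale = None
        self._linear_mm_config = None

def read_all(w):
    # Touch every field, as flattening this subclass would.
    return w._tensor, w._precomputed_scale, w._linear_mm_config

d, s = DictWeight(), SlotWeight()
print("dict :", timeit.timeit(lambda: read_all(d), number=1_000_000))
print("slots:", timeit.timeit(lambda: read_all(s), number=1_000_000))

The per-access difference is small, consistent with the commit calling this a micro-optimization; it pays off only where flatten/unflatten is called frequently.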
