From 69e31a07929232916d1e6ad78a6fdd81aac18ace Mon Sep 17 00:00:00 2001 From: jainapurva Date: Mon, 12 Aug 2024 14:12:50 -0700 Subject: [PATCH 1/2] Spelling fixes for inpt_tensor to input_tensor --- torchao/dtypes/nf4tensor.py | 88 ++++++++++++++++++------------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/torchao/dtypes/nf4tensor.py b/torchao/dtypes/nf4tensor.py index 56feb18572..90516ea199 100644 --- a/torchao/dtypes/nf4tensor.py +++ b/torchao/dtypes/nf4tensor.py @@ -387,22 +387,22 @@ class SubclassTensorArgs: requires_grad: bool -def get_block_absmax(inpt_tensor: torch.Tensor, block_size: int) -> torch.Tensor: +def get_block_absmax(input_tensor: torch.Tensor, block_size: int) -> torch.Tensor: """Iterate through a flattened tensor getting the absmax scalers for each block Args: - inpt_tensor: Input tensor to get scalers for + input_tensor: Input tensor to get scalers for block_size: Block size for the scanning window Returns: torch.Tensor: Tensor of scalers for each block """ - assert inpt_tensor.dim() == 1, "Input tensor must be flattened" + assert input_tensor.dim() == 1, "Input tensor must be flattened" assert ( - inpt_tensor.numel() % block_size - ) == 0, f"Input tensor must be divisible by block size, got {inpt_tensor.numel()} and {block_size}" + input_tensor.numel() % block_size + ) == 0, f"Input tensor must be divisible by block size, got {input_tensor.numel()} and {block_size}" - n_blocks = inpt_tensor.numel() // block_size - blocks = inpt_tensor.view(n_blocks, block_size) + n_blocks = input_tensor.numel() // block_size + blocks = input_tensor.view(n_blocks, block_size) block_scalers = blocks.abs().max(dim=1).values return block_scalers @@ -478,18 +478,18 @@ def __init__( @torch.no_grad() def from_tensor( cls, - inpt_tensor: torch.Tensor, + input_tensor: torch.Tensor, block_size: int, scaler_block_size: int, ): - assert inpt_tensor.dim() <= 2, f"expect input tensor dim <= 2 but got dim = {inpt_tensor.dim()}" + assert input_tensor.dim() <= 2, f"expect input tensor dim <= 2 but got dim = {input_tensor.dim()}" assert ( - inpt_tensor.numel() % block_size == 0 - ), f"Input tensor must be divisible by block size, got {inpt_tensor.numel()} and {block_size}" - assert inpt_tensor.is_contiguous, "Input tensor must be contiguous!" + input_tensor.numel() % block_size == 0 + ), f"Input tensor must be divisible by block size, got {input_tensor.numel()} and {block_size}" + assert input_tensor.is_contiguous, "Input tensor must be contiguous!" 
# I think I want do this - # assert not inpt_tensor.requires_grad, "Input tensor must not require grad" - device = inpt_tensor.device + # assert not input_tensor.requires_grad, "Input tensor must not require grad" + device = input_tensor.device # Cache the tensor on the class def nf4 = torch.tensor( [ @@ -511,27 +511,27 @@ def from_tensor( 1.0000, ], device=device, - dtype=inpt_tensor.dtype, + dtype=input_tensor.dtype, ) - n_blocks = inpt_tensor.numel() // block_size + n_blocks = input_tensor.numel() // block_size # Double quantization ( quantized_scalers, quantization_factor, scaler_mean, ) = cls.double_quantize_scalers( - inpt_tensor.flatten(), block_size, scaler_block_size + input_tensor.flatten(), block_size, scaler_block_size ) quantized_data = cls.convert_to_norm_float_weight( - inpt_tensor, n_blocks, block_size, nf4 + input_tensor, n_blocks, block_size, nf4 ) tensor_meta = SubclassTensorArgs( - inpt_tensor.size(), - inpt_tensor.stride(), - inpt_tensor.storage_offset(), - inpt_tensor.dtype, - inpt_tensor.device, - inpt_tensor.requires_grad, + input_tensor.size(), + input_tensor.stride(), + input_tensor.storage_offset(), + input_tensor.dtype, + input_tensor.device, + input_tensor.requires_grad, ) return cls( tensor_meta, @@ -547,7 +547,7 @@ def from_tensor( @staticmethod def double_quantize_scalers( - inpt_tensor: torch.Tensor, + input_tensor: torch.Tensor, block_size: int, scaler_block_size: int, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: @@ -557,7 +557,7 @@ def double_quantize_scalers( And then we calculate the absmax quantization factors for each block again. We then quantize the scalers to int8. Args: - inpt_tensor: Input tensor to convert to QLoRA format, typically a weight tensor + input_tensor: Input tensor to convert to QLoRA format, typically a weight tensor Returns: torch.Tensor: Tensor of per_block quantization factors stored in int8 format @@ -565,14 +565,14 @@ def double_quantize_scalers( torch.Tensor: Tensor of per_scaler_block quantization factors stored in int16 format size: (n_scaler_blocks) """ - assert inpt_tensor.dim() == 1, "Input tensor must be flattened" + assert input_tensor.dim() == 1, "Input tensor must be flattened" assert ( - inpt_tensor.numel() % scaler_block_size - ) == 0, f"Input tensor must be divisible by block size, got {inpt_tensor.numel()} and {scaler_block_size}" + input_tensor.numel() % scaler_block_size + ) == 0, f"Input tensor must be divisible by block size, got {input_tensor.numel()} and {scaler_block_size}" # First round of quantization - # Produces: A tensor of size (n_blocks) of inpt_tensor.dtype - scalers_1 = get_block_absmax(inpt_tensor, block_size) + # Produces: A tensor of size (n_blocks) of input_tensor.dtype + scalers_1 = get_block_absmax(input_tensor, block_size) scalers_1_mean = scalers_1.mean() scalers_1 = scalers_1 - scalers_1_mean # Second round of quantization @@ -607,38 +607,38 @@ def double_quantize_scalers( def dequantize_scalers( self, - inpt_tensor: torch.Tensor, + input_tensor: torch.Tensor, quantization_factor: torch.Tensor, scaler_block_size: int, ) -> torch.Tensor: """Used to unpack the double quantized scalers Args; - inpt_tensor: Input tensor to convert to QLoRA format this is the quantized scalers in int8 format + input_tensor: Input tensor to convert to QLoRA format this is the quantized scalers in int8 format quantization_factor: Tensor of per_scaler_block quantization factors stored in inpt_weight.dtype size: (n_scaler_blocks) scaler_block_size: Scaler block size to use for double quantization. 
""" - assert inpt_tensor.dim() == 1, "Input tensor must be flattened" + assert input_tensor.dim() == 1, "Input tensor must be flattened" assert ( - inpt_tensor.numel() % scaler_block_size - ) == 0, f"Input tensor must be divisible by block size, got {inpt_tensor.numel()} and {scaler_block_size}" - n_scaler_blocks = inpt_tensor.numel() // scaler_block_size - inpt_tensor = inpt_tensor.view(n_scaler_blocks, scaler_block_size) - dequantized = (inpt_tensor / quantization_factor.unsqueeze(-1)).flatten().to( + input_tensor.numel() % scaler_block_size + ) == 0, f"Input tensor must be divisible by block size, got {input_tensor.numel()} and {scaler_block_size}" + n_scaler_blocks = input_tensor.numel() // scaler_block_size + input_tensor = input_tensor.view(n_scaler_blocks, scaler_block_size) + dequantized = (input_tensor / quantization_factor.unsqueeze(-1)).flatten().to( self.dtype ) + self.scaler_mean return dequantized @staticmethod def convert_to_norm_float_weight( - inpt_tensor: torch.Tensor, n_blocks: int, block_size: int, nf4: torch.Tensor + input_tensor: torch.Tensor, n_blocks: int, block_size: int, nf4: torch.Tensor ) -> torch.Tensor: """Convert a tensor to the normalized float weight format""" - flattened_tensor = inpt_tensor.flatten() + flattened_tensor = input_tensor.flatten() # Since we are using uint8 we will encode 2 entries per byte - numel = inpt_tensor.numel() + numel = input_tensor.numel() assert ( numel % 2 == 0 ), "Number of elements must be even just to not have to think about the end" @@ -646,13 +646,13 @@ def convert_to_norm_float_weight( blocks = flattened_tensor.view(n_blocks, block_size) # Scale the blocks - scalers = get_block_absmax(inpt_tensor.flatten(), block_size) + scalers = get_block_absmax(input_tensor.flatten(), block_size) scales = scalers.unsqueeze(-1).expand(n_blocks, block_size) scaled_blocks = blocks / scales # Returns a flattened tensor with each element quantized to nf4 index # See Note: Quantize in Chunks - quantized_blocks = torch.empty(numel, dtype=torch.uint8, device=inpt_tensor.device) + quantized_blocks = torch.empty(numel, dtype=torch.uint8, device=input_tensor.device) flattened = scaled_blocks.flatten() for chunk_num in range(math.ceil(numel / CHUNK_SIZE)): start = chunk_num * CHUNK_SIZE From 7bcb83014c904c539ea7e7463438e3c69843e140 Mon Sep 17 00:00:00 2001 From: jainapurva Date: Mon, 12 Aug 2024 14:16:04 -0700 Subject: [PATCH 2/2] inpt_tensor -> input_tensor --- test/dtypes/test_nf4.py | 52 ++++++++++++++++++------------------- torchao/float8/inference.py | 12 ++++----- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/test/dtypes/test_nf4.py b/test/dtypes/test_nf4.py index 1cdf2708a0..c915bdfb11 100644 --- a/test/dtypes/test_nf4.py +++ b/test/dtypes/test_nf4.py @@ -157,10 +157,10 @@ def test_nf4_bnb_linear(self, dtype: torch.dtype): @parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32]) def test_load_from_state_dicts(self, dtype: torch.dtype): """Tests loading to and from different module state dicts""" - inpt_tensor = torch.rand(64, device='cuda', dtype=dtype) - base_mod = self.TestMod(inpt_tensor, 32, 2) + input_tensor = torch.rand(64, device='cuda', dtype=dtype) + base_mod = self.TestMod(input_tensor, 32, 2) - dummy_dict = {"param": inpt_tensor} + dummy_dict = {"param": input_tensor} base_mod.load_state_dict(dummy_dict) assert base_mod.param.block_size == 32 @@ -170,12 +170,12 @@ def test_load_from_state_dicts(self, dtype: torch.dtype): @parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32]) def 
test_load_from_nf4_same_meta(self, dtype: torch.dtype): """Tests loading to and from different module state dicts""" - inpt_tensor = torch.rand(64, device='cuda', dtype=dtype) - base_mod = self.TestMod(inpt_tensor, 32, 2) + input_tensor = torch.rand(64, device='cuda', dtype=dtype) + base_mod = self.TestMod(input_tensor, 32, 2) state_dict = base_mod.state_dict() saved_state_dict = self.save_state_dict_to_buffer(state_dict) - other_mod = self.TestMod(inpt_tensor, 32, 2) + other_mod = self.TestMod(input_tensor, 32, 2) other_mod.load_state_dict(torch.load(saved_state_dict)) assert other_mod.param.block_size == 32 assert other_mod.param.scaler_block_size == 2 @@ -184,50 +184,50 @@ def test_load_from_nf4_same_meta(self, dtype: torch.dtype): @parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32]) def test_load_from_nf4_diff_meta(self, dtype: torch.dtype): """Tests loading to and from different module state dicts""" - inpt_tensor = torch.rand(128, device='cuda', dtype=dtype) - base_mod = self.TestMod(inpt_tensor, 32, 2) + input_tensor = torch.rand(128, device='cuda', dtype=dtype) + base_mod = self.TestMod(input_tensor, 32, 2) state_dict = base_mod.state_dict() saved_state_dict = self.save_state_dict_to_buffer(state_dict) - other_mod = self.TestMod(inpt_tensor, 64, 1) + other_mod = self.TestMod(input_tensor, 64, 1) other_mod.load_state_dict(torch.load(saved_state_dict)) assert other_mod.param.block_size == 64 assert other_mod.param.scaler_block_size == 1 @parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32]) def test_to_copy(self, dtype: torch.dtype): - inpt_tensor = torch.rand(128, device='cpu') - inpt_tensor_nf4 = to_nf4(inpt_tensor, 32, 2) - nf4_to_dtype = inpt_tensor_nf4.to(dtype) - torch.testing.assert_allclose(inpt_tensor, nf4_to_dtype, atol=0.13, rtol=0.13) + input_tensor = torch.rand(128, device='cpu') + input_tensor_nf4 = to_nf4(input_tensor, 32, 2) + nf4_to_dtype = input_tensor_nf4.to(dtype) + torch.testing.assert_allclose(input_tensor, nf4_to_dtype, atol=0.13, rtol=0.13) if torch.cuda.is_available(): - inpt_tensor = torch.rand(128, device='cuda') - inpt_tensor_nf4 = to_nf4(inpt_tensor, 32, 2) - nf4_to_dtype = inpt_tensor_nf4.to(dtype) - torch.testing.assert_allclose(inpt_tensor, nf4_to_dtype, atol=0.13, rtol=0.13) + input_tensor = torch.rand(128, device='cuda') + input_tensor_nf4 = to_nf4(input_tensor, 32, 2) + nf4_to_dtype = input_tensor_nf4.to(dtype) + torch.testing.assert_allclose(input_tensor, nf4_to_dtype, atol=0.13, rtol=0.13) @unittest.skipIf(not torch.cuda.is_available(), "Need cuda for test") def test_to_copy_device(self): - inpt_tensor = torch.rand(128, device='cpu') - t = to_nf4(inpt_tensor, 32, 2) + input_tensor = torch.rand(128, device='cpu') + t = to_nf4(input_tensor, 32, 2) assert t.device == torch.device('cpu') z = t.cuda() assert z.device.type == "cuda" # Because the device could be cuda:0 x = z.cpu() assert x.device == torch.device('cpu') - inpt_tensor = torch.rand(128, device='cuda') - t = to_nf4(inpt_tensor, 32, 2) + input_tensor = torch.rand(128, device='cuda') + t = to_nf4(input_tensor, 32, 2) assert t.device.type == "cuda" @parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32]) def test_to_dtype(self, dtype: torch.dtype): - inpt_tensor = torch.rand(128, dtype=dtype) - inpt_tensor_nf4 = to_nf4(inpt_tensor, 32, 2) - assert type(inpt_tensor_nf4) != torch.Tensor - assert type(inpt_tensor_nf4.to(dtype)) == torch.Tensor - assert inpt_tensor_nf4.to(dtype).dtype == dtype + input_tensor = torch.rand(128, dtype=dtype) + 
input_tensor_nf4 = to_nf4(input_tensor, 32, 2) + assert type(input_tensor_nf4) != torch.Tensor + assert type(input_tensor_nf4.to(dtype)) == torch.Tensor + assert input_tensor_nf4.to(dtype).dtype == dtype @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") @parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32]) diff --git a/torchao/float8/inference.py b/torchao/float8/inference.py index f5c5045030..f441009c48 100644 --- a/torchao/float8/inference.py +++ b/torchao/float8/inference.py @@ -174,7 +174,7 @@ def from_float( def cast_to_float8_e4m3_inference( - inpt_tensor: torch.Tensor, + input_tensor: torch.Tensor, linear_mm_config: LinearMMConfig, reduce_amax: bool = False, static_quantization_scale: Optional[torch.Tensor] = None, @@ -182,7 +182,7 @@ def cast_to_float8_e4m3_inference( """Casts an input tensor to the Float8 (e4m3fn*) Args: - inpt_tensor: The input tensor to be cast. + input_tensor: The input tensor to be cast. linear_mm_config: Configuration settings for the matrix multiplication reduce_amax: Whether to reduce the amax (absolute maximum) among the local distributed group. static_quantization_scale: Optional tensor specifying the scale for activation. Default is None. @@ -193,15 +193,15 @@ def cast_to_float8_e4m3_inference( Note: If the input tensor is already in Float8 format, it is returned as is without re-casting. """ - if tensor_already_casted_to_fp8(inpt_tensor): - return inpt_tensor + if tensor_already_casted_to_fp8(input_tensor): + return input_tensor scale = ( static_quantization_scale if static_quantization_scale is not None - else tensor_to_scale(inpt_tensor, e4m3_dtype, reduce_amax) + else tensor_to_scale(input_tensor, e4m3_dtype, reduce_amax) ) return hp_tensor_and_scale_to_float8( - inpt_tensor, + input_tensor, scale, e4m3_dtype, linear_mm_config,
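
For readers skimming the rename, here is a minimal sketch of the blockwise absmax scaling and double quantization that the get_block_absmax and double_quantize_scalers docstrings above describe. The helper names, the 127-based int8 rounding, and the symmetric clamp are illustrative assumptions, not torchao's exact implementation:

import torch

def block_absmax(input_tensor: torch.Tensor, block_size: int) -> torch.Tensor:
    # One absmax scaler per contiguous block of a flattened tensor.
    assert input_tensor.dim() == 1 and input_tensor.numel() % block_size == 0
    return input_tensor.view(-1, block_size).abs().max(dim=1).values

def double_quantize(weight: torch.Tensor, block_size: int, scaler_block_size: int):
    # First level: per-block absmax scalers, centered on their mean.
    scalers = block_absmax(weight.flatten(), block_size)
    scaler_mean = scalers.mean()
    centered = scalers - scaler_mean
    # Second level: quantize the scalers themselves to int8, with one absmax
    # quantization factor per scaler block.
    quantization_factor = 127.0 / block_absmax(centered, scaler_block_size)
    quantized_scalers = (
        (centered.view(-1, scaler_block_size) * quantization_factor.unsqueeze(-1))
        .round()
        .clamp(-128, 127)
        .to(torch.int8)
        .flatten()
    )
    return quantized_scalers, quantization_factor, scaler_mean

quantized_scalers, quantization_factor, scaler_mean = double_quantize(
    torch.randn(64, 64), block_size=32, scaler_block_size=16
)

Dequantization reverses the second level (divide each scaler block by its quantization factor and add scaler_mean back), which is the role dequantize_scalers plays in the patched file.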
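
The updated tests exercise the public entry point; a round trip along the lines of test_to_copy might look like the following. It assumes torchao is importable, keeps the same loose 0.13 tolerance the test uses, and swaps the deprecated torch.testing.assert_allclose for assert_close (so both sides are cast to a common dtype first):

import torch
from torchao.dtypes.nf4tensor import to_nf4

input_tensor = torch.rand(128)
input_tensor_nf4 = to_nf4(input_tensor, 32, 2)      # block_size=32, scaler_block_size=2
dequantized = input_tensor_nf4.to(torch.bfloat16)   # back to a plain torch.Tensor
torch.testing.assert_close(
    input_tensor.to(torch.bfloat16), dequantized, atol=0.13, rtol=0.13
)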
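
The cast_to_float8_e4m3_inference docstring in the second patch describes amax-based (or static) scaling into e4m3. A self-contained sketch of that idea follows, leaving out LinearMMConfig, the Float8Tensor wrapper, and the distributed amax reduction; the function name, the clamp, and the return convention are assumptions for illustration only:

from typing import Optional

import torch

E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max  # 448.0

def cast_to_e4m3(
    input_tensor: torch.Tensor,
    static_quantization_scale: Optional[torch.Tensor] = None,
):
    # Dynamic scale from the tensor's absolute maximum unless a static scale is given.
    scale = (
        static_quantization_scale
        if static_quantization_scale is not None
        else E4M3_MAX / input_tensor.abs().max()
    )
    fp8_data = (input_tensor * scale).clamp(-E4M3_MAX, E4M3_MAX).to(torch.float8_e4m3fn)
    return fp8_data, scale  # keep the scale around to undo the cast

x = torch.randn(16, 16)
x_fp8, x_scale = cast_to_e4m3(x)
x_roundtrip = x_fp8.to(torch.float32) / x_scale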