@@ -508,8 +508,9 @@ def update_resources(self, scheduled_batch: ScheduledRequests):
508508 def free_resources (self , request : LlmRequest ):
509509 self .impl .remove_sequence (request .py_request_id , request )
510510
511+ @staticmethod
511512 def calculate_scaling_factor_size_bytes (
512- self , cache_size : int , quant_vector_size : int ,
513+ cache_size : int , quant_vector_size : int ,
513514 scaling_factor_dtype : DataType ) -> int :
514515 assert cache_size % quant_vector_size == 0 , "NVFP4 cache size must be divisible by quant vector size"
515516 return get_size_in_bytes (cache_size // quant_vector_size ,
@@ -733,7 +734,7 @@ def calculate_cache_size_per_token(layers: Set[int]) -> int:
733734 cache_size_bytes_per_token = get_size_in_bytes (
734735 cache_size_per_token , dtype )
735736 if dtype == DataType .NVFP4 :
736- cache_size_bytes_per_token += self .calculate_scaling_factor_size_bytes (
737+ cache_size_bytes_per_token += KVCacheManager .calculate_scaling_factor_size_bytes (
737738 cache_size_per_token ,
738739 quant_vector_size = 16 ,
739740 scaling_factor_dtype = DataType .FP8 )
@@ -766,7 +767,7 @@ def calculate_cache_size_per_token(layers: Set[int]) -> int:
766767 cache_size_bytes_per_token = get_size_in_bytes (
767768 cache_size_per_token , dtype )
768769 if dtype == DataType .NVFP4 :
769- cache_size_bytes_per_token += self .calculate_scaling_factor_size_bytes (
770+ cache_size_bytes_per_token += KVCacheManager .calculate_scaling_factor_size_bytes (
770771 cache_size_per_token ,
771772 quant_vector_size = 16 ,
772773 scaling_factor_dtype = DataType .FP8 )
0 commit comments