diff --git a/csrc/attention/attention_kernels.cuh b/csrc/attention/attention_kernels.cuh index 563e1438f0b0..23d3ee4ea4b5 100644 --- a/csrc/attention/attention_kernels.cuh +++ b/csrc/attention/attention_kernels.cuh @@ -24,6 +24,10 @@ #include "attention_dtypes.h" #include "attention_utils.cuh" +#include +#include +#include "dtype_fp8.cuh" +#include "../quantization/int8_kvcache/quant_utils.cuh" #ifdef USE_ROCM #include @@ -105,7 +109,10 @@ __device__ void paged_attention_kernel( const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] const int q_stride, const int kv_block_stride, const int kv_head_stride, - const float k_scale, const float v_scale, const int tp_rank, + const int quant_group, + const float* __restrict__ k_scales, + const float* __restrict__ v_scales, + const int tp_rank, const int blocksparse_local_blocks, const int blocksparse_vert_stride, const int blocksparse_block_size, const int blocksparse_head_sliding_step) { const int seq_idx = blockIdx.y; @@ -151,6 +158,16 @@ __device__ void paged_attention_kernel( const int num_heads = gridDim.x; const int num_queries_per_kv = num_heads / num_kv_heads; const int kv_head_idx = head_idx / num_queries_per_kv; + float k_scale = 0; + float v_scale = 0; + if constexpr (KV_DTYPE == Fp8KVCacheDataType::kInt8Group128) { + int64_t tgt_kvs_idx = floor((kv_head_idx*HEAD_SIZE)/quant_group); + k_scale = *reinterpret_cast(k_scales+tgt_kvs_idx); + v_scale = *reinterpret_cast(v_scales+tgt_kvs_idx); + } else { + k_scale = *reinterpret_cast(k_scales); + v_scale = *reinterpret_cast(v_scales); + } const float alibi_slope = alibi_slopes == nullptr ? 
0.f : alibi_slopes[head_idx]; @@ -280,6 +297,17 @@ __device__ void paged_attention_kernel( if constexpr (KV_DTYPE == Fp8KVCacheDataType::kAuto) { k_vecs[j] = *reinterpret_cast( k_ptr + offset1 * BLOCK_SIZE * x + offset2); + // int8 kv-cache + } else if constexpr (KV_DTYPE == Fp8KVCacheDataType::kInt8Group0) { + Quant_vec k_vec_quant = *reinterpret_cast( + k_ptr + offset1 * BLOCK_SIZE * x + offset2); + k_vecs[j] = int8::scaled_vec_conversion_int8( + k_vec_quant, k_scale, 0); + } else if constexpr (KV_DTYPE == Fp8KVCacheDataType::kInt8Group128) { + Quant_vec k_vec_quant = *reinterpret_cast( + k_ptr + offset1 * BLOCK_SIZE * x + offset2); + k_vecs[j] = int8::scaled_vec_conversion_int8( + k_vec_quant, k_scale, 0); } else { // Vector conversion from Quant_vec to K_vec. Quant_vec k_vec_quant = *reinterpret_cast( @@ -410,6 +438,21 @@ __device__ void paged_attention_kernel( if constexpr (KV_DTYPE == Fp8KVCacheDataType::kAuto) { v_vec = *reinterpret_cast(v_ptr + offset); + // int8 kv-cache + } else if constexpr (KV_DTYPE == Fp8KVCacheDataType::kInt8Group0) { + V_quant_vec v_quant_vec = + *reinterpret_cast(v_ptr + offset); + // Vector conversion from V_quant_vec to V_vec. + v_vec = int8::scaled_vec_conversion_int8(v_quant_vec, + v_scale, + 0); + } else if constexpr (KV_DTYPE == Fp8KVCacheDataType::kInt8Group128) { + V_quant_vec v_quant_vec = + *reinterpret_cast(v_ptr + offset); + // Vector conversion from V_quant_vec to V_vec. 
+ v_vec = int8::scaled_vec_conversion_int8(v_quant_vec, + v_scale, + 0); } else { V_quant_vec v_quant_vec = *reinterpret_cast(v_ptr + offset); @@ -513,7 +556,10 @@ __global__ void paged_attention_v1_kernel( const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] const int q_stride, const int kv_block_stride, const int kv_head_stride, - const float k_scale, const float v_scale, const int tp_rank, + const int quant_group, + const float* __restrict__ k_scales, + const float* __restrict__ v_scales, + const int tp_rank, const int blocksparse_local_blocks, const int blocksparse_vert_stride, const int blocksparse_block_size, const int blocksparse_head_sliding_step) { paged_attention_kernel( exp_sums, max_logits, tmp_out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, seq_lens, max_num_blocks_per_seq, alibi_slopes, q_stride, - kv_block_stride, kv_head_stride, k_scale, v_scale, tp_rank, + kv_block_stride, kv_head_stride, quant_group, k_scales, v_scales, tp_rank, blocksparse_local_blocks, blocksparse_vert_stride, blocksparse_block_size, blocksparse_head_sliding_step); } diff --git a/csrc/attention/dtype_float16.cuh b/csrc/attention/dtype_float16.cuh index 3a1815f0ed4f..e634c573e65c 100644 --- a/csrc/attention/dtype_float16.cuh +++ b/csrc/attention/dtype_float16.cuh @@ -66,6 +66,10 @@ template <> struct FloatVec { using Type = Float8_; }; +template<> +struct FloatVec { + using Type = float; +}; // Utility functions for type conversions. 
inline __device__ uint32_t h0_h0(uint16_t a) { diff --git a/csrc/attention/dtype_fp8.cuh b/csrc/attention/dtype_fp8.cuh index e714e321b0be..b0d2713afde1 100644 --- a/csrc/attention/dtype_fp8.cuh +++ b/csrc/attention/dtype_fp8.cuh @@ -15,6 +15,10 @@ enum class Fp8KVCacheDataType { kAuto = 0, kFp8E4M3 = 1, kFp8E5M2 = 2, + // Layerwise int8 kv cache + kInt8Group0 = 3, + // Groupwise int8 kv cache + kInt8Group128 = 4, }; // fp8 vector types for quantization of kv cache diff --git a/csrc/attention/paged_attention_v1.cu b/csrc/attention/paged_attention_v1.cu index 27321148f6dd..03a894ff4a77 100644 --- a/csrc/attention/paged_attention_v1.cu +++ b/csrc/attention/paged_attention_v1.cu @@ -41,7 +41,8 @@ out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, \ scale, block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \ alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \ - k_scale, v_scale, tp_rank, blocksparse_local_blocks, \ + quant_group, k_scales_ptr, v_scales_ptr, \ + tp_rank, blocksparse_local_blocks, \ blocksparse_vert_stride, blocksparse_block_size, \ blocksparse_head_sliding_step); @@ -53,8 +54,11 @@ void paged_attention_v1_launcher( torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int num_kv_heads, float scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, - const std::optional& alibi_slopes, float k_scale, - float v_scale, const int tp_rank, const int blocksparse_local_blocks, + const c10::optional& alibi_slopes, + int quant_group, + torch::Tensor& k_scales, + torch::Tensor& v_scales, + const int tp_rank, const int blocksparse_local_blocks, const int blocksparse_vert_stride, const int blocksparse_block_size, const int blocksparse_head_sliding_step) { int num_seqs = query.size(0); @@ -78,6 +82,8 @@ void paged_attention_v1_launcher( T* query_ptr = reinterpret_cast(query.data_ptr()); CACHE_T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); CACHE_T* value_cache_ptr 
= reinterpret_cast(value_cache.data_ptr()); + float* k_scales_ptr = reinterpret_cast(k_scales.data_ptr()); + float* v_scales_ptr = reinterpret_cast(v_scales.data_ptr()); int* block_tables_ptr = block_tables.data_ptr(); int* seq_lens_ptr = seq_lens.data_ptr(); @@ -135,10 +141,12 @@ void paged_attention_v1_launcher( paged_attention_v1_launcher( \ out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \ - seq_lens, max_seq_len, alibi_slopes, k_scale, v_scale, tp_rank, \ + seq_lens, max_seq_len, alibi_slopes, quant_group, k_scales, v_scales, \ + tp_rank, \ blocksparse_local_blocks, blocksparse_vert_stride, \ blocksparse_block_size, blocksparse_head_sliding_step); + #define CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ if (is_block_sparse) { \ CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \ @@ -176,8 +184,11 @@ void paged_attention_v1( torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] torch::Tensor& seq_lens, // [num_seqs] int64_t block_size, int64_t max_seq_len, - const std::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, double v_scale, + const c10::optional& alibi_slopes, + const std::string& kv_cache_dtype, + const int64_t quant_group, + torch::Tensor& k_scales, + torch::Tensor& v_scales, const int64_t tp_rank, const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, const int64_t blocksparse_head_sliding_step) { @@ -190,4 +201,4 @@ void paged_attention_v1( #undef WARP_SIZE #undef MAX #undef MIN -#undef DIVIDE_ROUND_UP \ No newline at end of file +#undef DIVIDE_ROUND_UP diff --git a/csrc/attention/paged_attention_v2.cu b/csrc/attention/paged_attention_v2.cu index a453b2243e48..d18ec4243706 100644 --- a/csrc/attention/paged_attention_v2.cu +++ b/csrc/attention/paged_attention_v2.cu @@ -37,7 +37,8 @@ exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, key_cache_ptr, \ value_cache_ptr, num_kv_heads, scale, 
block_tables_ptr, \ seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \ - kv_block_stride, kv_head_stride, k_scale, v_scale, tp_rank, \ + kv_block_stride, kv_head_stride, \ + quant_group, k_scales_ptr, v_scales_ptr, tp_rank, \ blocksparse_local_blocks, blocksparse_vert_stride, \ blocksparse_block_size, blocksparse_head_sliding_step); \ vllm::paged_attention_v2_reduce_kernel& alibi_slopes, float k_scale, - float v_scale, const int tp_rank, const int blocksparse_local_blocks, + const c10::optional& alibi_slopes, + int quant_group, + torch::Tensor& k_scales, + torch::Tensor& v_scales, + const int tp_rank, const int blocksparse_local_blocks, const int blocksparse_vert_stride, const int blocksparse_block_size, const int blocksparse_head_sliding_step) { int num_seqs = query.size(0); @@ -82,6 +86,8 @@ void paged_attention_v2_launcher( T* query_ptr = reinterpret_cast(query.data_ptr()); CACHE_T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); CACHE_T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); + float* k_scales_ptr = reinterpret_cast(k_scales.data_ptr()); + float* v_scales_ptr = reinterpret_cast(v_scales.data_ptr()); int* block_tables_ptr = block_tables.data_ptr(); int* seq_lens_ptr = seq_lens.data_ptr(); @@ -142,7 +148,8 @@ void paged_attention_v2_launcher( IS_BLOCK_SPARSE>( \ out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ num_kv_heads, scale, block_tables, seq_lens, max_seq_len, alibi_slopes, \ - k_scale, v_scale, tp_rank, blocksparse_local_blocks, \ + quant_group, k_scales, v_scales, \ + tp_rank, blocksparse_local_blocks, \ blocksparse_vert_stride, blocksparse_block_size, \ blocksparse_head_sliding_step); @@ -187,12 +194,16 @@ void paged_attention_v2( torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] torch::Tensor& seq_lens, // [num_seqs] int64_t block_size, int64_t max_seq_len, - const std::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, double v_scale, + const 
c10::optional& alibi_slopes, + const std::string& kv_cache_dtype, + const int64_t quant_group, + torch::Tensor& k_scales, + torch::Tensor& v_scales, const int64_t tp_rank, const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, const int64_t blocksparse_head_sliding_step) { const bool is_block_sparse = (blocksparse_vert_stride > 1); + DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype, CALL_V2_LAUNCHER_BLOCK_SIZE) } @@ -200,4 +211,4 @@ void paged_attention_v2( #undef WARP_SIZE #undef MAX #undef MIN -#undef DIVIDE_ROUND_UP \ No newline at end of file +#undef DIVIDE_ROUND_UP diff --git a/csrc/cache.h b/csrc/cache.h index 11c4c5001daa..ac36d7f34ff9 100644 --- a/csrc/cache.h +++ b/csrc/cache.h @@ -18,16 +18,20 @@ void copy_blocks(std::vector const& key_caches, void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, torch::Tensor& key_cache, torch::Tensor& value_cache, torch::Tensor& slot_mapping, - const std::string& kv_cache_dtype, const double k_scale, - const double v_scale); + const std::string& kv_cache_dtype, + const int64_t quant_group, + torch::Tensor& k_scales, + torch::Tensor& v_scales); void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value, torch::Tensor& key_cache, torch::Tensor& value_cache, torch::Tensor& slot_mapping, const std::string& kv_cache_dtype, - const double k_scale, const double v_scale); - + const int64_t quant_group, + torch::Tensor& k_scales, + torch::Tensor& v_scales); + // Just for unittest void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache, const double scale, const std::string& kv_cache_dtype); diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 8a95279f9a25..9afb693a9643 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -10,6 +10,7 @@ #else #include "quantization/fp8/nvidia/quant_utils.cuh" #endif +#include "quantization/int8_kvcache/quant_utils.cuh" #include #include @@ -159,20 +160,31 @@ __global__ 
void reshape_and_cache_kernel( // block_size] const int64_t* __restrict__ slot_mapping, // [num_tokens] const int key_stride, const int value_stride, const int num_heads, - const int head_size, const int block_size, const int x, const float k_scale, - const float v_scale) { + const int head_size, const int block_size, const int x, + const int quant_group, + const float* __restrict__ k_scales, + const float* __restrict__ v_scales) { const int64_t token_idx = blockIdx.x; const int64_t slot_idx = slot_mapping[token_idx]; if (slot_idx < 0) { // Padding token that should be ignored. return; } - const int64_t block_idx = slot_idx / block_size; const int64_t block_offset = slot_idx % block_size; const int n = num_heads * head_size; for (int i = threadIdx.x; i < n; i += blockDim.x) { + float k_scale = 0; + float v_scale = 0; + if constexpr (kv_dt == Fp8KVCacheDataType::kInt8Group128) { + int64_t tgt_kvs_idx = floor(i/quant_group); + k_scale = *reinterpret_cast(k_scales+tgt_kvs_idx); + v_scale = *reinterpret_cast(v_scales+tgt_kvs_idx); + } else { + k_scale = *reinterpret_cast(k_scales); + v_scale = *reinterpret_cast(v_scales); + } const int64_t src_key_idx = token_idx * key_stride + i; const int64_t src_value_idx = token_idx * value_stride + i; @@ -194,6 +206,25 @@ __global__ void reshape_and_cache_kernel( if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { key_cache[tgt_key_idx] = tgt_key; value_cache[tgt_value_idx] = tgt_value; + // int8 kv-cache + } else if constexpr (kv_dt == Fp8KVCacheDataType::kInt8Group0) { + key_cache[tgt_key_idx] = + int8::scaled_vec_conversion_int8(tgt_key, + k_scale, + 0); + value_cache[tgt_value_idx] = + int8::scaled_vec_conversion_int8(tgt_value, + v_scale, + 0); + } else if constexpr (kv_dt == Fp8KVCacheDataType::kInt8Group128) { + key_cache[tgt_key_idx] = + int8::scaled_vec_conversion_int8(tgt_key, + k_scale, + 0); + value_cache[tgt_value_idx] = + int8::scaled_vec_conversion_int8(tgt_value, + v_scale, + 0); } else { key_cache[tgt_key_idx] = 
fp8::scaled_convert(tgt_key, k_scale); @@ -214,7 +245,9 @@ __global__ void reshape_and_cache_flash_kernel( const int64_t* __restrict__ slot_mapping, // [num_tokens] const int block_stride, const int key_stride, const int value_stride, const int num_heads, const int head_size, const int block_size, - const float k_scale, const float v_scale) { + const int quant_group, + const float* __restrict__ k_scales, + const float* __restrict__ v_scales) { const int64_t token_idx = blockIdx.x; const int64_t slot_idx = slot_mapping[token_idx]; // NOTE: slot_idx can be -1 if the token is padded @@ -225,6 +258,16 @@ __global__ void reshape_and_cache_flash_kernel( const int64_t block_offset = slot_idx % block_size; const int n = num_heads * head_size; for (int i = threadIdx.x; i < n; i += blockDim.x) { + float k_scale = 0; + float v_scale = 0; + if constexpr (kv_dt == Fp8KVCacheDataType::kInt8Group128) { + int64_t tgt_kvs_idx = floor(i/quant_group); + k_scale = *reinterpret_cast(k_scales+tgt_kvs_idx); + v_scale = *reinterpret_cast(v_scales+tgt_kvs_idx); + } else { + k_scale = *reinterpret_cast(k_scales); + v_scale = *reinterpret_cast(v_scales); + } const int64_t src_key_idx = token_idx * key_stride + i; const int64_t src_value_idx = token_idx * value_stride + i; const int head_idx = i / head_size; @@ -237,6 +280,25 @@ __global__ void reshape_and_cache_flash_kernel( if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { key_cache[tgt_key_value_idx] = tgt_key; value_cache[tgt_key_value_idx] = tgt_value; + // int8 kv-cache + } else if constexpr (kv_dt == Fp8KVCacheDataType::kInt8Group0) { + key_cache[tgt_key_value_idx] = + int8::scaled_vec_conversion_int8(tgt_key, + k_scale, + 0); + value_cache[tgt_key_value_idx] = + int8::scaled_vec_conversion_int8(tgt_value, + v_scale, + 0); + } else if constexpr (kv_dt == Fp8KVCacheDataType::kInt8Group128) { + key_cache[tgt_key_value_idx] = + int8::scaled_vec_conversion_int8(tgt_key, + k_scale, + 0); + value_cache[tgt_key_value_idx] = + 
int8::scaled_vec_conversion_int8(tgt_value, + v_scale, + 0); } else { key_cache[tgt_key_value_idx] = fp8::scaled_convert(tgt_key, k_scale); @@ -258,7 +320,10 @@ __global__ void reshape_and_cache_flash_kernel( reinterpret_cast(key_cache.data_ptr()), \ reinterpret_cast(value_cache.data_ptr()), \ slot_mapping.data_ptr(), key_stride, value_stride, \ - num_heads, head_size, block_size, x, k_scale, v_scale); + num_heads, head_size, block_size, x, \ + quant_group, \ + k_scales.data_ptr(), \ + v_scales.data_ptr()); \ void reshape_and_cache( torch::Tensor& key, // [num_tokens, num_heads, head_size] @@ -268,8 +333,10 @@ void reshape_and_cache( torch::Tensor& value_cache, // [num_blocks, num_heads, head_size, block_size] torch::Tensor& slot_mapping, // [num_tokens] - const std::string& kv_cache_dtype, const double k_scale, - const double v_scale) { + const std::string& kv_cache_dtype, + const int64_t quant_group, + torch::Tensor& k_scales, + torch::Tensor& v_scales) { int num_tokens = key.size(0); int num_heads = key.size(1); int head_size = key.size(2); @@ -299,7 +366,9 @@ void reshape_and_cache( reinterpret_cast(key_cache.data_ptr()), \ reinterpret_cast(value_cache.data_ptr()), \ slot_mapping.data_ptr(), block_stride, key_stride, \ - value_stride, num_heads, head_size, block_size, k_scale, v_scale); + value_stride, num_heads, head_size, block_size, \ + quant_group, k_scales.data_ptr(), \ + v_scales.data_ptr()); void reshape_and_cache_flash( torch::Tensor& key, // [num_tokens, num_heads, head_size] @@ -308,8 +377,10 @@ void reshape_and_cache_flash( torch::Tensor& value_cache, // [num_blocks, block_size, num_heads, head_size] torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens] - const std::string& kv_cache_dtype, const double k_scale, - const double v_scale) { + const std::string& kv_cache_dtype, + const int64_t quant_group, + torch::Tensor& k_scales, + torch::Tensor& v_scales) { // NOTE(woosuk): In vLLM V1, key.size(0) can be different from // 
slot_mapping.size(0) because of padding for CUDA graphs. // In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index 74e4d8189d40..534263365fb2 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -30,7 +30,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor value_cache, int num_kv_heads, float scale," " Tensor block_tables, Tensor seq_lens, int block_size," " int max_seq_len, Tensor? alibi_slopes," - " str kv_cache_dtype, float k_scale, float v_scale," + " str kv_cache_dtype, " + " int quant_group," + " Tensor k_scales, Tensor v_scales," " int tp_rank, int blocksparse_local_blocks," " int blocksparse_vert_stride, int blocksparse_block_size," " int blocksparse_head_sliding_step) -> ()"); @@ -44,7 +46,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor value_cache, int num_kv_heads, float scale," " Tensor block_tables, Tensor seq_lens, int block_size," " int max_seq_len, Tensor? alibi_slopes," - " str kv_cache_dtype, float k_scale, float v_scale," + " str kv_cache_dtype, " + " int quant_group," + " Tensor k_scales, Tensor v_scales," " int tp_rank, int blocksparse_local_blocks," " int blocksparse_vert_stride, int blocksparse_block_size," " int blocksparse_head_sliding_step) -> ()"); @@ -148,7 +152,9 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { " Tensor! key_cache, Tensor! 
value_cache," " Tensor slot_mapping," " str kv_cache_dtype," - " float k_scale, float v_scale) -> ()"); + " int quant_group," + " Tensor k_scales," + " Tensor v_scales) -> ()"); cache_ops.impl("reshape_and_cache", torch::kCPU, &reshape_and_cache); } diff --git a/csrc/ops.h b/csrc/ops.h index 5a194a0dd365..46f0202f10b8 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -33,8 +33,9 @@ void paged_attention_v1( torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int64_t num_kv_heads, double scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, - int64_t max_seq_len, const std::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, double v_scale, + int64_t max_seq_len, const c10::optional& alibi_slopes, + const std::string& kv_cache_dtype, + const int64_t quant_group, torch::Tensor& k_scales, torch::Tensor& v_scales, const int64_t tp_rank, const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, const int64_t blocksparse_head_sliding_step); @@ -44,8 +45,9 @@ void paged_attention_v2( torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int64_t num_kv_heads, double scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, - int64_t max_seq_len, const std::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, double v_scale, + int64_t max_seq_len, const c10::optional& alibi_slopes, + const std::string& kv_cache_dtype, + const int64_t quant_group, torch::Tensor& k_scales, torch::Tensor& v_scales, const int64_t tp_rank, const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, const int64_t blocksparse_head_sliding_step); diff --git a/csrc/quantization/fp8/amd/quant_utils.cuh b/csrc/quantization/fp8/amd/quant_utils.cuh index eb66834222f3..4df81e33190c 100644 --- 
a/csrc/quantization/fp8/amd/quant_utils.cuh +++ b/csrc/quantization/fp8/amd/quant_utils.cuh @@ -567,6 +567,28 @@ __inline__ __device__ Tout scaled_convert(const Tin& x, const float scale) { TORCH_CHECK(false, \ "Unsupported input type of kv cache: ", SRC_DTYPE); \ } \ + } else if (KV_DTYPE == "int8_group0") { \ + if (SRC_DTYPE == at::ScalarType::Float) { \ + FN(float, uint8_t, vllm::Fp8KVCacheDataType::kInt8Group0); \ + } else if (SRC_DTYPE == at::ScalarType::Half) { \ + FN(uint16_t, uint8_t, vllm::Fp8KVCacheDataType::kInt8Group0); \ + } else if (SRC_DTYPE == at::ScalarType::BFloat16) { \ + FN(__nv_bfloat16, uint8_t, vllm::Fp8KVCacheDataType::kInt8Group0); \ + } else { \ + TORCH_CHECK(false, \ + "Unsupported input type of kv cache: ", SRC_DTYPE); \ + } \ + } else if (KV_DTYPE == "int8_group128") { \ + if (SRC_DTYPE == at::ScalarType::Float) { \ + FN(float, uint8_t, vllm::Fp8KVCacheDataType::kInt8Group128); \ + } else if (SRC_DTYPE == at::ScalarType::Half) { \ + FN(uint16_t, uint8_t, vllm::Fp8KVCacheDataType::kInt8Group128); \ + } else if (SRC_DTYPE == at::ScalarType::BFloat16) { \ + FN(__nv_bfloat16, uint8_t, vllm::Fp8KVCacheDataType::kInt8Group128); \ + } else { \ + TORCH_CHECK(false, \ + "Unsupported input type of kv cache: ", SRC_DTYPE); \ + } \ } else { \ TORCH_CHECK(false, "Unsupported data type of kv cache: ", KV_DTYPE); \ } \ diff --git a/csrc/quantization/fp8/nvidia/quant_utils.cuh b/csrc/quantization/fp8/nvidia/quant_utils.cuh index f8cd1dcba4ab..92b0fb04671d 100644 --- a/csrc/quantization/fp8/nvidia/quant_utils.cuh +++ b/csrc/quantization/fp8/nvidia/quant_utils.cuh @@ -563,6 +563,28 @@ __inline__ __device__ Tout scaled_convert(const Tin& x, const float scale) { TORCH_CHECK(false, \ "Unsupported input type of kv cache: ", SRC_DTYPE); \ } \ + } else if (KV_DTYPE == "int8_group0") { \ + if (SRC_DTYPE == at::ScalarType::Float) { \ + FN(float, uint8_t, vllm::Fp8KVCacheDataType::kInt8Group0); \ + } else if (SRC_DTYPE == at::ScalarType::Half) { \ + FN(uint16_t, 
uint8_t, vllm::Fp8KVCacheDataType::kInt8Group0); \ + } else if (SRC_DTYPE == at::ScalarType::BFloat16) { \ + FN(__nv_bfloat16, uint8_t, vllm::Fp8KVCacheDataType::kInt8Group0); \ + } else { \ + TORCH_CHECK(false, \ + "Unsupported input type of kv cache: ", SRC_DTYPE); \ + } \ + } else if (KV_DTYPE == "int8_group128") { \ + if (SRC_DTYPE == at::ScalarType::Float) { \ + FN(float, uint8_t, vllm::Fp8KVCacheDataType::kInt8Group128); \ + } else if (SRC_DTYPE == at::ScalarType::Half) { \ + FN(uint16_t, uint8_t, vllm::Fp8KVCacheDataType::kInt8Group128); \ + } else if (SRC_DTYPE == at::ScalarType::BFloat16) { \ + FN(__nv_bfloat16, uint8_t, vllm::Fp8KVCacheDataType::kInt8Group128); \ + } else { \ + TORCH_CHECK(false, \ + "Unsupported input type of kv cache: ", SRC_DTYPE); \ + } \ } else { \ TORCH_CHECK(false, "Unsupported data type of kv cache: ", KV_DTYPE); \ } \ diff --git a/csrc/quantization/int8_kvcache/quant_utils.cuh b/csrc/quantization/int8_kvcache/quant_utils.cuh new file mode 100644 index 000000000000..a180008b93e2 --- /dev/null +++ b/csrc/quantization/int8_kvcache/quant_utils.cuh @@ -0,0 +1,231 @@ +// Adated from FasterTransformer, https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp +#pragma once + +#include +#include +#include +#include +#include "../../attention/attention_dtypes.h" +#include + +namespace vllm { +namespace int8 { + +// KV-CACHE int8 +static inline __device__ float int8_to_float(uint8_t x, const float scale, const float zero_point) { + int8_t a = x - 128; + float res = a * scale + zero_point; + // printf("\n dequant scale= %f, zero_point= %f \n", scale, zero_point); + // if(abs(res+1.268555)<=0.01) + // printf("\nI am here int8_to_float, x = %d, a= %d, res=%f, scale=%f, zero_point=%f \n", + // x, a, res, scale, zero_point); + return res; +} + +static inline __device__ uint8_t float_to_int8(float x, const float scale, const 
float zero_point) { + int8_t fx = roundf(max(-128.f, min(127.f, (x-zero_point) / scale))); + uint8_t res = fx + 128; + // printf("\n quant scale= %f \n", scale); + // if(abs(x+1.268555)<=0.00001) + // printf("\nI am here float_to_int8, x = %f, fx= %d, res=%d, scale=%f, zero_point=%f, (x-zero_point) / scale)=%f \n", + // x, fx, res, scale, zero_point, (x-zero_point) / scale); + return res; +} + +template +__inline__ __device__ Tout scaled_vec_conversion_int8(const Tin& x, + const float scale, const float zero_point) { + return x; +} + +// int8 -> half +template <> +__inline__ __device__ uint16_t scaled_vec_conversion_int8( + const uint8_t& a, const float scale, const float zero_point) { + float res = int8_to_float(a, scale, zero_point); + return float_to_half(res); +} + +// int8x2 -> half2 +template <> +__inline__ __device__ uint32_t scaled_vec_conversion_int8( + const uint16_t& a, const float scale, const float zero_point) { + union { + uint16_t u16[2]; + uint32_t u32; + } res; + res.u16[0] = scaled_vec_conversion_int8((uint8_t)a, scale, zero_point); + res.u16[1] = + scaled_vec_conversion_int8((uint8_t)(a >> 8U), scale, zero_point); + + // union { + // uint8_t int8[2]; + // uint16_t int16; + // } tmp; + // tmp.int16 = a; + // res.u16[0] = float_to_half(int8_to_float(tmp.int8[0], scale, zero_point)); + // res.u16[1] = float_to_half(int8_to_float(tmp.int8[0], scale, zero_point)); + return res.u32; +} + +// int8x4 -> half2x2 +template <> +__inline__ __device__ uint2 scaled_vec_conversion_int8( + const uint32_t& a, const float scale, const float zero_point) { + union { + uint2 u32x2; + uint32_t u32[2]; + } tmp; + tmp.u32[0] = + scaled_vec_conversion_int8((uint16_t)a, scale, zero_point); + tmp.u32[1] = scaled_vec_conversion_int8( + (uint16_t)(a >> 16U), scale, zero_point); + return tmp.u32x2; +} + +// int8x8 -> half2x4 +template <> +__inline__ __device__ uint4 +scaled_vec_conversion_int8(const uint2& a, const float scale, const float zero_point) { + union { + uint4 
u64x2; + uint2 u64[2]; + } tmp; + tmp.u64[0] = scaled_vec_conversion_int8(a.x, scale, zero_point); + tmp.u64[1] = scaled_vec_conversion_int8(a.y, scale, zero_point); + return tmp.u64x2; +} + +// int8 -> __nv_bfloat16 +template <> +__inline__ __device__ __nv_bfloat16 +scaled_vec_conversion_int8<__nv_bfloat16, uint8_t>(const uint8_t& a, + const float scale, const float zero_point) { + // Note there is no direct convert function from int8 to bf16. + float res = int8_to_float(a, scale, zero_point); + return __float2bfloat16(res); +} + +// int8x2 -> __nv_bfloat162 +template <> +__inline__ __device__ __nv_bfloat162 +scaled_vec_conversion_int8<__nv_bfloat162, uint16_t>(const uint16_t& a, + const float scale, const float zero_point) { + __nv_bfloat162 res; + res.x = scaled_vec_conversion_int8<__nv_bfloat16, uint8_t>((uint8_t)a, scale, zero_point); + res.y = scaled_vec_conversion_int8<__nv_bfloat16, uint8_t>((uint8_t)(a >> 8U), + scale, zero_point); + return res; +} + +// int8x4 -> bf16_4_t +template <> +__inline__ __device__ bf16_4_t scaled_vec_conversion_int8( + const uint32_t& a, const float scale, const float zero_point) { + bf16_4_t res; + res.x = + scaled_vec_conversion_int8<__nv_bfloat162, uint16_t>((uint16_t)a, scale, zero_point); + res.y = scaled_vec_conversion_int8<__nv_bfloat162, uint16_t>( + (uint16_t)(a >> 16U), scale, zero_point); + return res; +} + +// int8x8 -> bf16_8_t +template <> +__inline__ __device__ bf16_8_t +scaled_vec_conversion_int8(const uint2& a, const float scale, const float zero_point) { + bf16_4_t tmp1, tmp2; + tmp1 = scaled_vec_conversion_int8(a.x, scale, zero_point); + tmp2 = scaled_vec_conversion_int8(a.y, scale, zero_point); + bf16_8_t res; + res.x = tmp1.x; + res.y = tmp1.y; + res.z = tmp2.x; + res.w = tmp2.y; + return res; +} + +// int8 -> float +template <> +__inline__ __device__ float scaled_vec_conversion_int8( + const uint8_t& a, const float scale, const float zero_point) { + float res = int8_to_float(a, scale, zero_point); + return 
res; +} + +// int8x2 -> float2 +template <> +__inline__ __device__ float2 scaled_vec_conversion_int8( + const uint16_t& a, const float scale, const float zero_point) { + // int8x2 -> half2 + uint32_t tmp = scaled_vec_conversion_int8(a, scale, zero_point); + // half2 -> float2 + return half2_to_float2(tmp); +} + +// int8x4 -> float4 +template <> +__inline__ __device__ Float4_ scaled_vec_conversion_int8( + const uint32_t& a, const float scale, const float zero_point) { + Float4_ res; + res.x = scaled_vec_conversion_int8((uint16_t)a, scale, zero_point); + res.y = + scaled_vec_conversion_int8((uint16_t)(a >> 16U), scale, zero_point); + return res; +} + +// int8x8 -> float8 +template <> +__inline__ __device__ Float8_ +scaled_vec_conversion_int8(const uint2& a, const float scale, const float zero_point) { + Float4_ tmp1, tmp2; + tmp1 = scaled_vec_conversion_int8(a.x, scale, zero_point); + tmp2 = scaled_vec_conversion_int8(a.y, scale, zero_point); + Float8_ res; + res.x = tmp1.x; + res.y = tmp1.y; + res.z = tmp2.x; + res.w = tmp2.y; + return res; +} + +// half -> int8 +template <> +__inline__ __device__ uint8_t scaled_vec_conversion_int8( + const uint16_t& a, const float scale, const float zero_point) { + uint8_t res = float_to_int8(half_to_float(a), scale, zero_point); + // int8_t u8data = static_cast(round(half_to_float(a)*255)); + // if(a==48403) + // printf("\nI am here scaled_vec_conversion half fp8, a = %d, half_to_float(a) = %f, res= %d, a'=%f, a-a' = %f \n", + // a, half_to_float(a), (uint8_t)res, scaled_vec_conversion_int8(res, scale, zero_point), (half_to_float(a)-scaled_vec_conversion_int8(res, scale, zero_point))); + return (uint8_t)res; +} + +// bf16 -> int8 +template <> +__inline__ __device__ uint8_t +scaled_vec_conversion_int8(const __nv_bfloat16& a, + const float scale, const float zero_point) { + uint8_t res = float_to_int8(__bfloat162float(a), scale, zero_point); + return (uint8_t)res; +} + +// float -> int8 +template <> +__inline__ __device__ uint8_t 
+scaled_vec_conversion_int8(const float& a, const float scale, const float zero_point) { + uint8_t res = float_to_int8(a, scale, zero_point); + return (uint8_t)res; +} + +// int8x4 -> float4 +template <> +__inline__ __device__ float4 scaled_vec_conversion_int8( + const uint32_t& a, const float scale, const float zero_point) { + Float4_ tmp = scaled_vec_conversion_int8(a, scale, zero_point); + float4 res = make_float4(tmp.x.x, tmp.x.y, tmp.y.x, tmp.y.y); + return res; +} + +} // namespace int8 +} // namespace vllm diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index fb53d122487d..36a894560b63 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -30,7 +30,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor value_cache, int num_kv_heads, float scale," " Tensor block_tables, Tensor seq_lens, int block_size," " int max_seq_len, Tensor? alibi_slopes," - " str kv_cache_dtype, float k_scale, float v_scale," + " str kv_cache_dtype, " + " int quant_group," + " Tensor k_scales, " + " Tensor v_scales, " " int tp_rank, int blocksparse_local_blocks," " int blocksparse_vert_stride, int blocksparse_block_size," " int blocksparse_head_sliding_step) -> ()"); @@ -44,7 +47,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor value_cache, int num_kv_heads, float scale," " Tensor block_tables, Tensor seq_lens, int block_size," " int max_seq_len, Tensor? alibi_slopes," - " str kv_cache_dtype, float k_scale, float v_scale," + " str kv_cache_dtype, " + " int quant_group," + " Tensor k_scales, " + " Tensor v_scales, " " int tp_rank, int blocksparse_local_blocks," " int blocksparse_vert_stride, int blocksparse_block_size," " int blocksparse_head_sliding_step) -> ()"); @@ -449,7 +455,9 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { " Tensor! key_cache, Tensor! 
value_cache," " Tensor slot_mapping," " str kv_cache_dtype," - " float k_scale, float v_scale) -> ()"); + " int quant_group," + " Tensor k_scales, " + " Tensor v_scales) -> ()"); cache_ops.impl("reshape_and_cache", torch::kCUDA, &reshape_and_cache); // Reshape the key and value tensors and cache them. @@ -459,7 +467,9 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { " Tensor! value_cache," " Tensor slot_mapping," " str kv_cache_dtype," - " float k_scale, float v_scale) -> ()"); + " int quant_group," + " Tensor k_scales, " + " Tensor v_scales) -> ()"); cache_ops.impl("reshape_and_cache_flash", torch::kCUDA, &reshape_and_cache_flash); diff --git a/examples/int8/calib_dataloader.py b/examples/int8/calib_dataloader.py new file mode 100755 index 000000000000..a8d40399b722 --- /dev/null +++ b/examples/int8/calib_dataloader.py @@ -0,0 +1,475 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +import numpy as np +import torch + + +def set_seed(seed): + np.random.seed(seed) + torch.random.manual_seed(seed) + + +def get_wikitext2(tokenizer, nsamples, seed, seqlen, path=None): + """Load Wikitext-2 train and test datasets and tokenize. + + Args: + tokenizer: Tokenizer to encode text. + nsamples: Number of samples to take from train set. + seed: Random seed for sampling. + seqlen: Maximum sequence length. + + Returns: + train_loader: List of sampled and tokenized training examples. + test_enc: Full tokenized Wikitext-2 test set. 
+ """ + from datasets import load_dataset + traindata = load_dataset(path if path else 'wikitext', + 'wikitext-2-raw-v1', + split='train') + testdata = load_dataset(path if path else 'wikitext', + 'wikitext-2-raw-v1', + split='test') + + trainenc = tokenizer('\n\n'.join(traindata['text']), return_tensors='pt') + testenc = tokenizer('\n\n'.join(testdata['text']), return_tensors='pt') + + import random + random.seed(seed) + trainloader = [] + for _ in range(nsamples): + i = random.randint(0, trainenc.input_ids.shape[1] - seqlen) + j = i + seqlen + inp = trainenc.input_ids[:, i:j] + tar = inp.clone() + tar[:, :-1] = -100 + trainloader.append((inp, tar)) + return trainloader, testenc + + +def get_ptb(tokenizer, nsamples, seed, seqlen): + """Load PTB train and validation datasets and tokenize. + + Args: + tokenizer: Tokenizer to encode text. + nsamples: Number of samples to take from train set. + seed: Random seed for sampling. + seqlen: Maximum sequence length. + + Returns: + train_loader: List of sampled and tokenized training examples. + test_enc: Full tokenized PTB validation set. + """ + from datasets import load_dataset + traindata = load_dataset('ptb_text_only', 'penn_treebank', split='train') + valdata = load_dataset('ptb_text_only', + 'penn_treebank', + split='validation') + + trainenc = tokenizer('\n\n'.join(traindata['sentence']), + return_tensors='pt') + testenc = tokenizer('\n\n'.join(valdata['sentence']), return_tensors='pt') + + import random + random.seed(seed) + trainloader = [] + for _ in range(nsamples): + print("traindata ", trainenc.input_ids.shape) + print("seqlen ", seqlen) + i = random.randint(0, trainenc.input_ids.shape[1] - seqlen) + j = i + seqlen + inp = trainenc.input_ids[:, i:j] + tar = inp.clone() + tar[:, :-1] = -100 + trainloader.append((inp, tar)) + return trainloader, testenc + + +def get_c4(tokenizer, nsamples, seed, seqlen, path=None): + """Load C4 train and validation datasets and tokenize. 
+ + Args: + tokenizer: Tokenizer to encode text. + nsamples: Number of samples to take from train set. + seed: Random seed for sampling. + seqlen: Maximum sequence length. + + Returns: + train_loader: List of sampled and tokenized training examples. + test_enc: Full tokenized PTB validation set. + """ + from datasets import load_dataset + traindata = load_dataset( + path if path else 'allenai/c4', + 'allenai--c4', + data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, + split='train', + use_auth_token=False) + valdata = load_dataset( + path if path else 'allenai/c4', + 'allenai--c4', + data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, + split='validation', + use_auth_token=False) + + import random + random.seed(seed) + trainloader = [] + for _ in range(nsamples): + while True: + i = random.randint(0, len(traindata) - 1) + trainenc = tokenizer(traindata[i]['text'], return_tensors='pt') + if trainenc.input_ids.shape[1] >= seqlen: + break + i = random.randint(0, trainenc.input_ids.shape[1] - seqlen) + j = i + seqlen + inp = trainenc.input_ids[:, i:j] + tar = inp.clone() + tar[:, :-1] = -100 + trainloader.append((inp, tar)) + + valenc = [] + for _ in range(256): + while True: + i = random.randint(0, len(valdata) - 1) + tmp = tokenizer(valdata[i]['text'], return_tensors='pt') + if tmp.input_ids.shape[1] >= seqlen: + break + i = random.randint(0, tmp.input_ids.shape[1] - seqlen) + j = i + seqlen + valenc.append(tmp.input_ids[:, i:j]) + valenc = torch.hstack(valenc) + + class TokenizerWrapper: + + def __init__(self, input_ids): + self.input_ids = input_ids + + valenc = TokenizerWrapper(valenc) + + return trainloader, valenc + + +def get_ptb_new(tokenizer, nsamples, seed, seqlen): + """Load PTB New train and validation datasets and tokenize. + + Args: + tokenizer: Tokenizer to encode text. + nsamples: Number of samples to take from train set. + seed: Random seed for sampling. + seqlen: Maximum sequence length. 
+ + Returns: + train_loader: List of sampled and tokenized training examples. + test_enc: Full tokenized PTB validation set. + """ + from datasets import load_dataset + traindata = load_dataset('ptb_text_only', 'penn_treebank', split='train') + testdata = load_dataset('ptb_text_only', 'penn_treebank', split='test') + + trainenc = tokenizer(' '.join(traindata['sentence']), return_tensors='pt') + testenc = tokenizer(' '.join(testdata['sentence']), return_tensors='pt') + + import random + random.seed(seed) + trainloader = [] + for _ in range(nsamples): + i = random.randint(0, trainenc.input_ids.shape[1] - seqlen) + j = i + seqlen + inp = trainenc.input_ids[:, i:j] + tar = inp.clone() + tar[:, :-1] = -100 + trainloader.append((inp, tar)) + return trainloader, testenc + + +def get_c4_new(tokenizer, nsamples, seed, seqlen): + """Load C4 New train and validation datasets and tokenize. + + Args: + tokenizer: Tokenizer to encode text. + nsamples: Number of samples to take from train set. + seed: Random seed for sampling. + seqlen: Maximum sequence length. + + Returns: + train_loader: List of sampled and tokenized training examples. + test_enc: Full tokenized PTB validation set. 
+ """ + from datasets import load_dataset + traindata = load_dataset( + 'allenai/c4', + 'allenai--c4', + data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, + split='train') + valdata = load_dataset( + 'allenai/c4', + 'allenai--c4', + data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, + split='validation') + + import random + random.seed(seed) + trainloader = [] + for _ in range(nsamples): + while True: + i = random.randint(0, len(traindata) - 1) + trainenc = tokenizer(traindata[i]['text'], return_tensors='pt') + if trainenc.input_ids.shape[1] >= seqlen: + break + i = random.randint(0, trainenc.input_ids.shape[1] - seqlen) + j = i + seqlen + inp = trainenc.input_ids[:, i:j] + tar = inp.clone() + tar[:, :-1] = -100 + trainloader.append((inp, tar)) + + valenc = tokenizer(' '.join(valdata[:1100]['text']), return_tensors='pt') + valenc = valenc.input_ids[:, :(256 * seqlen)] + + class TokenizerWrapper: + + def __init__(self, input_ids): + self.input_ids = input_ids + + valenc = TokenizerWrapper(valenc) + + return trainloader, valenc + + +def get_pileval(tokenizer, nsamples, seed, path, seqlen=512): + """Load pileval train dataset and tokenize. + + Args: + tokenizer: Tokenizer to encode text. + nsamples: Number of samples to take from train set. + seed: Random seed for sampling. + seqlen: Maximum sequence length. + + Returns: + train_loader: List of sampled and tokenized training examples. + test_enc: Full tokenized PTB validation set. 
+ """ + from datasets import load_dataset + from datasets.builder import DatasetGenerationError + try: + dataset = load_dataset('json', data_files=path, split='train') + except DatasetGenerationError as err: + raise InterruptedError('There have been some issues when generating ' + 'the dataset, you could try to download it ' + 'locally first, and replace the `data_files`' + 'with local addresses or use other datasets ' + '(c4, wiki, ptb).') from err + dataset = dataset.shuffle(seed=seed) + samples = [] + n_run = 0 + for data in dataset: + line = data['text'] + line = line.strip() + line_encoded = tokenizer.encode(line) + if len(line_encoded) > 512: + continue + sample = torch.tensor([line_encoded]) + if sample.numel() == 0: + continue + samples.append(sample) + n_run += 1 + if n_run == nsamples: + break + # now concatenate all samples and split according to block size + cat_samples = torch.cat(samples, dim=1) + n_split = cat_samples.shape[1] // seqlen + print(f' * Split into {n_split} blocks') + return [ + cat_samples[:, i * seqlen:(i + 1) * seqlen] for i in range(n_split) + ], None + +# llamafactory datasets +def get_lf_datasets(tokenizer, nsamples, seed, seqlen, path_to_eval, split_name): + from datasets import load_dataset + from typing import Dict + from tqdm import tqdm, trange + from template import get_eval_template + from transformers.utils import cached_file + CHOICES = ["A", "B", "C", "D"] + SUBJECTS = ["Average", "STEM", "Social Sciences", "Humanities", "Other"] + + mapping = cached_file( + path_or_repo_id=path_to_eval, + filename="mapping.json", + ) + with open(mapping, "r", encoding="utf-8") as f: + categorys: Dict[str, Dict[str, str]] = json.load(f) + category_corrects = {subj: np.array([], dtype="bool") for subj in SUBJECTS} + pbar = tqdm(categorys.keys(), desc="Processing subjects", position=0) + trainloader = [] + inputs, labels = [], [] + for subject in pbar: + dataset = load_dataset( + path=path_to_eval, + name=subject, + # split='train', + 
trust_remote_code=True,
+        )
+        traindata = dataset[split_name]
+        pbar.set_postfix_str(categorys[subject]["name"])
+        for i in trange(len(traindata), desc="Formatting batches", position=1, leave=False):
+            # print("loop i ", i )
+            support_set = (
+                dataset["train"].shuffle().select(range(min(nsamples, len(dataset["train"]))))
+            )
+            messages = get_eval_template('zh').format_example(
+                target_data=traindata[i],
+                support_set=support_set,
+                subject_name=categorys[subject]["name"],
+            )
+            messages[-2]["content"] = '"'+ messages[-2]["content"]+'"'
+            # print("**** messages[-2][content] ",messages[-2])
+            # print("**** messages[-2][content] ",messages[-2]["content"])
+            inputs.append(messages[-2]["content"])
+            labels.append(messages[-1]["content"])
+            # print(labels)
+    trainenc = tokenizer('\n\n'.join(inputs),
+                         return_tensors='pt')
+    # testenc = tokenizer('\n\n'.join(valdata['sentence']),
+    #                     return_tensors='pt')
+    import random
+    random.seed(seed)
+    # for _ in range(min(nsamples, len(inputs))):
+    #     # print("seqlen ", seqlen)
+    #     # print("traindata ", trainenc.input_ids.shape)
+    #     i = random.randint(0, trainenc.input_ids.shape[1] - seqlen)
+    #     j = i + seqlen
+    #     inp = trainenc.input_ids[:, i:j]
+    #     tar = inp.clone()
+    #     tar[:, :-1] = -100
+    #     trainloader.append((inp, tar))
+    max_length = trainenc.input_ids.shape[1]
+    print("n_requests ", len(inputs))
+    print("max_length ", max_length)
+    for n in range(max_length):
+        # print("seqlen ", seqlen)
+        # print("traindata ", trainenc.input_ids.shape)
+        i = n*seqlen
+        j = i + seqlen
+        inp = trainenc.input_ids[:, i:j]
+        tar = inp.clone()
+        tar[:, :-1] = -100
+        if j >= max_length or len(trainloader) > nsamples:
+            break
+        trainloader.append((inp, tar))
+    return trainloader, None
+
+# ceval_val_cmcc.jsonl
+def get_ceval_val_cmcc(tokenizer, nsamples, seed, seqlen, path_to_eval):
+    path_to_eval = path_to_eval+'ceval_val_cmcc.jsonl'
+    trainloader = []
+    inputs=[]
+    with open(path_to_eval, 'r') as jsonl_file:
+        for line in jsonl_file:
+            json_object = json.loads(line)
+            inputs.append(json_object["origin_prompt"])
+
+    # inputs=["Please introduce particle physics."]
+    trainenc = tokenizer('\n\n'.join(inputs),
+                         return_tensors='pt')
+
+    import random
+    random.seed(seed)
+    # print(trainenc)
+    # for _ in range(min(nsamples, len(inputs))):
+    #     # print("seqlen ", seqlen)
+    #     print("traindata ", trainenc.input_ids.shape)
+    #     i = random.randint(0, trainenc.input_ids.shape[1] - seqlen)
+    #     j = i + seqlen
+    #     inp = trainenc.input_ids[:, i:j]
+    #     tar = inp.clone()
+    #     tar[:, :-1] = -100
+    #     print("i ",i, " j ",j, " inp ", inp)
+    #     trainloader.append((inp, tar))
+
+    max_length = trainenc.input_ids.shape[1]
+    print("n_requests ", len(inputs))
+    print("max_length ", max_length)
+    for n in range(max_length):
+        # print("seqlen ", seqlen)
+        # print("traindata ", trainenc.input_ids.shape)
+        i = n*seqlen
+        j = i + seqlen
+        inp = trainenc.input_ids[:, i:j]
+        tar = inp.clone()
+        tar[:, :-1] = -100
+        if j >= max_length:
+            break
+        trainloader.append((inp, tar))
+    return trainloader, None
+
+def get_calib_loaders(name,
+                      tokenizer,
+                      nsamples=128,
+                      seed=0,
+                      seqlen=2048,
+                      path=None):
+    """Get calibration data loaders for a dataset.
+
+    Args:
+        name: Dataset name ('wikitext2', 'ptb', 'c4', etc).
+        tokenizer: Tokenizer to encode text.
+        nsamples: Number of samples to take from train set.
+        seed: Random seed for sampling.
+        seqlen: Maximum sequence length.
+
+    Returns:
+        train_loader: List of sampled and tokenized training examples.
+        test_data: Full tokenized validation set.
+ """ + if 'wikitext2' in name: + return get_wikitext2(tokenizer, nsamples, seed, seqlen, path) + if 'ptb' in name: + if 'new' in name: + return get_ptb_new(tokenizer, nsamples, seed, seqlen) + return get_ptb(tokenizer, nsamples, seed, seqlen) + if 'c4' in name: + if 'new' in name: + return get_c4_new(tokenizer, nsamples, seed, seqlen) + return get_c4(tokenizer, nsamples, seed, seqlen, path) + + if 'pileval' in name: + if path is None: + path = 'https://the-eye.eu/public/AI/pile/val.jsonl.zst' + return get_pileval(tokenizer, nsamples, seed, path, seqlen) + + if 'pileval' in name: + if path is None: + path = 'https://the-eye.eu/public/AI/pile/val.jsonl.zst' + return get_pileval(tokenizer, nsamples, seed, path, seqlen) + + if 'ceval_val_cmcc' in name: + return get_ceval_val_cmcc(tokenizer, nsamples, seed, seqlen, path) + if 'ceval' or 'cmb' or 'cmmlu' or 'medmcqa' or 'medqa' or 'mmlu' in name: + if name == 'ceval_val_cmcc': + pass + split_name = 'test' + if name == 'ceval': + split_name = 'test' + elif name == 'cmb': + split_name = 'test' + elif name == 'medmcqa': + split_name = 'test' + elif name == 'medqa': + split_name = 'test' + elif name == 'mmlu': + split_name = 'test' + + return get_lf_datasets(tokenizer, nsamples, seed, seqlen, path, split_name) + diff --git a/examples/int8/calibrate.py b/examples/int8/calibrate.py new file mode 100755 index 000000000000..e4b7ad8f871d --- /dev/null +++ b/examples/int8/calibrate.py @@ -0,0 +1,122 @@ +# coding=utf-8 +# Adapted from +# https://github.com/InternLM/lmdeploy/blob/main/lmdeploy/lite/apis/calibrate.py + +# Copyright (c) OpenMMLab. All rights reserved. 
+ +from pathlib import Path + +import fire +import torch +from accelerate import (infer_auto_device_map, init_empty_weights, + load_checkpoint_in_model) +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer + +from calib_dataloader import get_calib_loaders +from calibration import CalibrationContext +from utils import collect_target_modules + +LAYER_TYPE_MAP = { + 'InternLMForCausalLM': 'InternLMDecoderLayer', + 'QWenLMHeadModel': 'QWenBlock', + 'BaiChuanForCausalLM': 'DecoderLayer', + 'LlamaForCausalLM': 'LlamaDecoderLayer', +} +NORM_TYPE_MAP = { + 'InternLMForCausalLM': 'InternLMRMSNorm', + 'QWenLMHeadModel': 'RMSNorm', + 'BaiChuanForCausalLM': 'RMSNorm', + 'LlamaForCausalLM': 'LlamaRMSNorm', +} + + +def calibrate(model: str = '/home/model_weights/Llama3-Chinese-8B-Instruct/', + calib_dataset: str = 'ptb', + dataset_path: str = None, + work_dir: str = './work_dir', + calib_samples: int = 128, + calib_seqlen: int = 2048, + device: str = 'cuda', + ) -> None: + """The main function for loading the model and performing calibration on a + given dataset. + + Args: + model (str): The model to be loaded. + calib_dataset (str, optional): The calibration dataset name. + Defaults to 'ptb'. + calib_samples (int, optional): The number of samples for calibration. + Defaults to 128. + calib_seqlen (int, optional): The sequence length for calibration. + Defaults to 2048. + work_dir (str): The working directory for outputs. + Defaults to './work_dir'. + device (str, optional): The device to be used for calculation. + Defaults to 'cuda'. 
+ """ + # ceval_val_cmcc.jsonl + + assert calib_dataset in ['c4', 'ptb', 'wikitext2', 'pileval', 'ceval_val_cmcc', + 'ceval', 'cmmlu', 'cmb', 'medmcqa', 'medqa', 'mmlu'], \ + 'Support only `c4`, `ptb`, `wikitext2` or `pileval`, \ + `ceval_val_cmcc`, `ceval`, `cmmlu`, `cmb`, `medmcqa`,\ + `medqa`, `mmlu`' + + # Load tokenizer and configuration + tokenizer = AutoTokenizer.from_pretrained(model, + use_fast=False, + trust_remote_code=True) + hf_config = AutoConfig.from_pretrained(model, trust_remote_code=True) + checkpoint = hf_config._name_or_path + + with init_empty_weights(): + # Load model + model = AutoModelForCausalLM.from_pretrained(model, + torch_dtype=torch.float16, + trust_remote_code=True) + model.config.use_cache = False + + layer_type = LAYER_TYPE_MAP[type(model).__name__] + norm_type = NORM_TYPE_MAP[type(model).__name__] + + decoder_layers = collect_target_modules(model, layer_type) + + # Infer device map + device_map = infer_auto_device_map(model, + no_split_module_classes=[layer_type]) + for name in device_map: + if name in decoder_layers or 'lm_head' in name: + device_map[name] = 'cpu' + else: + device_map[name] = 0 + load_checkpoint_in_model(model, checkpoint, device_map) + + print('Loading calibrate dataset ...') + calib_loader, _ = get_calib_loaders(calib_dataset, + tokenizer, + nsamples=calib_samples, + seqlen=calib_seqlen, + path=dataset_path) + + # Initialize calibration context + calib_ctx = CalibrationContext(model, + tokenizer, + layer_type=layer_type, + norm_type=norm_type, + device=device) + + with calib_ctx: + all_data = torch.cat([ + data if isinstance(data, torch.Tensor) else data[0] + for data in calib_loader + ]).to(device) + calib_ctx.calibrate(all_data) + + # Create work directory if not exists + work_dir = Path(work_dir) + work_dir.mkdir(parents=True, exist_ok=True) + calib_ctx.export(work_dir) + + +if __name__ == '__main__': + fire.Fire(calibrate) diff --git a/examples/int8/calibration.py b/examples/int8/calibration.py new file mode 
100755 index 000000000000..bda2aa9b1074 --- /dev/null +++ b/examples/int8/calibration.py @@ -0,0 +1,333 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from functools import partial +from typing import Union + +import torch +import transformers +from pkg_resources import parse_version +from torch import nn +from transformers import PreTrainedTokenizer + +from observer import ActivationObserver, KVCacheObserver +from utils import (bimap_name_mod, collect_target_modules, + concat_decoder_layer_outputs, + split_decoder_layer_inputs) + + +class CalibrationContext(): + """Calibration context manager for model quantization. + + Parameters: + - model: The target model to be calibrated and quantized + - tokenizer: The tokenizer used in the model training + - layer_type: Layer type to be targeted for calibration + - norm_type: Normalization type used for calibration + - device: Device on which model is to be calibrated ('cpu' or 'cuda') + """ + + inp_obs_group = 'inputs' + out_obs_group = 'outputs' + key_obs_group = 'keys' + value_obs_group = 'values' + + def __init__(self, + model: nn.Module, + tokenizer: PreTrainedTokenizer, + layer_type: Union[str, type], + norm_type: Union[str, type], + device: str = 'cuda') -> None: + """Initiate calibration context. + + Args: + model (nn.Module): Model to be calibrated. + tokenizer (PreTrainedTokenizer): Tokenizer of the given model. + layer_type (Union[str, type]): Type of the layers to be observed. + norm_type (Union[str, type]): Norm type used in the model. + device (str, optional): Device where the model should run. + Defaults to 'cuda'. 
+ """ + + self.layer_type = layer_type + self.norm_type = norm_type + + num_kv_heads, num_attn_heads = self._guess_num_heads(model) + self.num_kv_heads = num_kv_heads + self.head_dim = model.config.hidden_size // num_attn_heads + self.model = model + del self.model.lm_head + + self.tokenizer = tokenizer + + # Collect modules to observe + self.name2layer = collect_target_modules(self.model, layer_type) + self.name2fc = {} + for l_name, layer in self.name2layer.items(): + name2fc = collect_target_modules(layer, nn.Linear, prefix=l_name) + self.name2fc.update(name2fc) + self.name2norm = collect_target_modules(self.model, norm_type) + + maps = bimap_name_mod([self.name2layer, self.name2fc, self.name2norm]) + self.name2mod, self.mod2name = maps + + # Initialize observers + self._init_input_observers(self.name2fc) + self._init_output_observers(self.name2norm) + self._init_output_observers(self.name2fc) + self._init_kv_observers(self.name2layer) + + self.device = device + + def _guess_num_heads(self, model): + + if hasattr(model.config, 'num_key_value_heads'): + num_kv_heads = model.config.num_key_value_heads + else: + num_kv_heads = model.config.num_attention_heads + + num_attn_heads = model.config.num_attention_heads + + return num_kv_heads, num_attn_heads + + def _init_input_observers(self, name2mod): + """Initialize input observers for given modules.""" + for name, mod in name2mod.items(): + obs = ActivationObserver(mod.weight.size(-1)) + obs.global_available(name, group=self.inp_obs_group) + + def _init_output_observers(self, name2mod): + """Initialize output observers for given modules.""" + for name, mod in name2mod.items(): + obs = ActivationObserver(mod.weight.size(0)) + obs.global_available(name, group=self.out_obs_group) + + def _init_kv_observers(self, name2mod): + """Initialize KV observers for given modules.""" + for name in name2mod: + k_obs = KVCacheObserver(self.num_kv_heads, self.head_dim) + v_obs = KVCacheObserver(self.num_kv_heads, self.head_dim) + 
k_obs.global_available(name, group=self.key_obs_group) + v_obs.global_available(name, group=self.value_obs_group) + + def _insert_input_observers(self): + """Insert input observers into the target modules. + + This function registers a forward pre-hook on each target module to + observe the inputs. + """ + + def _input_hook(mod: nn.Module, inp: torch.Tensor): + m_name = self.mod2name[mod] + obs = ActivationObserver.find(m_name, group=self.inp_obs_group) + obs.observe(inp[0]) + + group = ActivationObserver.find_group(self.inp_obs_group) + for name in group: + mod = self.name2mod[name] + hook_fn = mod.register_forward_pre_hook(_input_hook) + self._hooks.append(hook_fn) + + def _insert_output_observers(self): + """Insert output observers into the target modules. + + This function registers a forward hook on each target module to observe + the outputs. + """ + + def _output_hook(mod: nn.Module, inp: torch.Tensor, out: torch.Tensor): + m_name = self.mod2name[mod] + obs = ActivationObserver.find(m_name, group=self.out_obs_group) + obs.observe(out) + + group = ActivationObserver.find_group(self.out_obs_group) + for name in group: + mod = self.name2mod[name] + hook_fn = mod.register_forward_hook(_output_hook) + self._hooks.append(hook_fn) + + def _wrap_decoder_layers(self): + """Method to wrap the decoder layers' forward functions for observing + their key/value cache during batched forward passes.""" + + def _forward(mod, *args, **kwargs): + + mod.to(self.device) + batch_args, batch_kwargs = split_decoder_layer_inputs( + *args, **kwargs) + batch_outputs = [] + samples = len(batch_args) + + m_name = self.mod2name[mod] + k_obs = KVCacheObserver.find(m_name, group=self.key_obs_group) + v_obs = KVCacheObserver.find(m_name, group=self.value_obs_group) + + for i in range(len(batch_args)): + + if k_obs and v_obs: + batch_kwargs[i]['use_cache'] = True + version = parse_version(transformers.__version__) + use_new_cache = type(mod).__name__ == 'LlamaDecoderLayer' + if version > 
parse_version('4.36.0') and use_new_cache: + from transformers.cache_utils import DynamicCache + batch_kwargs[i]['past_key_value'] = DynamicCache() + + ori_idx = mod.self_attn.layer_idx + mod.self_attn.layer_idx = 0 + + out = self._ori_forwards[mod](*batch_args[i], + **batch_kwargs[i]) + mod.self_attn.layer_idx = ori_idx + + out = list(out) + cache = out.pop(-1) + + key = cache.key_cache.pop(-1) + value = cache.value_cache.pop(-1) + + k_obs.observe(key) + v_obs.observe(value) + else: + out = self._ori_forwards[mod](*batch_args[i], + **batch_kwargs[i]) + out = list(out) + key, value = out.pop(-1) + k_obs.observe(key) + v_obs.observe(value) + + del key, value + torch.cuda.empty_cache() + batch_outputs.append(tuple(out)) + else: + batch_outputs.append(self._ori_forwards[mod]( + *batch_args[i], **batch_kwargs[i])) + + outputs = concat_decoder_layer_outputs(batch_outputs) + + del batch_outputs, batch_args, batch_kwargs, args + mod.to('cpu') + torch.cuda.empty_cache() + max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024 + print(f'{m_name}, samples: {samples}, ' + f'max gpu memory: {max_memory:.2f} GB') + return outputs + + for layer in self.name2layer.values(): + self._ori_forwards[layer] = layer.forward + layer.forward = partial(_forward, layer) + + def collect_inputs_stats(self): + """Collect statistics (min, max, absmax values) of the observed inputs. + + Returns a dictionary with these collected stats. 
+ """ + inputs_stats = { + 'max': {}, + 'min': {}, + 'mean': {}, + 'absmax': {}, + 'absmean': {} + } + obs_group = ActivationObserver.find_group(self.inp_obs_group) + for name, obs in obs_group.items(): + inputs_stats['max'][name] = obs.max_val + inputs_stats['min'][name] = obs.min_val + inputs_stats['mean'][name] = obs.mean_val + inputs_stats['absmax'][name] = obs.absmax_val + inputs_stats['absmean'][name] = obs.absmean_val + return inputs_stats + + def collect_outputs_stats(self): + """Collect statistics (min, max, absmax values) of the observed + outputs. + + Returns a dictionary with these collected stats. + """ + outputs_stats = { + 'max': {}, + 'min': {}, + 'mean': {}, + 'absmax': {}, + 'absmean': {} + } + obs_group = ActivationObserver.find_group(self.out_obs_group) + for name, obs in obs_group.items(): + outputs_stats['max'][name] = obs.max_val + outputs_stats['min'][name] = obs.min_val + outputs_stats['mean'][name] = obs.mean_val + outputs_stats['absmax'][name] = obs.absmax_val + outputs_stats['absmean'][name] = obs.absmean_val + return outputs_stats + + def collect_kv_stats(self): + """Collect statistics (min, max, absmax values) of the observed keys + and values. + + Returns a tuple of two dictionaries with these collected stats. 
+ """ + key_stats = {'max': {}, 'min': {}, 'absmax': {}} + obs_group = KVCacheObserver.find_group(self.key_obs_group) + for name, obs in obs_group.items(): + # print("**name ", name, " obs ", obs) + key_stats['max'][name] = obs.max_val + key_stats['min'][name] = obs.min_val + key_stats['absmax'][name] = obs.absmax_val + + value_stats = {'max': {}, 'min': {}, 'absmax': {}} + obs_group = KVCacheObserver.find_group(self.value_obs_group) + for name, obs in obs_group.items(): + value_stats['max'][name] = obs.max_val + value_stats['min'][name] = obs.min_val + value_stats['absmax'][name] = obs.absmax_val + return key_stats, value_stats + + def export(self, out_dir): + """Export the calibration statistics (inputs, outputs, keys and values) + to specified directory. + + Args: + out_dir (Union[str, Path]): The directory path where the stats + will be saved. + """ + + inp_stats = self.collect_inputs_stats() + torch.save(inp_stats, out_dir / 'inputs_stats.pth') + + out_stats = self.collect_outputs_stats() + torch.save(out_stats, out_dir / 'outputs_stats.pth') + + key_stats, value_stats = self.collect_kv_stats() + torch.save(key_stats, out_dir / 'key_stats.pth') + torch.save(value_stats, out_dir / 'value_stats.pth') + + def calibrate(self, data): + """Forward pass through the model in inference mode with given data.""" + + if type(self.model).__name__ == 'QWenLMHeadModel': + model = self.model.transformer + else: + model = self.model.model + with torch.inference_mode(): + _ = model(data.to(self.device)) + + def __enter__(self): + """Prepares the Calibration object for a 'with' statement by + registering hooks and wrapping layer forward methods.""" + + self._hooks = list() + + self._ori_forwards = {} + for layer in self.name2layer.values(): + self._ori_forwards[layer] = layer.forward + + self._insert_input_observers() + self._insert_output_observers() + self._wrap_decoder_layers() + + def __exit__(self, exc_type, exc_value, traceback): + """Clean up after a 'with' statement by 
removing registered hooks, + restoring original forward methods, and if no exception occurred, + collecting all gathered statistics and saving them.""" + for h in self._hooks: + h.remove() + + for layer in self.name2layer.values(): + layer.forward = self._ori_forwards[layer] diff --git a/examples/int8/export_kv_params.py b/examples/int8/export_kv_params.py new file mode 100755 index 000000000000..d76a7f4c7b28 --- /dev/null +++ b/examples/int8/export_kv_params.py @@ -0,0 +1,357 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +from pathlib import Path +from typing import Union +import matplotlib.pyplot as plt +import seaborn as sns +import json + +import fire +import numpy as np +import torch + +n_reques=1 +n_layer=32 +n_tokens=1 +kv_head=8 +head_size=128 +plot = False +use_max = False +n_max = 1 +if not use_max: + n_max = 10 + +plt.figure(figsize=(80,40)) +font_size = 20 + +def format(i, x_axis_name, y_axis_name, png_name): + plt.xticks(fontsize=font_size) + plt.yticks(fontsize=font_size) + plt.xlabel(x_axis_name, fontsize=font_size) + plt.ylabel(y_axis_name, fontsize=font_size) + plt.title('layer %i'%i,fontsize=font_size) + plt.rcParams.update({'font.size': font_size}) + plt.savefig(png_name) + +def plot_hideen_size(t:np, png_name, quant_group): # t.shape [n_req, n_layer, input_len, (kv_head*head_size)//quant_group] + t = np.transpose(t, (1,0,2,3)) + t = t.reshape(n_layer, -1) + for i in range(t.shape[0]): + print("Ploting %s layer %i "%(png_name, i)) + y=t[i:i+1].reshape(t.shape[1]) + x = np.arange(kv_head*head_size//quant_group) + x = np.repeat(x, t.shape[1]//(kv_head*head_size//quant_group)) + # print(y.shape) + # print(x.shape) + plt.subplot(4,8,i+1) + plt.plot(x, y, '*') + # plot1=plt.plot(x, y, '*',label=(f'layer %i', i)) + # z1 = np.polyfit(x, y, 4) + # p1 = np.poly1d(z1) + # # print(p1) + # yvals=np.polyval(z1,x) + # plot2=plt.plot(x, yvals, 'r',label=(f'polyfit layer %i', i)) + # plt.legend(loc=4)s + format(i, 'head_idx','scaling factor', 
png_name) + +def plot_per_value(t:np, png_name, quant_group): + t = np.transpose(t, (1,0,2,3)) + t = t.reshape(n_layer, -1, kv_head*head_size//quant_group) + for i in range(t.shape[0]): + print("Ploting %s layer %i "%(png_name, i)) + y= t[:,i,:] + y = y.tolist() + plt.subplot(4,8,i+1) + sns.histplot(y, bins=100, legend=False) + format(i, 'scaling factor', 'count bin', png_name) + +def loadtxt(txtname, quant_group): + key = np.loadtxt(txtname, delimiter='\n') + key = key.reshape(-1, n_layer, n_tokens, (kv_head*head_size)//quant_group) + return key + +def sorted_np(a:np, axis): + b=np.sort(a, axis)[::-1] + print( " ", a.shape[axis]) + global n_max + if n_max > a.shape[axis]: + n_max=0 + if axis == 0 or (len(a.shape) == 1 and axis ==-1): + c = b[n_max:n_max+1] + elif axis == 1 or (len(a.shape) == 2 and axis ==-1): + c = b[:,n_max:n_max+1] + elif axis == 2 or (len(a.shape) == 3 and axis ==-1): + c = b[:,:,n_max:n_max+1] + elif axis == 3 or (len(a.shape) == 4 and axis ==-1): + c = b[:,:,:,n_max:n_max+1] + return c + +def find_max(tensors, axis): + print(tensors.shape) + sorted_tensor = sorted_np(tensors, axis) + print("sorted_tensor.shape ", sorted_tensor.shape) + # print("sorted_tensor ", sorted_tensor) + # scale = np.reshape(scale, (-1)) + if use_max: + scale = np.max(tensors, axis=axis, keepdims=True) + else: + scale = sorted_tensor + print("scale.shape", scale.shape) + # print("scale, ", scale) + return scale + +def save_txt(save_name, tensor): + with open(save_name,'w', encoding='utf-8') as k_file: + for i in range(tensor.size): + k_file.write("%f\n"%tensor[i]) + +class NumpyEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, (np.int_, np.intc, np.intp, np.int8, + np.int16, np.int32, np.int64, np.uint8, + np.uint16, np.uint32, np.uint64)): + return int(obj) + elif isinstance(obj, (np.float_, np.float16, np.float32,np.float64)): + return float(obj) + elif isinstance(obj, (np.ndarray,)): + return obj.tolist() + return 
json.JSONEncoder.default(self, obj) + +def values_to_scaling_factor(scale, zp=None): + s = {} + z = {} + scale = np.reshape(scale, (n_layer, -1)) + # np.set_printoptions(threshold=np.inf) + print(scale.shape) + for i in range(scale.shape[0]): + layer_i_s = {} + layer_i_z = {} + for j in range(scale.shape[1]): + layer_i_s[f"%i"%j] = scale[i][j] + # print(scale[i][j]) + if zp is not None: + zp = np.reshape(zp, (n_layer, -1)) + layer_i_z[f"%i"%j] = zp[i][j] + else: + layer_i_z[f"%i"%j] = 0.0 + s[f"%i"%i] = layer_i_s + z[f"%i"%i] = layer_i_z + return s, z + +def save_to_json(out_dir, quant_group, k_scale, v_scale, k_zps=None, v_zps=None): + info = { + "model_type":"llama", + "kv_cache": { + "dtype": "int8", + "scaling_factor": { + } + } + } + data = json.loads(json.dumps(info)) + + if k_zps is not None: + k_s, k_z = values_to_scaling_factor(k_scale, k_zps) + v_s, v_z = values_to_scaling_factor(v_scale, v_zps) + else: + k_s, k_z = values_to_scaling_factor(k_scale) + v_s, v_z = values_to_scaling_factor(v_scale) + + scaling_factor = {"k_scale":k_s} + scaling_factor.update({"v_scale":v_s}) + scaling_factor.update({"k_zero_point":k_z}) + scaling_factor.update({"v_zero_point":v_z}) + # scaling_factor = {"scaling_factor": {k_s_info, v_s_info, k_z_info, v_z_info}} + data['kv_cache']['scaling_factor'] = scaling_factor + # print("json_data ", data) + if quant_group==-1: + json_name = "./kv_cache_scales_layer_level.json" + save_json = os.path.join(out_dir,json_name) + with open(save_json, 'w') as f: + json.dump(data, f, indent=4, cls=NumpyEncoder) + else: + json_name = "./kv_cache_scales_quant_group"+str(quant_group)+".json" + save_json = os.path.join(out_dir,json_name) + with open(save_json, 'w') as f: + json.dump(data, f, indent=4, cls=NumpyEncoder) + +def get_tensors_for_json(lists): + tensor = np.stack(lists, axis=0 ) + tensor_layer_level = torch.Tensor(tensor) + tensor_layer_level,_ = torch.max(tensor_layer_level, 1, True) + tensor_layer_level = tensor_layer_level.numpy() + 
tensor_layer_level = np.reshape(tensor_layer_level, (-1)).astype("float32") + tensor = np.reshape(tensor, (-1)).astype("float32") + return tensor, tensor_layer_level + +def _export_sym(key_stats: dict, + value_stats: dict, + bits: int, + out_dir: Union[str, Path], + tp: int = 1, + quant_group: int = 32) -> None: + """Export symmetric quantization parameters to specified directory.""" + keys_absmax = key_stats['absmax'] + values_absmax = value_stats['absmax'] + ks_lists, vs_lists = [], [] + for layer_idx, name in enumerate(keys_absmax.keys()): + k_absmax = keys_absmax[name] + v_absmax = values_absmax[name] + + heads, _ = k_absmax.shape + assert heads % tp == 0 + + mp_k_absmax = torch.chunk(k_absmax, tp) + mp_v_absmax = torch.chunk(v_absmax, tp) + for i in range(tp): + k_max = mp_k_absmax[i].reshape(-1, quant_group) + v_max = mp_v_absmax[i].reshape(-1, quant_group) + kmax, k_max_sp = torch.max(k_max, -1, True) + vmax, v_max_sp = torch.max(v_max, -1, True) + + k_scale = kmax / (2**(bits-1) - 1) + v_scale = vmax / (2**(bits-1) - 1) + + ks_lists.append(k_scale) + vs_lists.append(v_scale) + + k_scales, k_scales_layer_level = get_tensors_for_json(ks_lists) + v_scales, v_scales_layer_level = get_tensors_for_json(vs_lists) + # print("kkk ", k_scales.shape) + save_to_json(out_dir, quant_group, k_scales, v_scales) + save_to_json(out_dir, -1, k_scales_layer_level, v_scales_layer_level) + + if plot: + k_png = "savefig_k_cache.png" + v_png = "savefig_v_cache.png" + plot_hideen_size(k_scales, k_png, quant_group) + plt.clf() + plot_hideen_size(v_scales, v_png, quant_group) + plt.clf() + k_png_ = "savefig_k_cache_per_value.png" + v_png_ = "savefig_v_cache_per_value.png" + plot_per_value(k_scales, k_png_, quant_group) + plt.clf() + plot_per_value(v_scales, v_png_, quant_group) + plt.clf() + +def _export_asym(key_stats: dict, + value_stats: dict, + bits: int, + out_dir: Union[str, Path], + tp: int = 1, + quant_group: int = 32) -> None: + """Export asymmetric quantization parameters 
to specified directory.""" + keys_min = key_stats['min'] + values_min = value_stats['min'] + + keys_max = key_stats['max'] + values_max = value_stats['max'] + # print("key_stat ", type(key_stats)) + # print("value_stat ", type(value_stats)) + # print("key_stat ", key_stats.keys()) + # print("value_stat ", value_stats.keys()) + # print("key_stat ", key_stats) + # print("value_stat ", value_stats) + # print("key_stat[min].shape ", key_stats['min']['model.layers.0'].shape) + # print("value_stat[min].shape ", value_stats['min']['model.layers.0'].shape) + # print("key_stat[min] ", key_stats['min']['model.layers.0']) + # print("value_stat[min] ", value_stats['min']['model.layers.0']) + # print("key_stat[max] ", key_stats['max']['model.layers.0']) + # print("value_stat[max] ", value_stats['max']['model.layers.0']) + # print("key_stat[absmax] ", key_stats['absmax']['model.layers.0']) + # print("value_stat[absmax] ", value_stats['absmax']['model.layers.0']) + ks_lists, vs_lists = [], [] + kz_lists, vz_lists = [], [] + for layer_idx, name in enumerate(keys_min.keys()): + k_max = keys_max[name] + v_max = values_max[name] + + k_min = keys_min[name] + v_min = values_min[name] + + heads, _ = k_min.shape + assert heads % tp == 0 + + tp_k_min = torch.chunk(k_min, tp) + tp_v_min = torch.chunk(v_min, tp) + + tp_k_max = torch.chunk(k_max, tp) + tp_v_max = torch.chunk(v_max, tp) + for i in range(tp): + k_min = tp_k_min[i].reshape(-1, quant_group) + v_min = tp_v_min[i].reshape(-1, quant_group) + k_max = tp_k_max[i].reshape(-1, quant_group) + v_max = tp_v_max[i].reshape(-1, quant_group) + kmin, k_min_sp = torch.min(torch.abs(k_min), -1, True) + vmin, v_min_sp = torch.min(torch.abs(v_min), -1, True) + kmax, k_max_sp = torch.max(torch.abs(k_max), -1, True) + vmax, v_max_sp = torch.max(torch.abs(v_max), -1, True) + + k_scale = (kmax - kmin) / (2**bits - 1) + v_scale = (vmax - vmin) / (2**bits - 1) + k_zp = (kmax + kmin) / 2 + v_zp = (vmax + vmin) / 2 + + ks_lists.append(k_scale) + 
vs_lists.append(v_scale) + kz_lists.append(k_zp) + vz_lists.append(v_zp) + + k_scales, k_scales_layer_level = get_tensors_for_json(ks_lists) + v_scales, v_scales_layer_level = get_tensors_for_json(vs_lists) + k_zps, k_zps_layer_level = get_tensors_for_json(kz_lists) + v_zps, v_zps_layer_level = get_tensors_for_json(vz_lists) + + # print("kkk ", k_scales.shape) + save_to_json(out_dir, quant_group, k_scales, v_scales, k_zps, v_zps) + save_to_json(out_dir, -1, k_scales_layer_level, v_scales_layer_level, k_zps_layer_level, v_zps_layer_level) + + if plot: + k_png = "savefig_k_cache.png" + v_png = "savefig_v_cache.png" + plot_hideen_size(k_scales, k_png, quant_group) + plt.clf() + plot_hideen_size(v_scales, v_png, quant_group) + plt.clf() + k_png_ = "savefig_k_cache_per_value.png" + v_png_ = "savefig_v_cache_per_value.png" + plot_per_value(k_scales, k_png_, quant_group) + plt.clf() + plot_per_value(v_scales, v_png_, quant_group) + plt.clf() + +def main(work_dir: str, + kv_params_dir: str = './work_dir/', + kv_bits: int = 8, + quant_group: int = 128, + kv_sym: bool = True, + num_tp: int = 1) -> None: + """Main function to export key and value stats. + + Args: + work_dir (Union[str, Path]): Directory path where the stats are saved. + kv_params_dir (Union[str, Path]): Directory path where to + save the results. + kv_bits (int, optional): Number of bits for quantization. + Defaults to 8. + quant_group (int, optional): Number of values sharing one + scaling factor. Defaults to 128. + kv_sym (bool, optional): Whether to use symmetric quantization. + Defaults to True. + num_tp (int, optional): Number of tensor parallelism. Defaults to 1. 
+ """ + + work_dir = Path(work_dir) + + tm_dir = Path(kv_params_dir) + tm_dir.mkdir(parents=True, exist_ok=True) + + key_stats = torch.load(work_dir / 'key_stats.pth') + value_stats = torch.load(work_dir / 'value_stats.pth') + + if kv_sym: + _export_sym(key_stats, value_stats, kv_bits, tm_dir, num_tp, quant_group) + else: + _export_asym(key_stats, value_stats, kv_bits, tm_dir, num_tp, quant_group) + + +if __name__ == '__main__': + fire.Fire(main) diff --git a/examples/int8/observer.py b/examples/int8/observer.py new file mode 100755 index 000000000000..cf262492a8e9 --- /dev/null +++ b/examples/int8/observer.py @@ -0,0 +1,195 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Union + +import torch +from torch import nn + + +class GlobalAvailMixin: + """Mixin class to make instances globally available.""" + + _instances: Dict[str, Dict[Union[str, nn.Module], 'GlobalAvailMixin']] = { + 'default': {} + } + + def global_available(self, + key: Union[str, nn.Module] = 'default', + group: str = 'default') -> None: + """Make the instance globally available. + + Args: + key (Union[str, nn.Module], optional): Key to save the instance. + Defaults to 'default'. + group (str, optional): Group to save the instance. + Defaults to 'default'. + """ + self._save_instance(self, key, group) + + @classmethod + def _save_instance(cls, + instance: 'GlobalAvailMixin', + key: Union[str, nn.Module] = 'default', + group: str = 'default') -> None: + """Save the instance. + + Args: + instance (GlobalAvailMixin): Instance to save. + key (Union[str, nn.Module], optional): Key to save the instance. + Defaults to 'default'. + group (str, optional): Group to save the instance. + Defaults to 'default'. 
+ """ + if group not in cls._instances: + assert isinstance(group, str) + cls._instances[group] = {} + + cls._instances[group][key] = instance + + @classmethod + def find(cls, + key: Union[str, nn.Module] = 'default', + group: str = 'default') -> Union[None, 'GlobalAvailMixin']: + """Find an instance by its key and group. + + Args: + key (Union[str, nn.Module], optional): Key of the instance. + Defaults to 'default'. + group (str, optional): Group of the instance. + Defaults to 'default'. + + Returns: + Union[None, GlobalAvailMixin]: The found instance, or None if + it does not exist. + """ + return cls._instances.get(group, {}).get(key) + + @classmethod + def find_group( + cls, + group: str) -> Dict[Union[str, nn.Module], 'GlobalAvailMixin']: + """Find all instances in a group. + + Args: + group (str): Group of the instances. + + Returns: + Dict[Union[str, nn.Module], GlobalAvailMixin]: All instances in + the group. + """ + return cls._instances.get(group, {}) + + @classmethod + def instances( + cls) -> Dict[str, Dict[Union[str, nn.Module], 'GlobalAvailMixin']]: + """Get all instances.""" + return cls._instances + + +class KVCacheObserver(GlobalAvailMixin): + """A class to observe and record the max, min, and absolute max value of + given tensor.""" + + def __init__(self, num_head: int, head_dim: int) -> None: + """Constructor for KVCacheObserver. + + Args: + num_head : Number of heads + head_dim : Dimension of each head + """ + self.num_head = num_head + self.head_dim = head_dim + self.max_val = torch.full((num_head, head_dim), + -torch.inf, + dtype=torch.float16) + self.min_val = torch.full((num_head, head_dim), + torch.inf, + dtype=torch.float16) + self.absmax_val = torch.full((num_head, head_dim), + 0, + dtype=torch.float16) + + @torch.no_grad() + def observe(self, x: torch.Tensor) -> None: + """Function to observe the input tensor and update the max, min, and + absolute max values. 
+ + Args: + x : Input tensor + """ + assert len(x.shape) == 4 + + if x.size(1) == self.num_head and x.size(3) == self.head_dim: + # layout: (bs, heads, seqlen, dims) + x = x.transpose(1, 2) + elif x.size(2) != self.num_head or x.size(3) != self.head_dim: + raise RuntimeError('Unexpected dimensions for x, ' + 'expected (bs, num_head, seqlen, head_dim) ' + 'or (bs, seqlen, num_head, head_dim)') + + # print("x.shape ", x.shape) + # print("x.flatten(0, 1).shape ", x.flatten(0, 1).shape) + # print("x.flatten(0, 1).max(0)[0].shape ", x.flatten(0, 1).max(0)[0].shape) + cur_max = x.flatten(0, 1).max(0)[0].cpu() + cur_min = x.flatten(0, 1).min(0)[0].cpu() + cur_absmax = x.flatten(0, 1).abs().max(0)[0].cpu() + + self.max_val = torch.maximum(self.max_val, cur_max) + self.min_val = torch.minimum(self.min_val, cur_min) + self.absmax_val = torch.maximum(self.absmax_val, cur_absmax) + + +class ActivationObserver(GlobalAvailMixin): + """A class to observe and record the max, min, mean, absolute max, and + absolute mean value of a given tensor. + + Also keeps track of the number of batches observed. + """ + + def __init__(self, dim: int) -> None: + """Constructor for ActivationObserver. + + Args: + dim : Dimension of the tensor + """ + self.dim = dim + self.max_val = torch.full((dim, ), -torch.inf, dtype=torch.float16) + self.min_val = torch.full((dim, ), torch.inf, dtype=torch.float16) + self.absmax_val = torch.full((dim, ), 0, dtype=torch.float16) + self.absmean_val = torch.full((dim, ), 0, dtype=torch.float16) + self.mean_val = torch.full((dim, ), 0, dtype=torch.float16) + self.num_batches_tracked = 0 + + @torch.no_grad() + def observe(self, x: torch.Tensor) -> None: + """Function to observe the input tensor and update the max, min, mean, + absolute max, absolute mean values and number of batches tracked. 
+ + Args: + x : Input tensor + """ + assert len(x.shape) == 3 + assert x.size(2) == self.dim + cur_val = x.flatten(0, 1) + cur_max = cur_val.max(0)[0].cpu() + cur_min = cur_val.min(0)[0].cpu() + cur_mean = cur_val.mean(0).cpu() + + cur_abs = cur_val.abs() + cur_absmax = cur_abs.max(0)[0].cpu() + cur_absmean = cur_abs.mean(0).cpu() + + self.max_val = torch.maximum(self.max_val, cur_max) + self.min_val = torch.minimum(self.min_val, cur_min) + self.absmax_val = torch.maximum(self.absmax_val, cur_absmax) + + # Update mean and absmean value with accumulated sum divided + # by total number of batches + self.mean_val = ( + (self.mean_val * self.num_batches_tracked + cur_mean) / + (self.num_batches_tracked + 1)) + self.absmean_val = ( + (self.absmean_val * self.num_batches_tracked + cur_absmean) / + (self.num_batches_tracked + 1)) + + # Increment the count of batches tracked + self.num_batches_tracked += 1 diff --git a/examples/int8/run_calibrate.sh b/examples/int8/run_calibrate.sh new file mode 100755 index 000000000000..49e4258fbfd2 --- /dev/null +++ b/examples/int8/run_calibrate.sh @@ -0,0 +1,34 @@ +#!/bin/bash +export CUDA_VISIBLE_DEVICES=0 +datasets_path=/home/datasets/ +work_dir=./work_dir/ +datasets_name="ceval_val_cmcc ceval cmmlu cmb medmcqa medqa mmlu" +csv_name=LLaMA-Factory/evaluation/ +log_dir=./cali_log/ +for i in $datasets_name; +do + if [ "$i" == "ceval_val_cmcc" ]; then + calib_dataset_path=${datasets_path} + else + calib_dataset_path=${datasets_path}${csv_name}$i/ + fi + save_dir=${work_dir}$i/pth/ + [ ! -d ${save_dir} ] && mkdir ${save_dir} + [ ! 
-d ${log_dir} ] && mkdir ${log_dir} + log=${log_dir}llama3-8b-datasets_$i.log + echo "i=$i, calib_dataset_path=${calib_dataset_path}, save_dir=${save_dir}, log=${log}" + python calibrate.py /home/model_weights/Llama3-Chinese-8B-Instruct/ \ + --calib_dataset $i \ + --dataset_path ${calib_dataset_path} \ + --work_dir ${save_dir} \ + --device cuda\ + --calib_samples 128 \ + --calib_seqlen 2048 2>&1|tee ${log} + log=${log_dir}llama3-8b-datasets_${i}_json.log + save_dir_path=${work_dir}$i/ + python export_kv_params.py \ + --work_dir ${save_dir} \ + --kv_params_dir ${save_dir_path} \ + --quant_group 128 2>&1|tee ${log} +done + diff --git a/examples/int8/template.py b/examples/int8/template.py new file mode 100755 index 000000000000..515443b0c1da --- /dev/null +++ b/examples/int8/template.py @@ -0,0 +1,87 @@ +# Copyright 2024 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from enum import Enum, unique +from dataclasses import dataclass +from typing import Dict, List, Sequence, Tuple +CHOICES = ["A", "B", "C", "D"] +SUBJECTS = ["Average", "STEM", "Social Sciences", "Humanities", "Other"] + +@unique +class Role(str, Enum): + USER = "user" + ASSISTANT = "assistant" + SYSTEM = "system" + FUNCTION = "function" + OBSERVATION = "observation" + +@dataclass +class EvalTemplate: + system: str + choice: str + answer: str + + def _parse_example(self, example: Dict[str, str]) -> Tuple[str, str]: + r""" + input: a dict with keys {"question", "A", "B", "C", "D", "answer"} + output: a tuple of (prompt, response) + """ + candidates = [self.choice.format(choice=ch, content=example[ch]) for ch in CHOICES if ch in example] + return "".join([example["question"]] + candidates + [self.answer]), example["answer"] + + def format_example( + self, target_data: Dict[str, str], support_set: Sequence[Dict[str, str]], subject_name: str + ) -> List[Dict[str, str]]: + r""" + Converts dataset examples to messages. 
+ """ + messages = [] + for k in range(len(support_set)): + prompt, response = self._parse_example(support_set[k]) + messages.append({"role": Role.USER.value, "content": prompt}) + messages.append({"role": Role.ASSISTANT.value, "content": response}) + + prompt, response = self._parse_example(target_data) + messages.append({"role": Role.USER.value, "content": prompt}) + messages.append({"role": Role.ASSISTANT.value, "content": response}) + messages[0]["content"] = self.system.format(subject=subject_name) + messages[0]["content"] + return messages + + +eval_templates: Dict[str, "EvalTemplate"] = {} + + +def _register_eval_template(name: str, system: str, choice: str, answer: str) -> None: + eval_templates[name] = EvalTemplate(system=system, choice=choice, answer=answer) + + +def get_eval_template(name: str) -> "EvalTemplate": + eval_template = eval_templates.get(name, None) + assert eval_template is not None, "Template {} does not exist.".format(name) + return eval_template + + +_register_eval_template( + name="en", + system="The following are multiple choice questions (with answers) about {subject}.\n\n", + choice="\n{choice}. {content}", + answer="\nAnswer:", +) + + +_register_eval_template( + name="zh", + system="以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n", + choice="\n{choice}. {content}", + answer="\n答案:", +) diff --git a/examples/int8/utils.py b/examples/int8/utils.py new file mode 100755 index 000000000000..fcd0bf230acf --- /dev/null +++ b/examples/int8/utils.py @@ -0,0 +1,167 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Any, Dict, List, Tuple, Union + +import torch +from torch import nn + + +def split_decoder_layer_inputs( + *args: Union[torch.Tensor, Any], **kwargs: Union[torch.Tensor, Any] +) -> Tuple[List[List[Any]], List[Dict[str, Any]]]: + """This function splits batched decoder layer inputs into individual + elements. 
+ + Args: + *args (Union[torch.Tensor, Any]): Positional arguments which could + be a mix of tensors and other types. + **kwargs (Union[torch.Tensor, Any]): Keyword arguments which could + be a mix of tensors and other types. + + Returns: + Tuple[List[List[Any]], List[Dict[str, Any]]]: A tuple containing two + lists, one for positional arguments, one for keyword arguments. + Each list contains individual elements from the batch. + """ + + if not isinstance(args[0], torch.Tensor): + raise ValueError('The first argument must be a Tensor') + + bs = args[0].size(0) + + batch_args = [] + batch_kwargs = [] + for i in range(bs): + new_args = [] + # Iterate over each argument. If it's a torch.Tensor and its first + # dimension equals the batch size, then get the value corresponding + # to the current index, else directly add the whole value. + for val in args: + if isinstance(val, torch.Tensor) and val.size(0) == bs: + new_args.append(val[i:i + 1]) + else: + new_args.append(val) + + new_kwargs = {} + # Execute the same operation for the keyword arguments. + for name, val in kwargs.items(): + if isinstance(val, torch.Tensor) and val.size(0) == bs: + new_kwargs[name] = val[i:i + 1] + else: + new_kwargs[name] = val + + batch_args.append(new_args) + batch_kwargs.append(new_kwargs) + + return batch_args, batch_kwargs + + +def concat_decoder_layer_outputs( + batch_outputs: List[Tuple[Any]]) -> Tuple[Any]: + """This function concatenates individual decoder layer outputs into a + batched output. + + Args: + batch_outputs (List[Tuple[Any]]): A list of tuples, where each tuple + represents the output from an individual element in the batch. + + Returns: + Tuple[Any]: A tuple representing the batched output. + """ + + num_returns = len(batch_outputs[0]) + + def is_past_key_value(data: Any) -> bool: + """Check whether data is a past key-value pair. + + Args: + data (Any): The data to check. + + Returns: + bool: True if data is a past key-value pair, False otherwise. 
+ """ + flag = isinstance(data, tuple) + flag = flag and len(data) == 2 + flag = flag and isinstance(data[0], torch.Tensor) + flag = flag and isinstance(data[1], torch.Tensor) + return flag + + new_outputs = [] + + # Iterate over all types of return values. + for i in range(num_returns): + # Check if the current element is a past key-value pair. + flag = is_past_key_value(batch_outputs[0][i]) + if flag: + # Concatenate the keys and values separately. + key = torch.cat([out[i][0] for out in batch_outputs]) + value = torch.cat([out[i][1] for out in batch_outputs]) + out_i = (key, value) + else: + # If it's not a past key-value pair, concatenate directly. + out_i = torch.cat([out[i] for out in batch_outputs]) + new_outputs.append(out_i) + + return tuple(new_outputs) + + +def collect_target_modules( + model: nn.Module, + # target: Union[str, type], + target: str, + skip_names: List[str] = None, + prefix: str = '') -> Dict[str, nn.Module]: + """Collects the specific target modules from the model. + + Args: + model : The PyTorch module from which to collect the target modules. + target : The specific target to be collected. It can be a class of a + module or the name of a module. + skip_names : List of names of modules to be skipped during collection. + prefix : A string to be added as a prefix to the module names. + + Returns: + A dictionary mapping from module names to module instances. 
+ """ + + # if isinstance(target, LazyAttr): + # target = target.build() + if skip_names is None: + skip_names = [] + if not isinstance(target, (type, str)): + raise TypeError('Target must be a string (name of the module) ' + 'or a type (class of the module)') + + def _is_target(n, m): + if isinstance(target, str): + return target == type(m).__name__ and n not in skip_names + return isinstance(m, target) and n not in skip_names + + name2mod = {} + for name, mod in model.named_modules(): + m_name = f'{prefix}.{name}' if prefix else name + if _is_target(name, mod): + name2mod[m_name] = mod + return name2mod + + +def bimap_name_mod( + name2mod_mappings: List[Dict[str, nn.Module]] +) -> Tuple[Dict[str, nn.Module], Dict[nn.Module, str]]: + """Generates bidirectional maps from module names to module instances and + vice versa. + + Args: + name2mod_mappings : List of dictionaries each mapping from module + names to module instances. + + Returns: + Two dictionaries providing bidirectional mappings between module + names and module instances. 
+ """ + + name2mod = {} + mod2name = {} + for mapping in name2mod_mappings: + mod2name.update({v: k for k, v in mapping.items()}) + name2mod.update(mapping) + return name2mod, mod2name diff --git a/examples/int8/work_dir/ceval/kv_cache_scales_layer_level.json b/examples/int8/work_dir/ceval/kv_cache_scales_layer_level.json new file mode 100644 index 000000000000..eb3fb499e2c9 --- /dev/null +++ b/examples/int8/work_dir/ceval/kv_cache_scales_layer_level.json @@ -0,0 +1,400 @@ +{ + "model_type": "llama", + "kv_cache": { + "dtype": "int8", + "scaling_factor": { + "k_scale": { + "0": { + "0": 0.1298828125 + }, + "1": { + "0": 0.11419677734375 + }, + "2": { + "0": 0.146240234375 + }, + "3": { + "0": 0.1583251953125 + }, + "4": { + "0": 0.1766357421875 + }, + "5": { + "0": 0.155029296875 + }, + "6": { + "0": 0.1474609375 + }, + "7": { + "0": 0.1646728515625 + }, + "8": { + "0": 0.1824951171875 + }, + "9": { + "0": 0.1763916015625 + }, + "10": { + "0": 0.1644287109375 + }, + "11": { + "0": 0.1651611328125 + }, + "12": { + "0": 0.1641845703125 + }, + "13": { + "0": 0.1453857421875 + }, + "14": { + "0": 0.1622314453125 + }, + "15": { + "0": 0.153076171875 + }, + "16": { + "0": 0.1800537109375 + }, + "17": { + "0": 0.1478271484375 + }, + "18": { + "0": 0.1488037109375 + }, + "19": { + "0": 0.1578369140625 + }, + "20": { + "0": 0.16064453125 + }, + "21": { + "0": 0.169189453125 + }, + "22": { + "0": 0.159423828125 + }, + "23": { + "0": 0.1591796875 + }, + "24": { + "0": 0.16552734375 + }, + "25": { + "0": 0.177734375 + }, + "26": { + "0": 0.177490234375 + }, + "27": { + "0": 0.165283203125 + }, + "28": { + "0": 0.166748046875 + }, + "29": { + "0": 0.2744140625 + }, + "30": { + "0": 0.15283203125 + }, + "31": { + "0": 0.1715087890625 + } + }, + "v_scale": { + "0": { + "0": 0.0043487548828125 + }, + "1": { + "0": 0.026824951171875 + }, + "2": { + "0": 0.014801025390625 + }, + "3": { + "0": 0.021697998046875 + }, + "4": { + "0": 0.0166778564453125 + }, + "5": { + "0": 
0.0183868408203125 + }, + "6": { + "0": 0.0255279541015625 + }, + "7": { + "0": 0.02130126953125 + }, + "8": { + "0": 0.0220794677734375 + }, + "9": { + "0": 0.0289154052734375 + }, + "10": { + "0": 0.0200042724609375 + }, + "11": { + "0": 0.0249176025390625 + }, + "12": { + "0": 0.020233154296875 + }, + "13": { + "0": 0.0272369384765625 + }, + "14": { + "0": 0.0230712890625 + }, + "15": { + "0": 0.02984619140625 + }, + "16": { + "0": 0.0198822021484375 + }, + "17": { + "0": 0.0206298828125 + }, + "18": { + "0": 0.0265045166015625 + }, + "19": { + "0": 0.02459716796875 + }, + "20": { + "0": 0.0234375 + }, + "21": { + "0": 0.0258941650390625 + }, + "22": { + "0": 0.035430908203125 + }, + "23": { + "0": 0.028411865234375 + }, + "24": { + "0": 0.0478515625 + }, + "25": { + "0": 0.03515625 + }, + "26": { + "0": 0.036163330078125 + }, + "27": { + "0": 0.044891357421875 + }, + "28": { + "0": 0.04412841796875 + }, + "29": { + "0": 0.054107666015625 + }, + "30": { + "0": 0.053497314453125 + }, + "31": { + "0": 0.05218505859375 + } + }, + "k_zero_point": { + "0": { + "0": 0.0 + }, + "1": { + "0": 0.0 + }, + "2": { + "0": 0.0 + }, + "3": { + "0": 0.0 + }, + "4": { + "0": 0.0 + }, + "5": { + "0": 0.0 + }, + "6": { + "0": 0.0 + }, + "7": { + "0": 0.0 + }, + "8": { + "0": 0.0 + }, + "9": { + "0": 0.0 + }, + "10": { + "0": 0.0 + }, + "11": { + "0": 0.0 + }, + "12": { + "0": 0.0 + }, + "13": { + "0": 0.0 + }, + "14": { + "0": 0.0 + }, + "15": { + "0": 0.0 + }, + "16": { + "0": 0.0 + }, + "17": { + "0": 0.0 + }, + "18": { + "0": 0.0 + }, + "19": { + "0": 0.0 + }, + "20": { + "0": 0.0 + }, + "21": { + "0": 0.0 + }, + "22": { + "0": 0.0 + }, + "23": { + "0": 0.0 + }, + "24": { + "0": 0.0 + }, + "25": { + "0": 0.0 + }, + "26": { + "0": 0.0 + }, + "27": { + "0": 0.0 + }, + "28": { + "0": 0.0 + }, + "29": { + "0": 0.0 + }, + "30": { + "0": 0.0 + }, + "31": { + "0": 0.0 + } + }, + "v_zero_point": { + "0": { + "0": 0.0 + }, + "1": { + "0": 0.0 + }, + "2": { + "0": 0.0 + }, + "3": { + 
"0": 0.0 + }, + "4": { + "0": 0.0 + }, + "5": { + "0": 0.0 + }, + "6": { + "0": 0.0 + }, + "7": { + "0": 0.0 + }, + "8": { + "0": 0.0 + }, + "9": { + "0": 0.0 + }, + "10": { + "0": 0.0 + }, + "11": { + "0": 0.0 + }, + "12": { + "0": 0.0 + }, + "13": { + "0": 0.0 + }, + "14": { + "0": 0.0 + }, + "15": { + "0": 0.0 + }, + "16": { + "0": 0.0 + }, + "17": { + "0": 0.0 + }, + "18": { + "0": 0.0 + }, + "19": { + "0": 0.0 + }, + "20": { + "0": 0.0 + }, + "21": { + "0": 0.0 + }, + "22": { + "0": 0.0 + }, + "23": { + "0": 0.0 + }, + "24": { + "0": 0.0 + }, + "25": { + "0": 0.0 + }, + "26": { + "0": 0.0 + }, + "27": { + "0": 0.0 + }, + "28": { + "0": 0.0 + }, + "29": { + "0": 0.0 + }, + "30": { + "0": 0.0 + }, + "31": { + "0": 0.0 + } + } + } + } +} \ No newline at end of file diff --git a/examples/int8/work_dir/ceval/kv_cache_scales_quant_group128.json b/examples/int8/work_dir/ceval/kv_cache_scales_quant_group128.json new file mode 100644 index 000000000000..0459f3f01d75 --- /dev/null +++ b/examples/int8/work_dir/ceval/kv_cache_scales_quant_group128.json @@ -0,0 +1,1296 @@ +{ + "model_type": "llama", + "kv_cache": { + "dtype": "int8", + "scaling_factor": { + "k_scale": { + "0": { + "0": 0.097412109375, + "1": 0.07586669921875, + "2": 0.08447265625, + "3": 0.05828857421875, + "4": 0.07098388671875, + "5": 0.061798095703125, + "6": 0.1298828125, + "7": 0.050567626953125 + }, + "1": { + "0": 0.088623046875, + "1": 0.11419677734375, + "2": 0.10955810546875, + "3": 0.0762939453125, + "4": 0.10211181640625, + "5": 0.07965087890625, + "6": 0.08648681640625, + "7": 0.08721923828125 + }, + "2": { + "0": 0.1270751953125, + "1": 0.146240234375, + "2": 0.1002197265625, + "3": 0.1373291015625, + "4": 0.10894775390625, + "5": 0.12646484375, + "6": 0.1400146484375, + "7": 0.126220703125 + }, + "3": { + "0": 0.1300048828125, + "1": 0.09222412109375, + "2": 0.152099609375, + "3": 0.13330078125, + "4": 0.1307373046875, + "5": 0.1240234375, + "6": 0.1275634765625, + "7": 0.1583251953125 + }, 
+ "4": { + "0": 0.0838623046875, + "1": 0.1314697265625, + "2": 0.1290283203125, + "3": 0.10797119140625, + "4": 0.1766357421875, + "5": 0.11175537109375, + "6": 0.12457275390625, + "7": 0.1300048828125 + }, + "5": { + "0": 0.109375, + "1": 0.11749267578125, + "2": 0.12384033203125, + "3": 0.11236572265625, + "4": 0.1055908203125, + "5": 0.1197509765625, + "6": 0.155029296875, + "7": 0.10772705078125 + }, + "6": { + "0": 0.12347412109375, + "1": 0.147216796875, + "2": 0.13720703125, + "3": 0.09906005859375, + "4": 0.1474609375, + "5": 0.13427734375, + "6": 0.109619140625, + "7": 0.1448974609375 + }, + "7": { + "0": 0.10699462890625, + "1": 0.1612548828125, + "2": 0.128173828125, + "3": 0.1463623046875, + "4": 0.15234375, + "5": 0.12255859375, + "6": 0.12298583984375, + "7": 0.1646728515625 + }, + "8": { + "0": 0.12152099609375, + "1": 0.11041259765625, + "2": 0.1824951171875, + "3": 0.158935546875, + "4": 0.15380859375, + "5": 0.1302490234375, + "6": 0.1239013671875, + "7": 0.1318359375 + }, + "9": { + "0": 0.140380859375, + "1": 0.1204833984375, + "2": 0.1763916015625, + "3": 0.132080078125, + "4": 0.137939453125, + "5": 0.124755859375, + "6": 0.10223388671875, + "7": 0.11492919921875 + }, + "10": { + "0": 0.164306640625, + "1": 0.125244140625, + "2": 0.1051025390625, + "3": 0.11895751953125, + "4": 0.111572265625, + "5": 0.1644287109375, + "6": 0.1334228515625, + "7": 0.1522216796875 + }, + "11": { + "0": 0.1397705078125, + "1": 0.129638671875, + "2": 0.1387939453125, + "3": 0.131103515625, + "4": 0.12548828125, + "5": 0.119873046875, + "6": 0.1651611328125, + "7": 0.11895751953125 + }, + "12": { + "0": 0.1429443359375, + "1": 0.1539306640625, + "2": 0.160888671875, + "3": 0.10943603515625, + "4": 0.14501953125, + "5": 0.1641845703125, + "6": 0.1573486328125, + "7": 0.1533203125 + }, + "13": { + "0": 0.127685546875, + "1": 0.1453857421875, + "2": 0.1297607421875, + "3": 0.1285400390625, + "4": 0.1431884765625, + "5": 0.132568359375, + "6": 0.1279296875, + "7": 
0.1275634765625 + }, + "14": { + "0": 0.1314697265625, + "1": 0.1397705078125, + "2": 0.1622314453125, + "3": 0.142333984375, + "4": 0.15966796875, + "5": 0.1458740234375, + "6": 0.11279296875, + "7": 0.1356201171875 + }, + "15": { + "0": 0.1258544921875, + "1": 0.1512451171875, + "2": 0.133544921875, + "3": 0.1407470703125, + "4": 0.08563232421875, + "5": 0.153076171875, + "6": 0.1448974609375, + "7": 0.11273193359375 + }, + "16": { + "0": 0.1500244140625, + "1": 0.1593017578125, + "2": 0.13916015625, + "3": 0.1800537109375, + "4": 0.12322998046875, + "5": 0.1221923828125, + "6": 0.1160888671875, + "7": 0.1483154296875 + }, + "17": { + "0": 0.12939453125, + "1": 0.1478271484375, + "2": 0.095458984375, + "3": 0.1302490234375, + "4": 0.1182861328125, + "5": 0.141845703125, + "6": 0.1318359375, + "7": 0.138427734375 + }, + "18": { + "0": 0.1168212890625, + "1": 0.1156005859375, + "2": 0.1220703125, + "3": 0.131103515625, + "4": 0.135498046875, + "5": 0.12054443359375, + "6": 0.1488037109375, + "7": 0.1444091796875 + }, + "19": { + "0": 0.11004638671875, + "1": 0.12005615234375, + "2": 0.1578369140625, + "3": 0.1260986328125, + "4": 0.0750732421875, + "5": 0.10833740234375, + "6": 0.1395263671875, + "7": 0.11346435546875 + }, + "20": { + "0": 0.0986328125, + "1": 0.16064453125, + "2": 0.1185302734375, + "3": 0.108154296875, + "4": 0.1318359375, + "5": 0.15283203125, + "6": 0.12646484375, + "7": 0.12078857421875 + }, + "21": { + "0": 0.132568359375, + "1": 0.10723876953125, + "2": 0.169189453125, + "3": 0.1300048828125, + "4": 0.1533203125, + "5": 0.1324462890625, + "6": 0.1654052734375, + "7": 0.12030029296875 + }, + "22": { + "0": 0.1199951171875, + "1": 0.159423828125, + "2": 0.1376953125, + "3": 0.12298583984375, + "4": 0.1092529296875, + "5": 0.1387939453125, + "6": 0.137451171875, + "7": 0.1434326171875 + }, + "23": { + "0": 0.154296875, + "1": 0.1077880859375, + "2": 0.1314697265625, + "3": 0.1278076171875, + "4": 0.149169921875, + "5": 0.114990234375, + "6": 
0.1591796875, + "7": 0.1563720703125 + }, + "24": { + "0": 0.138916015625, + "1": 0.160400390625, + "2": 0.16552734375, + "3": 0.1451416015625, + "4": 0.107421875, + "5": 0.138671875, + "6": 0.12744140625, + "7": 0.132080078125 + }, + "25": { + "0": 0.10772705078125, + "1": 0.1131591796875, + "2": 0.13232421875, + "3": 0.1038818359375, + "4": 0.177734375, + "5": 0.1641845703125, + "6": 0.168212890625, + "7": 0.164306640625 + }, + "26": { + "0": 0.177490234375, + "1": 0.154052734375, + "2": 0.11138916015625, + "3": 0.11676025390625, + "4": 0.166259765625, + "5": 0.148681640625, + "6": 0.1492919921875, + "7": 0.1375732421875 + }, + "27": { + "0": 0.1578369140625, + "1": 0.11749267578125, + "2": 0.155517578125, + "3": 0.1304931640625, + "4": 0.15283203125, + "5": 0.1265869140625, + "6": 0.165283203125, + "7": 0.11944580078125 + }, + "28": { + "0": 0.136962890625, + "1": 0.1541748046875, + "2": 0.166748046875, + "3": 0.13134765625, + "4": 0.142333984375, + "5": 0.1431884765625, + "6": 0.1170654296875, + "7": 0.14013671875 + }, + "29": { + "0": 0.1473388671875, + "1": 0.1697998046875, + "2": 0.1317138671875, + "3": 0.1513671875, + "4": 0.12384033203125, + "5": 0.11541748046875, + "6": 0.2744140625, + "7": 0.15869140625 + }, + "30": { + "0": 0.11639404296875, + "1": 0.15283203125, + "2": 0.1400146484375, + "3": 0.13623046875, + "4": 0.113037109375, + "5": 0.12286376953125, + "6": 0.152099609375, + "7": 0.130126953125 + }, + "31": { + "0": 0.1715087890625, + "1": 0.11456298828125, + "2": 0.1407470703125, + "3": 0.1402587890625, + "4": 0.12548828125, + "5": 0.120849609375, + "6": 0.135009765625, + "7": 0.114990234375 + } + }, + "v_scale": { + "0": { + "0": 0.003154754638671875, + "1": 0.003971099853515625, + "2": 0.003414154052734375, + "3": 0.002643585205078125, + "4": 0.0043487548828125, + "5": 0.00251007080078125, + "6": 0.00362396240234375, + "7": 0.0037975311279296875 + }, + "1": { + "0": 0.005405426025390625, + "1": 0.0030059814453125, + "2": 0.01439666748046875, + 
"3": 0.00389862060546875, + "4": 0.005878448486328125, + "5": 0.005405426025390625, + "6": 0.0071258544921875, + "7": 0.026824951171875 + }, + "2": { + "0": 0.00878143310546875, + "1": 0.0132904052734375, + "2": 0.009307861328125, + "3": 0.0104522705078125, + "4": 0.01003265380859375, + "5": 0.01093292236328125, + "6": 0.014801025390625, + "7": 0.00994873046875 + }, + "3": { + "0": 0.01197052001953125, + "1": 0.01363372802734375, + "2": 0.0164642333984375, + "3": 0.0101165771484375, + "4": 0.021697998046875, + "5": 0.016326904296875, + "6": 0.01226806640625, + "7": 0.019378662109375 + }, + "4": { + "0": 0.0139007568359375, + "1": 0.01061248779296875, + "2": 0.01143646240234375, + "3": 0.0157928466796875, + "4": 0.01212310791015625, + "5": 0.009979248046875, + "6": 0.0166778564453125, + "7": 0.01116180419921875 + }, + "5": { + "0": 0.0183868408203125, + "1": 0.01389312744140625, + "2": 0.0174713134765625, + "3": 0.01357269287109375, + "4": 0.0124969482421875, + "5": 0.0167083740234375, + "6": 0.01593017578125, + "7": 0.017822265625 + }, + "6": { + "0": 0.017852783203125, + "1": 0.014984130859375, + "2": 0.0153656005859375, + "3": 0.0200042724609375, + "4": 0.01776123046875, + "5": 0.0164947509765625, + "6": 0.0255279541015625, + "7": 0.01137542724609375 + }, + "7": { + "0": 0.018218994140625, + "1": 0.02130126953125, + "2": 0.0161895751953125, + "3": 0.01338958740234375, + "4": 0.018157958984375, + "5": 0.01232147216796875, + "6": 0.01495361328125, + "7": 0.019073486328125 + }, + "8": { + "0": 0.0130767822265625, + "1": 0.0220794677734375, + "2": 0.016204833984375, + "3": 0.0174102783203125, + "4": 0.0191802978515625, + "5": 0.015899658203125, + "6": 0.01369476318359375, + "7": 0.0148468017578125 + }, + "9": { + "0": 0.0140838623046875, + "1": 0.0289154052734375, + "2": 0.02130126953125, + "3": 0.01824951171875, + "4": 0.0164642333984375, + "5": 0.0174713134765625, + "6": 0.017608642578125, + "7": 0.018402099609375 + }, + "10": { + "0": 0.0175323486328125, + "1": 
0.0187835693359375, + "2": 0.0200042724609375, + "3": 0.01288604736328125, + "4": 0.01253509521484375, + "5": 0.01348114013671875, + "6": 0.0145721435546875, + "7": 0.01348114013671875 + }, + "11": { + "0": 0.0166778564453125, + "1": 0.0160064697265625, + "2": 0.0217742919921875, + "3": 0.0177764892578125, + "4": 0.0158843994140625, + "5": 0.0249176025390625, + "6": 0.0235595703125, + "7": 0.0149383544921875 + }, + "12": { + "0": 0.0171966552734375, + "1": 0.0185394287109375, + "2": 0.019500732421875, + "3": 0.016876220703125, + "4": 0.020233154296875, + "5": 0.017364501953125, + "6": 0.01898193359375, + "7": 0.01885986328125 + }, + "13": { + "0": 0.0172576904296875, + "1": 0.019195556640625, + "2": 0.0185394287109375, + "3": 0.0159149169921875, + "4": 0.020843505859375, + "5": 0.0206451416015625, + "6": 0.01641845703125, + "7": 0.0272369384765625 + }, + "14": { + "0": 0.017822265625, + "1": 0.0169525146484375, + "2": 0.0230712890625, + "3": 0.0157318115234375, + "4": 0.0205230712890625, + "5": 0.0190887451171875, + "6": 0.0199737548828125, + "7": 0.0170135498046875 + }, + "15": { + "0": 0.016693115234375, + "1": 0.01515960693359375, + "2": 0.0175933837890625, + "3": 0.0155487060546875, + "4": 0.02984619140625, + "5": 0.0170745849609375, + "6": 0.0236663818359375, + "7": 0.0191650390625 + }, + "16": { + "0": 0.01739501953125, + "1": 0.01314544677734375, + "2": 0.01227569580078125, + "3": 0.0198211669921875, + "4": 0.01727294921875, + "5": 0.0187530517578125, + "6": 0.0152130126953125, + "7": 0.0198822021484375 + }, + "17": { + "0": 0.0140380859375, + "1": 0.0167236328125, + "2": 0.01544189453125, + "3": 0.011993408203125, + "4": 0.01654052734375, + "5": 0.016082763671875, + "6": 0.0206298828125, + "7": 0.01654052734375 + }, + "18": { + "0": 0.01206207275390625, + "1": 0.0265045166015625, + "2": 0.0192108154296875, + "3": 0.0170745849609375, + "4": 0.025604248046875, + "5": 0.023468017578125, + "6": 0.021453857421875, + "7": 0.0167388916015625 + }, + "19": { + "0": 
0.0161590576171875, + "1": 0.021026611328125, + "2": 0.01546478271484375, + "3": 0.01800537109375, + "4": 0.0180511474609375, + "5": 0.02459716796875, + "6": 0.0172119140625, + "7": 0.0237579345703125 + }, + "20": { + "0": 0.02044677734375, + "1": 0.0234375, + "2": 0.016845703125, + "3": 0.021026611328125, + "4": 0.0220184326171875, + "5": 0.02044677734375, + "6": 0.0188446044921875, + "7": 0.020721435546875 + }, + "21": { + "0": 0.0131988525390625, + "1": 0.0258941650390625, + "2": 0.020172119140625, + "3": 0.0177001953125, + "4": 0.0175933837890625, + "5": 0.0248260498046875, + "6": 0.0190582275390625, + "7": 0.021759033203125 + }, + "22": { + "0": 0.02996826171875, + "1": 0.0155487060546875, + "2": 0.018463134765625, + "3": 0.035430908203125, + "4": 0.030181884765625, + "5": 0.0168304443359375, + "6": 0.016265869140625, + "7": 0.03485107421875 + }, + "23": { + "0": 0.019775390625, + "1": 0.028411865234375, + "2": 0.017059326171875, + "3": 0.022705078125, + "4": 0.0172882080078125, + "5": 0.0252227783203125, + "6": 0.0189971923828125, + "7": 0.0240936279296875 + }, + "24": { + "0": 0.0286102294921875, + "1": 0.019439697265625, + "2": 0.0214691162109375, + "3": 0.0253753662109375, + "4": 0.03265380859375, + "5": 0.0292816162109375, + "6": 0.0478515625, + "7": 0.0278167724609375 + }, + "25": { + "0": 0.0233001708984375, + "1": 0.033172607421875, + "2": 0.01971435546875, + "3": 0.034149169921875, + "4": 0.0211334228515625, + "5": 0.03515625, + "6": 0.0159454345703125, + "7": 0.023773193359375 + }, + "26": { + "0": 0.0233306884765625, + "1": 0.0200042724609375, + "2": 0.0195465087890625, + "3": 0.036163330078125, + "4": 0.0215606689453125, + "5": 0.032928466796875, + "6": 0.0188446044921875, + "7": 0.0247650146484375 + }, + "27": { + "0": 0.02325439453125, + "1": 0.041290283203125, + "2": 0.02734375, + "3": 0.0179290771484375, + "4": 0.0268402099609375, + "5": 0.044891357421875, + "6": 0.019622802734375, + "7": 0.0302276611328125 + }, + "28": { + "0": 
0.0233917236328125, + "1": 0.04412841796875, + "2": 0.0293731689453125, + "3": 0.035919189453125, + "4": 0.0293426513671875, + "5": 0.02978515625, + "6": 0.0274810791015625, + "7": 0.0169677734375 + }, + "29": { + "0": 0.028594970703125, + "1": 0.02667236328125, + "2": 0.028839111328125, + "3": 0.0227203369140625, + "4": 0.035064697265625, + "5": 0.054107666015625, + "6": 0.051300048828125, + "7": 0.0281829833984375 + }, + "30": { + "0": 0.0258331298828125, + "1": 0.0223541259765625, + "2": 0.029876708984375, + "3": 0.053497314453125, + "4": 0.029876708984375, + "5": 0.0284576416015625, + "6": 0.034820556640625, + "7": 0.04473876953125 + }, + "31": { + "0": 0.0309906005859375, + "1": 0.027496337890625, + "2": 0.023895263671875, + "3": 0.05218505859375, + "4": 0.0271453857421875, + "5": 0.039642333984375, + "6": 0.029144287109375, + "7": 0.019866943359375 + } + }, + "k_zero_point": { + "0": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "1": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "2": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "3": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "4": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "5": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "6": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "7": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "8": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "9": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "10": { + 
"0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "11": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "12": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "13": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "14": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "15": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "16": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "17": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "18": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "19": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "20": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "21": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "22": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "23": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "24": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "25": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "26": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "27": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "28": { + "0": 0.0, + "1": 
0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "29": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "30": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "31": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + } + }, + "v_zero_point": { + "0": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "1": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "2": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "3": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "4": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "5": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "6": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "7": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "8": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "9": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "10": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "11": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "12": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "13": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "14": { + "0": 0.0, + "1": 0.0, 
+ "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "15": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "16": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "17": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "18": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "19": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "20": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "21": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "22": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "23": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "24": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "25": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "26": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "27": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "28": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "29": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "30": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "31": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + } + } + } + } +} \ No newline at end of file diff --git 
a/examples/int8/work_dir/ceval_val_cmcc/kv_cache_scales_layer_level.json b/examples/int8/work_dir/ceval_val_cmcc/kv_cache_scales_layer_level.json new file mode 100644 index 000000000000..4534943078b3 --- /dev/null +++ b/examples/int8/work_dir/ceval_val_cmcc/kv_cache_scales_layer_level.json @@ -0,0 +1,400 @@ +{ + "model_type": "llama", + "kv_cache": { + "dtype": "int8", + "scaling_factor": { + "k_scale": { + "0": { + "0": 0.1224365234375 + }, + "1": { + "0": 0.11419677734375 + }, + "2": { + "0": 0.14453125 + }, + "3": { + "0": 0.1549072265625 + }, + "4": { + "0": 0.170166015625 + }, + "5": { + "0": 0.1585693359375 + }, + "6": { + "0": 0.145263671875 + }, + "7": { + "0": 0.1636962890625 + }, + "8": { + "0": 0.170166015625 + }, + "9": { + "0": 0.1697998046875 + }, + "10": { + "0": 0.166015625 + }, + "11": { + "0": 0.1634521484375 + }, + "12": { + "0": 0.16259765625 + }, + "13": { + "0": 0.1453857421875 + }, + "14": { + "0": 0.17041015625 + }, + "15": { + "0": 0.1512451171875 + }, + "16": { + "0": 0.179443359375 + }, + "17": { + "0": 0.1502685546875 + }, + "18": { + "0": 0.15234375 + }, + "19": { + "0": 0.16357421875 + }, + "20": { + "0": 0.1566162109375 + }, + "21": { + "0": 0.1685791015625 + }, + "22": { + "0": 0.153076171875 + }, + "23": { + "0": 0.1544189453125 + }, + "24": { + "0": 0.1654052734375 + }, + "25": { + "0": 0.1737060546875 + }, + "26": { + "0": 0.16748046875 + }, + "27": { + "0": 0.162841796875 + }, + "28": { + "0": 0.1656494140625 + }, + "29": { + "0": 0.2783203125 + }, + "30": { + "0": 0.147216796875 + }, + "31": { + "0": 0.1688232421875 + } + }, + "v_scale": { + "0": { + "0": 0.003971099853515625 + }, + "1": { + "0": 0.026824951171875 + }, + "2": { + "0": 0.015594482421875 + }, + "3": { + "0": 0.0190582275390625 + }, + "4": { + "0": 0.0164794921875 + }, + "5": { + "0": 0.0184783935546875 + }, + "6": { + "0": 0.0222320556640625 + }, + "7": { + "0": 0.0212860107421875 + }, + "8": { + "0": 0.0237579345703125 + }, + "9": { + "0": 0.028564453125 + }, + 
"10": { + "0": 0.0199737548828125 + }, + "11": { + "0": 0.0253753662109375 + }, + "12": { + "0": 0.0201416015625 + }, + "13": { + "0": 0.0236663818359375 + }, + "14": { + "0": 0.023468017578125 + }, + "15": { + "0": 0.03094482421875 + }, + "16": { + "0": 0.0203094482421875 + }, + "17": { + "0": 0.019073486328125 + }, + "18": { + "0": 0.0285797119140625 + }, + "19": { + "0": 0.0259857177734375 + }, + "20": { + "0": 0.0259552001953125 + }, + "21": { + "0": 0.0256500244140625 + }, + "22": { + "0": 0.031890869140625 + }, + "23": { + "0": 0.0276641845703125 + }, + "24": { + "0": 0.04351806640625 + }, + "25": { + "0": 0.035491943359375 + }, + "26": { + "0": 0.034912109375 + }, + "27": { + "0": 0.04412841796875 + }, + "28": { + "0": 0.042205810546875 + }, + "29": { + "0": 0.055694580078125 + }, + "30": { + "0": 0.05181884765625 + }, + "31": { + "0": 0.05096435546875 + } + }, + "k_zero_point": { + "0": { + "0": 0.0 + }, + "1": { + "0": 0.0 + }, + "2": { + "0": 0.0 + }, + "3": { + "0": 0.0 + }, + "4": { + "0": 0.0 + }, + "5": { + "0": 0.0 + }, + "6": { + "0": 0.0 + }, + "7": { + "0": 0.0 + }, + "8": { + "0": 0.0 + }, + "9": { + "0": 0.0 + }, + "10": { + "0": 0.0 + }, + "11": { + "0": 0.0 + }, + "12": { + "0": 0.0 + }, + "13": { + "0": 0.0 + }, + "14": { + "0": 0.0 + }, + "15": { + "0": 0.0 + }, + "16": { + "0": 0.0 + }, + "17": { + "0": 0.0 + }, + "18": { + "0": 0.0 + }, + "19": { + "0": 0.0 + }, + "20": { + "0": 0.0 + }, + "21": { + "0": 0.0 + }, + "22": { + "0": 0.0 + }, + "23": { + "0": 0.0 + }, + "24": { + "0": 0.0 + }, + "25": { + "0": 0.0 + }, + "26": { + "0": 0.0 + }, + "27": { + "0": 0.0 + }, + "28": { + "0": 0.0 + }, + "29": { + "0": 0.0 + }, + "30": { + "0": 0.0 + }, + "31": { + "0": 0.0 + } + }, + "v_zero_point": { + "0": { + "0": 0.0 + }, + "1": { + "0": 0.0 + }, + "2": { + "0": 0.0 + }, + "3": { + "0": 0.0 + }, + "4": { + "0": 0.0 + }, + "5": { + "0": 0.0 + }, + "6": { + "0": 0.0 + }, + "7": { + "0": 0.0 + }, + "8": { + "0": 0.0 + }, + "9": { + "0": 0.0 + }, + 
"10": { + "0": 0.0 + }, + "11": { + "0": 0.0 + }, + "12": { + "0": 0.0 + }, + "13": { + "0": 0.0 + }, + "14": { + "0": 0.0 + }, + "15": { + "0": 0.0 + }, + "16": { + "0": 0.0 + }, + "17": { + "0": 0.0 + }, + "18": { + "0": 0.0 + }, + "19": { + "0": 0.0 + }, + "20": { + "0": 0.0 + }, + "21": { + "0": 0.0 + }, + "22": { + "0": 0.0 + }, + "23": { + "0": 0.0 + }, + "24": { + "0": 0.0 + }, + "25": { + "0": 0.0 + }, + "26": { + "0": 0.0 + }, + "27": { + "0": 0.0 + }, + "28": { + "0": 0.0 + }, + "29": { + "0": 0.0 + }, + "30": { + "0": 0.0 + }, + "31": { + "0": 0.0 + } + } + } + } +} \ No newline at end of file diff --git a/examples/int8/work_dir/ceval_val_cmcc/kv_cache_scales_quant_group128.json b/examples/int8/work_dir/ceval_val_cmcc/kv_cache_scales_quant_group128.json new file mode 100644 index 000000000000..8134fd48902b --- /dev/null +++ b/examples/int8/work_dir/ceval_val_cmcc/kv_cache_scales_quant_group128.json @@ -0,0 +1,1296 @@ +{ + "model_type": "llama", + "kv_cache": { + "dtype": "int8", + "scaling_factor": { + "k_scale": { + "0": { + "0": 0.09735107421875, + "1": 0.0758056640625, + "2": 0.08447265625, + "3": 0.060577392578125, + "4": 0.06951904296875, + "5": 0.06494140625, + "6": 0.1224365234375, + "7": 0.05035400390625 + }, + "1": { + "0": 0.09149169921875, + "1": 0.11419677734375, + "2": 0.10955810546875, + "3": 0.07708740234375, + "4": 0.10235595703125, + "5": 0.08319091796875, + "6": 0.08880615234375, + "7": 0.08721923828125 + }, + "2": { + "0": 0.12481689453125, + "1": 0.14453125, + "2": 0.1004638671875, + "3": 0.133544921875, + "4": 0.10809326171875, + "5": 0.124755859375, + "6": 0.1361083984375, + "7": 0.1253662109375 + }, + "3": { + "0": 0.1282958984375, + "1": 0.095458984375, + "2": 0.1502685546875, + "3": 0.130126953125, + "4": 0.1300048828125, + "5": 0.12127685546875, + "6": 0.120361328125, + "7": 0.1549072265625 + }, + "4": { + "0": 0.08135986328125, + "1": 0.13232421875, + "2": 0.1331787109375, + "3": 0.10552978515625, + "4": 0.170166015625, + "5": 
0.11163330078125, + "6": 0.1251220703125, + "7": 0.13037109375 + }, + "5": { + "0": 0.106201171875, + "1": 0.11895751953125, + "2": 0.11651611328125, + "3": 0.1103515625, + "4": 0.10992431640625, + "5": 0.1312255859375, + "6": 0.1585693359375, + "7": 0.109130859375 + }, + "6": { + "0": 0.1258544921875, + "1": 0.145263671875, + "2": 0.135009765625, + "3": 0.09698486328125, + "4": 0.1448974609375, + "5": 0.13427734375, + "6": 0.1102294921875, + "7": 0.140625 + }, + "7": { + "0": 0.10870361328125, + "1": 0.154052734375, + "2": 0.1302490234375, + "3": 0.147705078125, + "4": 0.1490478515625, + "5": 0.1253662109375, + "6": 0.11676025390625, + "7": 0.1636962890625 + }, + "8": { + "0": 0.1171875, + "1": 0.11273193359375, + "2": 0.170166015625, + "3": 0.1544189453125, + "4": 0.1483154296875, + "5": 0.128173828125, + "6": 0.1297607421875, + "7": 0.11956787109375 + }, + "9": { + "0": 0.149658203125, + "1": 0.1199951171875, + "2": 0.1697998046875, + "3": 0.125732421875, + "4": 0.1361083984375, + "5": 0.12322998046875, + "6": 0.10052490234375, + "7": 0.11590576171875 + }, + "10": { + "0": 0.1591796875, + "1": 0.12164306640625, + "2": 0.1187744140625, + "3": 0.11285400390625, + "4": 0.11395263671875, + "5": 0.166015625, + "6": 0.1383056640625, + "7": 0.14697265625 + }, + "11": { + "0": 0.1385498046875, + "1": 0.124755859375, + "2": 0.1351318359375, + "3": 0.1357421875, + "4": 0.1187744140625, + "5": 0.1220703125, + "6": 0.1634521484375, + "7": 0.1141357421875 + }, + "12": { + "0": 0.15234375, + "1": 0.14990234375, + "2": 0.16259765625, + "3": 0.11126708984375, + "4": 0.1456298828125, + "5": 0.16015625, + "6": 0.1514892578125, + "7": 0.1531982421875 + }, + "13": { + "0": 0.1265869140625, + "1": 0.1453857421875, + "2": 0.138427734375, + "3": 0.1273193359375, + "4": 0.142822265625, + "5": 0.144775390625, + "6": 0.12103271484375, + "7": 0.127685546875 + }, + "14": { + "0": 0.131591796875, + "1": 0.1368408203125, + "2": 0.17041015625, + "3": 0.1444091796875, + "4": 0.1495361328125, + 
"5": 0.1529541015625, + "6": 0.1136474609375, + "7": 0.1402587890625 + }, + "15": { + "0": 0.125244140625, + "1": 0.1512451171875, + "2": 0.1312255859375, + "3": 0.1409912109375, + "4": 0.08343505859375, + "5": 0.1497802734375, + "6": 0.13818359375, + "7": 0.11529541015625 + }, + "16": { + "0": 0.1407470703125, + "1": 0.1597900390625, + "2": 0.130859375, + "3": 0.179443359375, + "4": 0.12457275390625, + "5": 0.12359619140625, + "6": 0.11029052734375, + "7": 0.151123046875 + }, + "17": { + "0": 0.1292724609375, + "1": 0.14404296875, + "2": 0.09503173828125, + "3": 0.1339111328125, + "4": 0.11614990234375, + "5": 0.1502685546875, + "6": 0.1346435546875, + "7": 0.1380615234375 + }, + "18": { + "0": 0.120361328125, + "1": 0.115234375, + "2": 0.1256103515625, + "3": 0.131103515625, + "4": 0.1309814453125, + "5": 0.11712646484375, + "6": 0.15234375, + "7": 0.14111328125 + }, + "19": { + "0": 0.10614013671875, + "1": 0.12103271484375, + "2": 0.16357421875, + "3": 0.124267578125, + "4": 0.074951171875, + "5": 0.12445068359375, + "6": 0.14013671875, + "7": 0.11376953125 + }, + "20": { + "0": 0.1007080078125, + "1": 0.1566162109375, + "2": 0.119140625, + "3": 0.11199951171875, + "4": 0.128662109375, + "5": 0.143798828125, + "6": 0.129150390625, + "7": 0.11907958984375 + }, + "21": { + "0": 0.133544921875, + "1": 0.1058349609375, + "2": 0.1651611328125, + "3": 0.1280517578125, + "4": 0.153076171875, + "5": 0.1405029296875, + "6": 0.1685791015625, + "7": 0.11614990234375 + }, + "22": { + "0": 0.11541748046875, + "1": 0.153076171875, + "2": 0.13720703125, + "3": 0.1290283203125, + "4": 0.10162353515625, + "5": 0.1324462890625, + "6": 0.1309814453125, + "7": 0.13916015625 + }, + "23": { + "0": 0.152587890625, + "1": 0.1114501953125, + "2": 0.132080078125, + "3": 0.1256103515625, + "4": 0.1488037109375, + "5": 0.11358642578125, + "6": 0.1544189453125, + "7": 0.1456298828125 + }, + "24": { + "0": 0.140869140625, + "1": 0.1539306640625, + "2": 0.1654052734375, + "3": 
0.1456298828125, + "4": 0.1060791015625, + "5": 0.1304931640625, + "6": 0.127685546875, + "7": 0.1561279296875 + }, + "25": { + "0": 0.10845947265625, + "1": 0.11505126953125, + "2": 0.141357421875, + "3": 0.1009521484375, + "4": 0.1737060546875, + "5": 0.1591796875, + "6": 0.1602783203125, + "7": 0.159423828125 + }, + "26": { + "0": 0.16748046875, + "1": 0.145263671875, + "2": 0.11077880859375, + "3": 0.11444091796875, + "4": 0.1639404296875, + "5": 0.1419677734375, + "6": 0.1510009765625, + "7": 0.132080078125 + }, + "27": { + "0": 0.157958984375, + "1": 0.11163330078125, + "2": 0.1539306640625, + "3": 0.1251220703125, + "4": 0.150146484375, + "5": 0.12481689453125, + "6": 0.162841796875, + "7": 0.12078857421875 + }, + "28": { + "0": 0.1376953125, + "1": 0.152587890625, + "2": 0.1656494140625, + "3": 0.12420654296875, + "4": 0.1448974609375, + "5": 0.14453125, + "6": 0.1187744140625, + "7": 0.139892578125 + }, + "29": { + "0": 0.1455078125, + "1": 0.1685791015625, + "2": 0.1318359375, + "3": 0.1524658203125, + "4": 0.1241455078125, + "5": 0.1201171875, + "6": 0.2783203125, + "7": 0.161865234375 + }, + "30": { + "0": 0.1141357421875, + "1": 0.147216796875, + "2": 0.146240234375, + "3": 0.138916015625, + "4": 0.11279296875, + "5": 0.12298583984375, + "6": 0.1439208984375, + "7": 0.1343994140625 + }, + "31": { + "0": 0.1688232421875, + "1": 0.111328125, + "2": 0.1407470703125, + "3": 0.1370849609375, + "4": 0.1259765625, + "5": 0.12457275390625, + "6": 0.1365966796875, + "7": 0.113525390625 + } + }, + "v_scale": { + "0": { + "0": 0.0029449462890625, + "1": 0.003971099853515625, + "2": 0.003414154052734375, + "3": 0.002643585205078125, + "4": 0.00356292724609375, + "5": 0.0025177001953125, + "6": 0.0036945343017578125, + "7": 0.0037975311279296875 + }, + "1": { + "0": 0.00598907470703125, + "1": 0.0030231475830078125, + "2": 0.01383209228515625, + "3": 0.00382232666015625, + "4": 0.005695343017578125, + "5": 0.005481719970703125, + "6": 0.00733184814453125, + "7": 
0.026824951171875 + }, + "2": { + "0": 0.00884246826171875, + "1": 0.015594482421875, + "2": 0.00899505615234375, + "3": 0.0110015869140625, + "4": 0.0096282958984375, + "5": 0.01128387451171875, + "6": 0.01485443115234375, + "7": 0.0094451904296875 + }, + "3": { + "0": 0.010223388671875, + "1": 0.0134735107421875, + "2": 0.016815185546875, + "3": 0.00933837890625, + "4": 0.0190582275390625, + "5": 0.01641845703125, + "6": 0.010498046875, + "7": 0.0182647705078125 + }, + "4": { + "0": 0.0145416259765625, + "1": 0.01023101806640625, + "2": 0.0111541748046875, + "3": 0.0164794921875, + "4": 0.0128326416015625, + "5": 0.0096282958984375, + "6": 0.01556396484375, + "7": 0.01079559326171875 + }, + "5": { + "0": 0.0180511474609375, + "1": 0.0129241943359375, + "2": 0.014404296875, + "3": 0.0134429931640625, + "4": 0.0120849609375, + "5": 0.016143798828125, + "6": 0.01593017578125, + "7": 0.0184783935546875 + }, + "6": { + "0": 0.0184783935546875, + "1": 0.014923095703125, + "2": 0.01413726806640625, + "3": 0.020721435546875, + "4": 0.0176544189453125, + "5": 0.01554107666015625, + "6": 0.0222320556640625, + "7": 0.0106201171875 + }, + "7": { + "0": 0.017547607421875, + "1": 0.0212860107421875, + "2": 0.01505279541015625, + "3": 0.01265716552734375, + "4": 0.0177001953125, + "5": 0.01407623291015625, + "6": 0.01399993896484375, + "7": 0.018524169921875 + }, + "8": { + "0": 0.01288604736328125, + "1": 0.0237579345703125, + "2": 0.0158538818359375, + "3": 0.0169830322265625, + "4": 0.018341064453125, + "5": 0.015899658203125, + "6": 0.01374053955078125, + "7": 0.01422119140625 + }, + "9": { + "0": 0.01259613037109375, + "1": 0.028564453125, + "2": 0.02099609375, + "3": 0.017669677734375, + "4": 0.0165557861328125, + "5": 0.0169525146484375, + "6": 0.016845703125, + "7": 0.01788330078125 + }, + "10": { + "0": 0.017333984375, + "1": 0.0199737548828125, + "2": 0.019073486328125, + "3": 0.01421356201171875, + "4": 0.01255035400390625, + "5": 0.01314544677734375, + "6": 
0.0154876708984375, + "7": 0.0135498046875 + }, + "11": { + "0": 0.0163726806640625, + "1": 0.01849365234375, + "2": 0.0210418701171875, + "3": 0.0215301513671875, + "4": 0.0157623291015625, + "5": 0.0253753662109375, + "6": 0.0229034423828125, + "7": 0.0207061767578125 + }, + "12": { + "0": 0.0173797607421875, + "1": 0.0174713134765625, + "2": 0.0201416015625, + "3": 0.0166473388671875, + "4": 0.018829345703125, + "5": 0.017242431640625, + "6": 0.0186309814453125, + "7": 0.019927978515625 + }, + "13": { + "0": 0.01763916015625, + "1": 0.01788330078125, + "2": 0.0187530517578125, + "3": 0.0142974853515625, + "4": 0.02044677734375, + "5": 0.0213165283203125, + "6": 0.0167236328125, + "7": 0.0236663818359375 + }, + "14": { + "0": 0.015411376953125, + "1": 0.0179901123046875, + "2": 0.023468017578125, + "3": 0.0163421630859375, + "4": 0.0225982666015625, + "5": 0.0175933837890625, + "6": 0.01953125, + "7": 0.0168914794921875 + }, + "15": { + "0": 0.0179595947265625, + "1": 0.0154571533203125, + "2": 0.0181121826171875, + "3": 0.016021728515625, + "4": 0.03094482421875, + "5": 0.01702880859375, + "6": 0.020233154296875, + "7": 0.019134521484375 + }, + "16": { + "0": 0.0168304443359375, + "1": 0.0130462646484375, + "2": 0.01108551025390625, + "3": 0.0203094482421875, + "4": 0.0166473388671875, + "5": 0.01837158203125, + "6": 0.01386260986328125, + "7": 0.01953125 + }, + "17": { + "0": 0.0131683349609375, + "1": 0.019073486328125, + "2": 0.0164031982421875, + "3": 0.0126953125, + "4": 0.016632080078125, + "5": 0.0156402587890625, + "6": 0.01837158203125, + "7": 0.016448974609375 + }, + "18": { + "0": 0.01332855224609375, + "1": 0.0285797119140625, + "2": 0.018890380859375, + "3": 0.0156707763671875, + "4": 0.0264892578125, + "5": 0.0235748291015625, + "6": 0.021148681640625, + "7": 0.017333984375 + }, + "19": { + "0": 0.01751708984375, + "1": 0.0239105224609375, + "2": 0.015228271484375, + "3": 0.0170440673828125, + "4": 0.01543426513671875, + "5": 0.0259857177734375, + 
"6": 0.016448974609375, + "7": 0.02362060546875 + }, + "20": { + "0": 0.0207977294921875, + "1": 0.0259552001953125, + "2": 0.016387939453125, + "3": 0.0167236328125, + "4": 0.0218353271484375, + "5": 0.01873779296875, + "6": 0.01788330078125, + "7": 0.0182952880859375 + }, + "21": { + "0": 0.0127716064453125, + "1": 0.0256500244140625, + "2": 0.0189971923828125, + "3": 0.018646240234375, + "4": 0.0182037353515625, + "5": 0.0226593017578125, + "6": 0.017547607421875, + "7": 0.021209716796875 + }, + "22": { + "0": 0.03143310546875, + "1": 0.0153961181640625, + "2": 0.017547607421875, + "3": 0.031890869140625, + "4": 0.0217132568359375, + "5": 0.0162506103515625, + "6": 0.016754150390625, + "7": 0.027008056640625 + }, + "23": { + "0": 0.019317626953125, + "1": 0.0276641845703125, + "2": 0.018280029296875, + "3": 0.020538330078125, + "4": 0.0169830322265625, + "5": 0.025238037109375, + "6": 0.019744873046875, + "7": 0.0238800048828125 + }, + "24": { + "0": 0.0275726318359375, + "1": 0.0201416015625, + "2": 0.0204620361328125, + "3": 0.024932861328125, + "4": 0.0305633544921875, + "5": 0.0275421142578125, + "6": 0.04351806640625, + "7": 0.0281219482421875 + }, + "25": { + "0": 0.0242919921875, + "1": 0.0316162109375, + "2": 0.019805908203125, + "3": 0.03173828125, + "4": 0.0205841064453125, + "5": 0.035491943359375, + "6": 0.0156707763671875, + "7": 0.023040771484375 + }, + "26": { + "0": 0.0223236083984375, + "1": 0.01849365234375, + "2": 0.0185089111328125, + "3": 0.034912109375, + "4": 0.0237579345703125, + "5": 0.032684326171875, + "6": 0.019073486328125, + "7": 0.025634765625 + }, + "27": { + "0": 0.0230712890625, + "1": 0.039398193359375, + "2": 0.026702880859375, + "3": 0.0207977294921875, + "4": 0.0270538330078125, + "5": 0.04412841796875, + "6": 0.0181884765625, + "7": 0.0310516357421875 + }, + "28": { + "0": 0.0229034423828125, + "1": 0.042205810546875, + "2": 0.032623291015625, + "3": 0.037261962890625, + "4": 0.0290069580078125, + "5": 0.0254058837890625, + 
"6": 0.0278167724609375, + "7": 0.0159759521484375 + }, + "29": { + "0": 0.028350830078125, + "1": 0.0276641845703125, + "2": 0.0266265869140625, + "3": 0.021942138671875, + "4": 0.03436279296875, + "5": 0.055694580078125, + "6": 0.048736572265625, + "7": 0.0271148681640625 + }, + "30": { + "0": 0.0262603759765625, + "1": 0.0283660888671875, + "2": 0.0297393798828125, + "3": 0.05181884765625, + "4": 0.027587890625, + "5": 0.03240966796875, + "6": 0.03375244140625, + "7": 0.046142578125 + }, + "31": { + "0": 0.0325927734375, + "1": 0.027618408203125, + "2": 0.026397705078125, + "3": 0.05096435546875, + "4": 0.0230865478515625, + "5": 0.039093017578125, + "6": 0.0313720703125, + "7": 0.01837158203125 + } + }, + "k_zero_point": { + "0": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "1": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "2": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "3": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "4": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "5": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "6": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "7": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "8": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "9": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "10": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "11": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + 
"4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "12": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "13": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "14": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "15": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "16": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "17": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "18": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "19": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "20": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "21": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "22": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "23": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "24": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "25": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "26": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "27": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "28": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "29": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 
0.0, + "6": 0.0, + "7": 0.0 + }, + "30": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "31": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + } + }, + "v_zero_point": { + "0": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "1": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "2": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "3": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "4": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "5": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "6": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "7": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "8": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "9": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "10": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "11": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "12": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "13": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "14": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "15": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, 
+ "6": 0.0, + "7": 0.0 + }, + "16": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "17": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "18": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "19": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "20": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "21": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "22": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "23": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "24": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "25": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "26": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "27": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "28": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "29": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "30": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "31": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + } + } + } + } +} \ No newline at end of file diff --git a/examples/int8/work_dir/cmb/kv_cache_scales_layer_level.json b/examples/int8/work_dir/cmb/kv_cache_scales_layer_level.json new file mode 100644 index 
000000000000..32075af4c304 --- /dev/null +++ b/examples/int8/work_dir/cmb/kv_cache_scales_layer_level.json @@ -0,0 +1,400 @@ +{ + "model_type": "llama", + "kv_cache": { + "dtype": "int8", + "scaling_factor": { + "k_scale": { + "0": { + "0": 0.1224365234375 + }, + "1": { + "0": 0.11419677734375 + }, + "2": { + "0": 0.1395263671875 + }, + "3": { + "0": 0.151611328125 + }, + "4": { + "0": 0.1661376953125 + }, + "5": { + "0": 0.1510009765625 + }, + "6": { + "0": 0.145751953125 + }, + "7": { + "0": 0.16162109375 + }, + "8": { + "0": 0.1690673828125 + }, + "9": { + "0": 0.173095703125 + }, + "10": { + "0": 0.1744384765625 + }, + "11": { + "0": 0.1590576171875 + }, + "12": { + "0": 0.1650390625 + }, + "13": { + "0": 0.14111328125 + }, + "14": { + "0": 0.1553955078125 + }, + "15": { + "0": 0.14501953125 + }, + "16": { + "0": 0.1708984375 + }, + "17": { + "0": 0.1414794921875 + }, + "18": { + "0": 0.146484375 + }, + "19": { + "0": 0.157470703125 + }, + "20": { + "0": 0.1480712890625 + }, + "21": { + "0": 0.169921875 + }, + "22": { + "0": 0.1524658203125 + }, + "23": { + "0": 0.15234375 + }, + "24": { + "0": 0.157470703125 + }, + "25": { + "0": 0.164794921875 + }, + "26": { + "0": 0.160400390625 + }, + "27": { + "0": 0.1622314453125 + }, + "28": { + "0": 0.171142578125 + }, + "29": { + "0": 0.272705078125 + }, + "30": { + "0": 0.1488037109375 + }, + "31": { + "0": 0.168701171875 + } + }, + "v_scale": { + "0": { + "0": 0.003971099853515625 + }, + "1": { + "0": 0.026824951171875 + }, + "2": { + "0": 0.01546478271484375 + }, + "3": { + "0": 0.0194549560546875 + }, + "4": { + "0": 0.01535797119140625 + }, + "5": { + "0": 0.018402099609375 + }, + "6": { + "0": 0.0216064453125 + }, + "7": { + "0": 0.0212554931640625 + }, + "8": { + "0": 0.0189056396484375 + }, + "9": { + "0": 0.024444580078125 + }, + "10": { + "0": 0.0208740234375 + }, + "11": { + "0": 0.0243072509765625 + }, + "12": { + "0": 0.0198211669921875 + }, + "13": { + "0": 0.0221099853515625 + }, + "14": { + "0": 
0.0225830078125 + }, + "15": { + "0": 0.03179931640625 + }, + "16": { + "0": 0.0191802978515625 + }, + "17": { + "0": 0.020111083984375 + }, + "18": { + "0": 0.0280609130859375 + }, + "19": { + "0": 0.0239105224609375 + }, + "20": { + "0": 0.0233612060546875 + }, + "21": { + "0": 0.0252838134765625 + }, + "22": { + "0": 0.037628173828125 + }, + "23": { + "0": 0.0284576416015625 + }, + "24": { + "0": 0.0430908203125 + }, + "25": { + "0": 0.034088134765625 + }, + "26": { + "0": 0.03619384765625 + }, + "27": { + "0": 0.040252685546875 + }, + "28": { + "0": 0.038116455078125 + }, + "29": { + "0": 0.054412841796875 + }, + "30": { + "0": 0.048675537109375 + }, + "31": { + "0": 0.05157470703125 + } + }, + "k_zero_point": { + "0": { + "0": 0.0 + }, + "1": { + "0": 0.0 + }, + "2": { + "0": 0.0 + }, + "3": { + "0": 0.0 + }, + "4": { + "0": 0.0 + }, + "5": { + "0": 0.0 + }, + "6": { + "0": 0.0 + }, + "7": { + "0": 0.0 + }, + "8": { + "0": 0.0 + }, + "9": { + "0": 0.0 + }, + "10": { + "0": 0.0 + }, + "11": { + "0": 0.0 + }, + "12": { + "0": 0.0 + }, + "13": { + "0": 0.0 + }, + "14": { + "0": 0.0 + }, + "15": { + "0": 0.0 + }, + "16": { + "0": 0.0 + }, + "17": { + "0": 0.0 + }, + "18": { + "0": 0.0 + }, + "19": { + "0": 0.0 + }, + "20": { + "0": 0.0 + }, + "21": { + "0": 0.0 + }, + "22": { + "0": 0.0 + }, + "23": { + "0": 0.0 + }, + "24": { + "0": 0.0 + }, + "25": { + "0": 0.0 + }, + "26": { + "0": 0.0 + }, + "27": { + "0": 0.0 + }, + "28": { + "0": 0.0 + }, + "29": { + "0": 0.0 + }, + "30": { + "0": 0.0 + }, + "31": { + "0": 0.0 + } + }, + "v_zero_point": { + "0": { + "0": 0.0 + }, + "1": { + "0": 0.0 + }, + "2": { + "0": 0.0 + }, + "3": { + "0": 0.0 + }, + "4": { + "0": 0.0 + }, + "5": { + "0": 0.0 + }, + "6": { + "0": 0.0 + }, + "7": { + "0": 0.0 + }, + "8": { + "0": 0.0 + }, + "9": { + "0": 0.0 + }, + "10": { + "0": 0.0 + }, + "11": { + "0": 0.0 + }, + "12": { + "0": 0.0 + }, + "13": { + "0": 0.0 + }, + "14": { + "0": 0.0 + }, + "15": { + "0": 0.0 + }, + "16": { + "0": 0.0 
+ }, + "17": { + "0": 0.0 + }, + "18": { + "0": 0.0 + }, + "19": { + "0": 0.0 + }, + "20": { + "0": 0.0 + }, + "21": { + "0": 0.0 + }, + "22": { + "0": 0.0 + }, + "23": { + "0": 0.0 + }, + "24": { + "0": 0.0 + }, + "25": { + "0": 0.0 + }, + "26": { + "0": 0.0 + }, + "27": { + "0": 0.0 + }, + "28": { + "0": 0.0 + }, + "29": { + "0": 0.0 + }, + "30": { + "0": 0.0 + }, + "31": { + "0": 0.0 + } + } + } + } +} \ No newline at end of file diff --git a/examples/int8/work_dir/cmb/kv_cache_scales_quant_group128.json b/examples/int8/work_dir/cmb/kv_cache_scales_quant_group128.json new file mode 100644 index 000000000000..c1e2ef8226a4 --- /dev/null +++ b/examples/int8/work_dir/cmb/kv_cache_scales_quant_group128.json @@ -0,0 +1,1296 @@ +{ + "model_type": "llama", + "kv_cache": { + "dtype": "int8", + "scaling_factor": { + "k_scale": { + "0": { + "0": 0.09539794921875, + "1": 0.07574462890625, + "2": 0.08447265625, + "3": 0.05828857421875, + "4": 0.07049560546875, + "5": 0.064453125, + "6": 0.1224365234375, + "7": 0.050506591796875 + }, + "1": { + "0": 0.08990478515625, + "1": 0.11419677734375, + "2": 0.11126708984375, + "3": 0.076904296875, + "4": 0.1024169921875, + "5": 0.07720947265625, + "6": 0.0875244140625, + "7": 0.08721923828125 + }, + "2": { + "0": 0.12384033203125, + "1": 0.1395263671875, + "2": 0.1009521484375, + "3": 0.1298828125, + "4": 0.10211181640625, + "5": 0.1241455078125, + "6": 0.13916015625, + "7": 0.126220703125 + }, + "3": { + "0": 0.1231689453125, + "1": 0.0921630859375, + "2": 0.151611328125, + "3": 0.12548828125, + "4": 0.130615234375, + "5": 0.1231689453125, + "6": 0.11712646484375, + "7": 0.151611328125 + }, + "4": { + "0": 0.07525634765625, + "1": 0.130615234375, + "2": 0.1279296875, + "3": 0.10662841796875, + "4": 0.1661376953125, + "5": 0.10986328125, + "6": 0.12139892578125, + "7": 0.12408447265625 + }, + "5": { + "0": 0.10284423828125, + "1": 0.12030029296875, + "2": 0.11773681640625, + "3": 0.11077880859375, + "4": 0.104736328125, + "5": 
0.1102294921875, + "6": 0.1510009765625, + "7": 0.108642578125 + }, + "6": { + "0": 0.12158203125, + "1": 0.144775390625, + "2": 0.135009765625, + "3": 0.0972900390625, + "4": 0.145751953125, + "5": 0.1357421875, + "6": 0.1134033203125, + "7": 0.1434326171875 + }, + "7": { + "0": 0.107421875, + "1": 0.1568603515625, + "2": 0.126708984375, + "3": 0.1456298828125, + "4": 0.14794921875, + "5": 0.1197509765625, + "6": 0.113037109375, + "7": 0.16162109375 + }, + "8": { + "0": 0.11407470703125, + "1": 0.11376953125, + "2": 0.1690673828125, + "3": 0.1531982421875, + "4": 0.1441650390625, + "5": 0.1268310546875, + "6": 0.11651611328125, + "7": 0.12078857421875 + }, + "9": { + "0": 0.1363525390625, + "1": 0.1158447265625, + "2": 0.173095703125, + "3": 0.12225341796875, + "4": 0.130126953125, + "5": 0.1229248046875, + "6": 0.10089111328125, + "7": 0.11572265625 + }, + "10": { + "0": 0.1744384765625, + "1": 0.12139892578125, + "2": 0.10113525390625, + "3": 0.10235595703125, + "4": 0.1148681640625, + "5": 0.1644287109375, + "6": 0.1324462890625, + "7": 0.14697265625 + }, + "11": { + "0": 0.1417236328125, + "1": 0.12298583984375, + "2": 0.136962890625, + "3": 0.127685546875, + "4": 0.11993408203125, + "5": 0.11761474609375, + "6": 0.1590576171875, + "7": 0.11578369140625 + }, + "12": { + "0": 0.1387939453125, + "1": 0.144287109375, + "2": 0.1650390625, + "3": 0.11029052734375, + "4": 0.1439208984375, + "5": 0.156494140625, + "6": 0.1536865234375, + "7": 0.15185546875 + }, + "13": { + "0": 0.12939453125, + "1": 0.140380859375, + "2": 0.1280517578125, + "3": 0.12347412109375, + "4": 0.14111328125, + "5": 0.128662109375, + "6": 0.122802734375, + "7": 0.1265869140625 + }, + "14": { + "0": 0.13671875, + "1": 0.1290283203125, + "2": 0.1553955078125, + "3": 0.143310546875, + "4": 0.145751953125, + "5": 0.1436767578125, + "6": 0.1107177734375, + "7": 0.1368408203125 + }, + "15": { + "0": 0.1302490234375, + "1": 0.1446533203125, + "2": 0.133056640625, + "3": 0.1395263671875, + "4": 
0.08758544921875, + "5": 0.14501953125, + "6": 0.13818359375, + "7": 0.11248779296875 + }, + "16": { + "0": 0.140380859375, + "1": 0.156494140625, + "2": 0.1343994140625, + "3": 0.1708984375, + "4": 0.12103271484375, + "5": 0.1162109375, + "6": 0.11578369140625, + "7": 0.14208984375 + }, + "17": { + "0": 0.129150390625, + "1": 0.1409912109375, + "2": 0.09271240234375, + "3": 0.129638671875, + "4": 0.11456298828125, + "5": 0.138427734375, + "6": 0.1273193359375, + "7": 0.1414794921875 + }, + "18": { + "0": 0.1199951171875, + "1": 0.1123046875, + "2": 0.12445068359375, + "3": 0.12481689453125, + "4": 0.1295166015625, + "5": 0.12225341796875, + "6": 0.146484375, + "7": 0.14111328125 + }, + "19": { + "0": 0.10589599609375, + "1": 0.11688232421875, + "2": 0.157470703125, + "3": 0.11749267578125, + "4": 0.0738525390625, + "5": 0.10626220703125, + "6": 0.1317138671875, + "7": 0.1116943359375 + }, + "20": { + "0": 0.10308837890625, + "1": 0.1480712890625, + "2": 0.115966796875, + "3": 0.10980224609375, + "4": 0.1256103515625, + "5": 0.1343994140625, + "6": 0.12054443359375, + "7": 0.1119384765625 + }, + "21": { + "0": 0.12445068359375, + "1": 0.10028076171875, + "2": 0.167236328125, + "3": 0.130126953125, + "4": 0.1519775390625, + "5": 0.1119384765625, + "6": 0.169921875, + "7": 0.11376953125 + }, + "22": { + "0": 0.1168212890625, + "1": 0.1524658203125, + "2": 0.1380615234375, + "3": 0.126953125, + "4": 0.10223388671875, + "5": 0.1298828125, + "6": 0.131103515625, + "7": 0.1259765625 + }, + "23": { + "0": 0.15234375, + "1": 0.1060791015625, + "2": 0.1307373046875, + "3": 0.1175537109375, + "4": 0.1448974609375, + "5": 0.11492919921875, + "6": 0.1513671875, + "7": 0.14404296875 + }, + "24": { + "0": 0.1351318359375, + "1": 0.151611328125, + "2": 0.157470703125, + "3": 0.145751953125, + "4": 0.10260009765625, + "5": 0.123291015625, + "6": 0.1160888671875, + "7": 0.11199951171875 + }, + "25": { + "0": 0.09881591796875, + "1": 0.10888671875, + "2": 0.129150390625, + "3": 
0.09881591796875, + "4": 0.164794921875, + "5": 0.156494140625, + "6": 0.15966796875, + "7": 0.1580810546875 + }, + "26": { + "0": 0.160400390625, + "1": 0.1417236328125, + "2": 0.10760498046875, + "3": 0.106689453125, + "4": 0.15087890625, + "5": 0.1444091796875, + "6": 0.1453857421875, + "7": 0.12939453125 + }, + "27": { + "0": 0.151123046875, + "1": 0.1102294921875, + "2": 0.154296875, + "3": 0.1273193359375, + "4": 0.143310546875, + "5": 0.1180419921875, + "6": 0.1622314453125, + "7": 0.11785888671875 + }, + "28": { + "0": 0.1334228515625, + "1": 0.148681640625, + "2": 0.171142578125, + "3": 0.1214599609375, + "4": 0.1357421875, + "5": 0.1357421875, + "6": 0.11932373046875, + "7": 0.1416015625 + }, + "29": { + "0": 0.1507568359375, + "1": 0.158447265625, + "2": 0.1226806640625, + "3": 0.1500244140625, + "4": 0.12030029296875, + "5": 0.102783203125, + "6": 0.272705078125, + "7": 0.1529541015625 + }, + "30": { + "0": 0.11053466796875, + "1": 0.1488037109375, + "2": 0.133056640625, + "3": 0.1300048828125, + "4": 0.10980224609375, + "5": 0.11968994140625, + "6": 0.141845703125, + "7": 0.1331787109375 + }, + "31": { + "0": 0.168701171875, + "1": 0.11505126953125, + "2": 0.1402587890625, + "3": 0.1368408203125, + "4": 0.11810302734375, + "5": 0.11846923828125, + "6": 0.1339111328125, + "7": 0.10809326171875 + } + }, + "v_scale": { + "0": { + "0": 0.002925872802734375, + "1": 0.003971099853515625, + "2": 0.003414154052734375, + "3": 0.0017271041870117188, + "4": 0.00356292724609375, + "5": 0.00251007080078125, + "6": 0.0032405853271484375, + "7": 0.003887176513671875 + }, + "1": { + "0": 0.006992340087890625, + "1": 0.002971649169921875, + "2": 0.01148223876953125, + "3": 0.004138946533203125, + "4": 0.00537109375, + "5": 0.00518798828125, + "6": 0.007251739501953125, + "7": 0.026824951171875 + }, + "2": { + "0": 0.00885009765625, + "1": 0.01313018798828125, + "2": 0.01546478271484375, + "3": 0.0102386474609375, + "4": 0.0098419189453125, + "5": 0.010833740234375, + 
"6": 0.01468658447265625, + "7": 0.00933074951171875 + }, + "3": { + "0": 0.01007843017578125, + "1": 0.013824462890625, + "2": 0.01544189453125, + "3": 0.00946807861328125, + "4": 0.0194549560546875, + "5": 0.0162506103515625, + "6": 0.00958251953125, + "7": 0.0167388916015625 + }, + "4": { + "0": 0.01375579833984375, + "1": 0.01116943359375, + "2": 0.01151275634765625, + "3": 0.01535797119140625, + "4": 0.01123809814453125, + "5": 0.01004791259765625, + "6": 0.01288604736328125, + "7": 0.01110076904296875 + }, + "5": { + "0": 0.018402099609375, + "1": 0.01236724853515625, + "2": 0.0157470703125, + "3": 0.01296234130859375, + "4": 0.01178741455078125, + "5": 0.01396942138671875, + "6": 0.01474761962890625, + "7": 0.0180816650390625 + }, + "6": { + "0": 0.0172119140625, + "1": 0.0138702392578125, + "2": 0.01418304443359375, + "3": 0.0208740234375, + "4": 0.0204925537109375, + "5": 0.01495361328125, + "6": 0.0216064453125, + "7": 0.01105499267578125 + }, + "7": { + "0": 0.01763916015625, + "1": 0.0212554931640625, + "2": 0.0141143798828125, + "3": 0.01204681396484375, + "4": 0.0177001953125, + "5": 0.01216888427734375, + "6": 0.0146331787109375, + "7": 0.01898193359375 + }, + "8": { + "0": 0.013641357421875, + "1": 0.017913818359375, + "2": 0.016204833984375, + "3": 0.0189056396484375, + "4": 0.0183868408203125, + "5": 0.0153350830078125, + "6": 0.0154876708984375, + "7": 0.0142822265625 + }, + "9": { + "0": 0.01342010498046875, + "1": 0.024444580078125, + "2": 0.021087646484375, + "3": 0.0180816650390625, + "4": 0.016815185546875, + "5": 0.0185089111328125, + "6": 0.0168304443359375, + "7": 0.0193634033203125 + }, + "10": { + "0": 0.0208282470703125, + "1": 0.0208740234375, + "2": 0.0194091796875, + "3": 0.0137176513671875, + "4": 0.01303863525390625, + "5": 0.01340484619140625, + "6": 0.01287078857421875, + "7": 0.01309967041015625 + }, + "11": { + "0": 0.0163421630859375, + "1": 0.01519012451171875, + "2": 0.0217132568359375, + "3": 0.0184326171875, + "4": 
0.0145721435546875, + "5": 0.0243072509765625, + "6": 0.023590087890625, + "7": 0.0160980224609375 + }, + "12": { + "0": 0.0170745849609375, + "1": 0.01496124267578125, + "2": 0.018890380859375, + "3": 0.0162811279296875, + "4": 0.0198211669921875, + "5": 0.0167694091796875, + "6": 0.017578125, + "7": 0.0183563232421875 + }, + "13": { + "0": 0.0161285400390625, + "1": 0.0190887451171875, + "2": 0.0194244384765625, + "3": 0.01477813720703125, + "4": 0.0221099853515625, + "5": 0.0187835693359375, + "6": 0.0177001953125, + "7": 0.02008056640625 + }, + "14": { + "0": 0.0138397216796875, + "1": 0.0162811279296875, + "2": 0.0225830078125, + "3": 0.0163726806640625, + "4": 0.01983642578125, + "5": 0.0180816650390625, + "6": 0.021575927734375, + "7": 0.0174560546875 + }, + "15": { + "0": 0.0178680419921875, + "1": 0.01474761962890625, + "2": 0.0169677734375, + "3": 0.01535797119140625, + "4": 0.03179931640625, + "5": 0.0172119140625, + "6": 0.0189208984375, + "7": 0.0189361572265625 + }, + "16": { + "0": 0.017303466796875, + "1": 0.012420654296875, + "2": 0.0125274658203125, + "3": 0.0181884765625, + "4": 0.0167236328125, + "5": 0.0177001953125, + "6": 0.0155792236328125, + "7": 0.0191802978515625 + }, + "17": { + "0": 0.01477813720703125, + "1": 0.0138092041015625, + "2": 0.017059326171875, + "3": 0.0114593505859375, + "4": 0.0158538818359375, + "5": 0.0179901123046875, + "6": 0.020111083984375, + "7": 0.016845703125 + }, + "18": { + "0": 0.01213836669921875, + "1": 0.027191162109375, + "2": 0.0191802978515625, + "3": 0.014556884765625, + "4": 0.0249176025390625, + "5": 0.0280609130859375, + "6": 0.018341064453125, + "7": 0.0171051025390625 + }, + "19": { + "0": 0.0163421630859375, + "1": 0.0239105224609375, + "2": 0.01462554931640625, + "3": 0.0195465087890625, + "4": 0.01456451416015625, + "5": 0.02337646484375, + "6": 0.0155487060546875, + "7": 0.0239105224609375 + }, + "20": { + "0": 0.02044677734375, + "1": 0.0233612060546875, + "2": 0.0160675048828125, + "3": 
0.0157470703125, + "4": 0.0223388671875, + "5": 0.017822265625, + "6": 0.017913818359375, + "7": 0.016815185546875 + }, + "21": { + "0": 0.012847900390625, + "1": 0.0252838134765625, + "2": 0.019866943359375, + "3": 0.017303466796875, + "4": 0.0186004638671875, + "5": 0.0236663818359375, + "6": 0.0181884765625, + "7": 0.0218505859375 + }, + "22": { + "0": 0.029144287109375, + "1": 0.0157012939453125, + "2": 0.01837158203125, + "3": 0.037628173828125, + "4": 0.0206451416015625, + "5": 0.0171966552734375, + "6": 0.017425537109375, + "7": 0.033843994140625 + }, + "23": { + "0": 0.01934814453125, + "1": 0.0284576416015625, + "2": 0.017181396484375, + "3": 0.0208740234375, + "4": 0.0167236328125, + "5": 0.0250701904296875, + "6": 0.0181121826171875, + "7": 0.024566650390625 + }, + "24": { + "0": 0.0234832763671875, + "1": 0.019287109375, + "2": 0.0214080810546875, + "3": 0.0236663818359375, + "4": 0.032745361328125, + "5": 0.0278167724609375, + "6": 0.0430908203125, + "7": 0.027496337890625 + }, + "25": { + "0": 0.0220184326171875, + "1": 0.02972412109375, + "2": 0.0201873779296875, + "3": 0.033782958984375, + "4": 0.02197265625, + "5": 0.034088134765625, + "6": 0.0152587890625, + "7": 0.0228271484375 + }, + "26": { + "0": 0.0218963623046875, + "1": 0.0198974609375, + "2": 0.0172576904296875, + "3": 0.03619384765625, + "4": 0.020538330078125, + "5": 0.0268096923828125, + "6": 0.0190277099609375, + "7": 0.0251312255859375 + }, + "27": { + "0": 0.023101806640625, + "1": 0.040252685546875, + "2": 0.0290069580078125, + "3": 0.016876220703125, + "4": 0.02703857421875, + "5": 0.039703369140625, + "6": 0.0191650390625, + "7": 0.0290679931640625 + }, + "28": { + "0": 0.0217132568359375, + "1": 0.037139892578125, + "2": 0.030853271484375, + "3": 0.038116455078125, + "4": 0.034454345703125, + "5": 0.0246734619140625, + "6": 0.0281219482421875, + "7": 0.016937255859375 + }, + "29": { + "0": 0.0229949951171875, + "1": 0.02703857421875, + "2": 0.0237274169921875, + "3": 
0.0221710205078125, + "4": 0.034515380859375, + "5": 0.054412841796875, + "6": 0.045135498046875, + "7": 0.029449462890625 + }, + "30": { + "0": 0.0215606689453125, + "1": 0.0253143310546875, + "2": 0.0281982421875, + "3": 0.048675537109375, + "4": 0.0243072509765625, + "5": 0.028045654296875, + "6": 0.0406494140625, + "7": 0.0426025390625 + }, + "31": { + "0": 0.0318603515625, + "1": 0.027374267578125, + "2": 0.02520751953125, + "3": 0.05157470703125, + "4": 0.025238037109375, + "5": 0.04180908203125, + "6": 0.0281829833984375, + "7": 0.01885986328125 + } + }, + "k_zero_point": { + "0": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "1": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "2": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "3": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "4": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "5": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "6": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "7": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "8": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "9": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "10": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "11": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "12": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 
0.0 + }, + "13": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "14": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "15": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "16": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "17": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "18": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "19": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "20": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "21": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "22": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "23": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "24": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "25": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "26": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "27": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "28": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "29": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "30": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "31": { 
+ "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + } + }, + "v_zero_point": { + "0": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "1": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "2": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "3": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "4": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "5": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "6": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "7": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "8": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "9": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "10": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "11": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "12": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "13": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "14": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "15": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "16": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "17": { + 
"0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "18": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "19": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "20": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "21": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "22": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "23": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "24": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "25": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "26": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "27": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "28": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "29": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "30": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "31": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + } + } + } + } +} \ No newline at end of file diff --git a/examples/int8/work_dir/cmmlu/kv_cache_scales_layer_level.json b/examples/int8/work_dir/cmmlu/kv_cache_scales_layer_level.json new file mode 100644 index 000000000000..d290015a0879 --- /dev/null +++ b/examples/int8/work_dir/cmmlu/kv_cache_scales_layer_level.json @@ -0,0 +1,400 @@ +{ + "model_type": 
"llama", + "kv_cache": { + "dtype": "int8", + "scaling_factor": { + "k_scale": { + "0": { + "0": 0.129150390625 + }, + "1": { + "0": 0.1142578125 + }, + "2": { + "0": 0.1436767578125 + }, + "3": { + "0": 0.1578369140625 + }, + "4": { + "0": 0.1759033203125 + }, + "5": { + "0": 0.154296875 + }, + "6": { + "0": 0.1495361328125 + }, + "7": { + "0": 0.1649169921875 + }, + "8": { + "0": 0.16845703125 + }, + "9": { + "0": 0.1705322265625 + }, + "10": { + "0": 0.1656494140625 + }, + "11": { + "0": 0.16943359375 + }, + "12": { + "0": 0.164794921875 + }, + "13": { + "0": 0.1494140625 + }, + "14": { + "0": 0.156982421875 + }, + "15": { + "0": 0.1583251953125 + }, + "16": { + "0": 0.176025390625 + }, + "17": { + "0": 0.14404296875 + }, + "18": { + "0": 0.1514892578125 + }, + "19": { + "0": 0.1611328125 + }, + "20": { + "0": 0.167724609375 + }, + "21": { + "0": 0.1746826171875 + }, + "22": { + "0": 0.1671142578125 + }, + "23": { + "0": 0.1583251953125 + }, + "24": { + "0": 0.1651611328125 + }, + "25": { + "0": 0.1749267578125 + }, + "26": { + "0": 0.1690673828125 + }, + "27": { + "0": 0.166259765625 + }, + "28": { + "0": 0.171875 + }, + "29": { + "0": 0.274658203125 + }, + "30": { + "0": 0.154296875 + }, + "31": { + "0": 0.1710205078125 + } + }, + "v_scale": { + "0": { + "0": 0.0043487548828125 + }, + "1": { + "0": 0.026824951171875 + }, + "2": { + "0": 0.0152435302734375 + }, + "3": { + "0": 0.0201873779296875 + }, + "4": { + "0": 0.0155487060546875 + }, + "5": { + "0": 0.018310546875 + }, + "6": { + "0": 0.026275634765625 + }, + "7": { + "0": 0.021453857421875 + }, + "8": { + "0": 0.0244903564453125 + }, + "9": { + "0": 0.027191162109375 + }, + "10": { + "0": 0.0209503173828125 + }, + "11": { + "0": 0.0244903564453125 + }, + "12": { + "0": 0.0201873779296875 + }, + "13": { + "0": 0.026458740234375 + }, + "14": { + "0": 0.022308349609375 + }, + "15": { + "0": 0.03277587890625 + }, + "16": { + "0": 0.024322509765625 + }, + "17": { + "0": 0.019134521484375 + }, + "18": { + "0": 
0.028045654296875 + }, + "19": { + "0": 0.028289794921875 + }, + "20": { + "0": 0.0242919921875 + }, + "21": { + "0": 0.0261688232421875 + }, + "22": { + "0": 0.034423828125 + }, + "23": { + "0": 0.02978515625 + }, + "24": { + "0": 0.0433349609375 + }, + "25": { + "0": 0.034942626953125 + }, + "26": { + "0": 0.037017822265625 + }, + "27": { + "0": 0.04461669921875 + }, + "28": { + "0": 0.04510498046875 + }, + "29": { + "0": 0.056243896484375 + }, + "30": { + "0": 0.0516357421875 + }, + "31": { + "0": 0.051513671875 + } + }, + "k_zero_point": { + "0": { + "0": 0.0 + }, + "1": { + "0": 0.0 + }, + "2": { + "0": 0.0 + }, + "3": { + "0": 0.0 + }, + "4": { + "0": 0.0 + }, + "5": { + "0": 0.0 + }, + "6": { + "0": 0.0 + }, + "7": { + "0": 0.0 + }, + "8": { + "0": 0.0 + }, + "9": { + "0": 0.0 + }, + "10": { + "0": 0.0 + }, + "11": { + "0": 0.0 + }, + "12": { + "0": 0.0 + }, + "13": { + "0": 0.0 + }, + "14": { + "0": 0.0 + }, + "15": { + "0": 0.0 + }, + "16": { + "0": 0.0 + }, + "17": { + "0": 0.0 + }, + "18": { + "0": 0.0 + }, + "19": { + "0": 0.0 + }, + "20": { + "0": 0.0 + }, + "21": { + "0": 0.0 + }, + "22": { + "0": 0.0 + }, + "23": { + "0": 0.0 + }, + "24": { + "0": 0.0 + }, + "25": { + "0": 0.0 + }, + "26": { + "0": 0.0 + }, + "27": { + "0": 0.0 + }, + "28": { + "0": 0.0 + }, + "29": { + "0": 0.0 + }, + "30": { + "0": 0.0 + }, + "31": { + "0": 0.0 + } + }, + "v_zero_point": { + "0": { + "0": 0.0 + }, + "1": { + "0": 0.0 + }, + "2": { + "0": 0.0 + }, + "3": { + "0": 0.0 + }, + "4": { + "0": 0.0 + }, + "5": { + "0": 0.0 + }, + "6": { + "0": 0.0 + }, + "7": { + "0": 0.0 + }, + "8": { + "0": 0.0 + }, + "9": { + "0": 0.0 + }, + "10": { + "0": 0.0 + }, + "11": { + "0": 0.0 + }, + "12": { + "0": 0.0 + }, + "13": { + "0": 0.0 + }, + "14": { + "0": 0.0 + }, + "15": { + "0": 0.0 + }, + "16": { + "0": 0.0 + }, + "17": { + "0": 0.0 + }, + "18": { + "0": 0.0 + }, + "19": { + "0": 0.0 + }, + "20": { + "0": 0.0 + }, + "21": { + "0": 0.0 + }, + "22": { + "0": 0.0 + }, + "23": { + 
"0": 0.0 + }, + "24": { + "0": 0.0 + }, + "25": { + "0": 0.0 + }, + "26": { + "0": 0.0 + }, + "27": { + "0": 0.0 + }, + "28": { + "0": 0.0 + }, + "29": { + "0": 0.0 + }, + "30": { + "0": 0.0 + }, + "31": { + "0": 0.0 + } + } + } + } +} \ No newline at end of file diff --git a/examples/int8/work_dir/cmmlu/kv_cache_scales_quant_group128.json b/examples/int8/work_dir/cmmlu/kv_cache_scales_quant_group128.json new file mode 100644 index 000000000000..bc48120494ad --- /dev/null +++ b/examples/int8/work_dir/cmmlu/kv_cache_scales_quant_group128.json @@ -0,0 +1,1296 @@ +{ + "model_type": "llama", + "kv_cache": { + "dtype": "int8", + "scaling_factor": { + "k_scale": { + "0": { + "0": 0.09710693359375, + "1": 0.0758056640625, + "2": 0.08447265625, + "3": 0.060791015625, + "4": 0.07232666015625, + "5": 0.061859130859375, + "6": 0.129150390625, + "7": 0.050537109375 + }, + "1": { + "0": 0.0867919921875, + "1": 0.1142578125, + "2": 0.11248779296875, + "3": 0.07928466796875, + "4": 0.10260009765625, + "5": 0.07891845703125, + "6": 0.08843994140625, + "7": 0.088134765625 + }, + "2": { + "0": 0.1234130859375, + "1": 0.1436767578125, + "2": 0.10101318359375, + "3": 0.136474609375, + "4": 0.1077880859375, + "5": 0.12548828125, + "6": 0.14208984375, + "7": 0.1260986328125 + }, + "3": { + "0": 0.130615234375, + "1": 0.0906982421875, + "2": 0.1544189453125, + "3": 0.1322021484375, + "4": 0.1292724609375, + "5": 0.11865234375, + "6": 0.12115478515625, + "7": 0.1578369140625 + }, + "4": { + "0": 0.0823974609375, + "1": 0.13427734375, + "2": 0.1304931640625, + "3": 0.107177734375, + "4": 0.1759033203125, + "5": 0.1121826171875, + "6": 0.1224365234375, + "7": 0.1279296875 + }, + "5": { + "0": 0.10528564453125, + "1": 0.1143798828125, + "2": 0.12091064453125, + "3": 0.1102294921875, + "4": 0.10528564453125, + "5": 0.11883544921875, + "6": 0.154296875, + "7": 0.10833740234375 + }, + "6": { + "0": 0.1243896484375, + "1": 0.1436767578125, + "2": 0.1385498046875, + "3": 0.097412109375, + "4": 
0.145751953125, + "5": 0.1341552734375, + "6": 0.1138916015625, + "7": 0.1495361328125 + }, + "7": { + "0": 0.10784912109375, + "1": 0.158935546875, + "2": 0.13037109375, + "3": 0.150146484375, + "4": 0.144775390625, + "5": 0.1282958984375, + "6": 0.12060546875, + "7": 0.1649169921875 + }, + "8": { + "0": 0.12249755859375, + "1": 0.11175537109375, + "2": 0.16845703125, + "3": 0.15673828125, + "4": 0.15185546875, + "5": 0.1287841796875, + "6": 0.12408447265625, + "7": 0.1378173828125 + }, + "9": { + "0": 0.138916015625, + "1": 0.12432861328125, + "2": 0.1705322265625, + "3": 0.12744140625, + "4": 0.1375732421875, + "5": 0.1251220703125, + "6": 0.10479736328125, + "7": 0.11761474609375 + }, + "10": { + "0": 0.160400390625, + "1": 0.1297607421875, + "2": 0.109619140625, + "3": 0.11236572265625, + "4": 0.1092529296875, + "5": 0.1656494140625, + "6": 0.1322021484375, + "7": 0.148193359375 + }, + "11": { + "0": 0.1427001953125, + "1": 0.1295166015625, + "2": 0.1356201171875, + "3": 0.131591796875, + "4": 0.12493896484375, + "5": 0.1319580078125, + "6": 0.16943359375, + "7": 0.11932373046875 + }, + "12": { + "0": 0.14013671875, + "1": 0.1519775390625, + "2": 0.16259765625, + "3": 0.11212158203125, + "4": 0.1510009765625, + "5": 0.164794921875, + "6": 0.153564453125, + "7": 0.15380859375 + }, + "13": { + "0": 0.130615234375, + "1": 0.1488037109375, + "2": 0.144775390625, + "3": 0.13134765625, + "4": 0.140869140625, + "5": 0.1494140625, + "6": 0.1221923828125, + "7": 0.12646484375 + }, + "14": { + "0": 0.1373291015625, + "1": 0.128173828125, + "2": 0.1529541015625, + "3": 0.156982421875, + "4": 0.1513671875, + "5": 0.142333984375, + "6": 0.1142578125, + "7": 0.139404296875 + }, + "15": { + "0": 0.131591796875, + "1": 0.1519775390625, + "2": 0.1314697265625, + "3": 0.1405029296875, + "4": 0.083740234375, + "5": 0.1583251953125, + "6": 0.142578125, + "7": 0.115478515625 + }, + "16": { + "0": 0.151123046875, + "1": 0.16748046875, + "2": 0.13525390625, + "3": 0.176025390625, + 
"4": 0.1263427734375, + "5": 0.12164306640625, + "6": 0.1146240234375, + "7": 0.1468505859375 + }, + "17": { + "0": 0.13330078125, + "1": 0.14404296875, + "2": 0.09356689453125, + "3": 0.13134765625, + "4": 0.11395263671875, + "5": 0.1414794921875, + "6": 0.1258544921875, + "7": 0.1365966796875 + }, + "18": { + "0": 0.1221923828125, + "1": 0.117919921875, + "2": 0.1260986328125, + "3": 0.1317138671875, + "4": 0.133056640625, + "5": 0.1214599609375, + "6": 0.1514892578125, + "7": 0.142822265625 + }, + "19": { + "0": 0.10406494140625, + "1": 0.1207275390625, + "2": 0.1611328125, + "3": 0.1236572265625, + "4": 0.07635498046875, + "5": 0.11181640625, + "6": 0.1402587890625, + "7": 0.114990234375 + }, + "20": { + "0": 0.09686279296875, + "1": 0.167724609375, + "2": 0.12152099609375, + "3": 0.115478515625, + "4": 0.1357421875, + "5": 0.148681640625, + "6": 0.12841796875, + "7": 0.1297607421875 + }, + "21": { + "0": 0.133056640625, + "1": 0.103515625, + "2": 0.1688232421875, + "3": 0.130615234375, + "4": 0.156494140625, + "5": 0.141845703125, + "6": 0.1746826171875, + "7": 0.1171875 + }, + "22": { + "0": 0.1239013671875, + "1": 0.1671142578125, + "2": 0.1405029296875, + "3": 0.12139892578125, + "4": 0.10504150390625, + "5": 0.13525390625, + "6": 0.1365966796875, + "7": 0.137939453125 + }, + "23": { + "0": 0.1583251953125, + "1": 0.106689453125, + "2": 0.135009765625, + "3": 0.12396240234375, + "4": 0.1519775390625, + "5": 0.1124267578125, + "6": 0.157470703125, + "7": 0.150634765625 + }, + "24": { + "0": 0.1390380859375, + "1": 0.1650390625, + "2": 0.1651611328125, + "3": 0.14892578125, + "4": 0.10906982421875, + "5": 0.1329345703125, + "6": 0.1268310546875, + "7": 0.125244140625 + }, + "25": { + "0": 0.10833740234375, + "1": 0.11297607421875, + "2": 0.1324462890625, + "3": 0.10107421875, + "4": 0.1749267578125, + "5": 0.1748046875, + "6": 0.1683349609375, + "7": 0.160400390625 + }, + "26": { + "0": 0.1690673828125, + "1": 0.1575927734375, + "2": 0.1141357421875, + "3": 
0.110595703125, + "4": 0.1622314453125, + "5": 0.146484375, + "6": 0.14892578125, + "7": 0.1351318359375 + }, + "27": { + "0": 0.15283203125, + "1": 0.109375, + "2": 0.1556396484375, + "3": 0.1292724609375, + "4": 0.15576171875, + "5": 0.1287841796875, + "6": 0.166259765625, + "7": 0.1165771484375 + }, + "28": { + "0": 0.1427001953125, + "1": 0.158203125, + "2": 0.171875, + "3": 0.122802734375, + "4": 0.1405029296875, + "5": 0.1365966796875, + "6": 0.11749267578125, + "7": 0.142578125 + }, + "29": { + "0": 0.150634765625, + "1": 0.163818359375, + "2": 0.1285400390625, + "3": 0.150390625, + "4": 0.12396240234375, + "5": 0.11358642578125, + "6": 0.274658203125, + "7": 0.1580810546875 + }, + "30": { + "0": 0.11138916015625, + "1": 0.154296875, + "2": 0.1444091796875, + "3": 0.1414794921875, + "4": 0.11236572265625, + "5": 0.12493896484375, + "6": 0.154052734375, + "7": 0.1358642578125 + }, + "31": { + "0": 0.1710205078125, + "1": 0.11517333984375, + "2": 0.140869140625, + "3": 0.1385498046875, + "4": 0.125244140625, + "5": 0.125244140625, + "6": 0.1353759765625, + "7": 0.114501953125 + } + }, + "v_scale": { + "0": { + "0": 0.00298309326171875, + "1": 0.003971099853515625, + "2": 0.003414154052734375, + "3": 0.002643585205078125, + "4": 0.0043487548828125, + "5": 0.002460479736328125, + "6": 0.003627777099609375, + "7": 0.003814697265625 + }, + "1": { + "0": 0.006793975830078125, + "1": 0.0030689239501953125, + "2": 0.014373779296875, + "3": 0.004055023193359375, + "4": 0.00592041015625, + "5": 0.005184173583984375, + "6": 0.007373809814453125, + "7": 0.026824951171875 + }, + "2": { + "0": 0.0087738037109375, + "1": 0.01396942138671875, + "2": 0.00943756103515625, + "3": 0.01032257080078125, + "4": 0.0102691650390625, + "5": 0.01082611083984375, + "6": 0.0152435302734375, + "7": 0.0101165771484375 + }, + "3": { + "0": 0.01079559326171875, + "1": 0.01373291015625, + "2": 0.0156707763671875, + "3": 0.0095062255859375, + "4": 0.0201873779296875, + "5": 0.0190277099609375, 
+ "6": 0.0125732421875, + "7": 0.0189056396484375 + }, + "4": { + "0": 0.015228271484375, + "1": 0.0102081298828125, + "2": 0.01192474365234375, + "3": 0.0155487060546875, + "4": 0.01190948486328125, + "5": 0.0099639892578125, + "6": 0.01546478271484375, + "7": 0.011322021484375 + }, + "5": { + "0": 0.018310546875, + "1": 0.01345062255859375, + "2": 0.01708984375, + "3": 0.0132598876953125, + "4": 0.01235198974609375, + "5": 0.01611328125, + "6": 0.01474761962890625, + "7": 0.0180206298828125 + }, + "6": { + "0": 0.0174102783203125, + "1": 0.016510009765625, + "2": 0.0149993896484375, + "3": 0.022216796875, + "4": 0.0195159912109375, + "5": 0.015899658203125, + "6": 0.026275634765625, + "7": 0.01067352294921875 + }, + "7": { + "0": 0.0176239013671875, + "1": 0.021453857421875, + "2": 0.0161590576171875, + "3": 0.0133819580078125, + "4": 0.018218994140625, + "5": 0.0126190185546875, + "6": 0.015167236328125, + "7": 0.0189361572265625 + }, + "8": { + "0": 0.01273345947265625, + "1": 0.0244903564453125, + "2": 0.0146942138671875, + "3": 0.0171966552734375, + "4": 0.0211944580078125, + "5": 0.0153350830078125, + "6": 0.01776123046875, + "7": 0.01528167724609375 + }, + "9": { + "0": 0.01325225830078125, + "1": 0.027191162109375, + "2": 0.021820068359375, + "3": 0.0181884765625, + "4": 0.017578125, + "5": 0.0174407958984375, + "6": 0.016815185546875, + "7": 0.0183868408203125 + }, + "10": { + "0": 0.0172576904296875, + "1": 0.0179595947265625, + "2": 0.0209503173828125, + "3": 0.0142364501953125, + "4": 0.01265716552734375, + "5": 0.013427734375, + "6": 0.0142822265625, + "7": 0.01256561279296875 + }, + "11": { + "0": 0.0167083740234375, + "1": 0.0153961181640625, + "2": 0.0220947265625, + "3": 0.02020263671875, + "4": 0.016143798828125, + "5": 0.0244903564453125, + "6": 0.022979736328125, + "7": 0.018096923828125 + }, + "12": { + "0": 0.0177154541015625, + "1": 0.0160064697265625, + "2": 0.0199737548828125, + "3": 0.015472412109375, + "4": 0.0201873779296875, + "5": 
0.017852783203125, + "6": 0.019195556640625, + "7": 0.019989013671875 + }, + "13": { + "0": 0.0175323486328125, + "1": 0.0196075439453125, + "2": 0.017913818359375, + "3": 0.014923095703125, + "4": 0.021759033203125, + "5": 0.026458740234375, + "6": 0.01788330078125, + "7": 0.024383544921875 + }, + "14": { + "0": 0.01535797119140625, + "1": 0.0171966552734375, + "2": 0.022308349609375, + "3": 0.0172271728515625, + "4": 0.020660400390625, + "5": 0.018585205078125, + "6": 0.0204925537109375, + "7": 0.017059326171875 + }, + "15": { + "0": 0.01763916015625, + "1": 0.015838623046875, + "2": 0.018524169921875, + "3": 0.0154571533203125, + "4": 0.03277587890625, + "5": 0.0167999267578125, + "6": 0.0207977294921875, + "7": 0.0192108154296875 + }, + "16": { + "0": 0.01739501953125, + "1": 0.013916015625, + "2": 0.011749267578125, + "3": 0.024322509765625, + "4": 0.017364501953125, + "5": 0.0190582275390625, + "6": 0.0155181884765625, + "7": 0.01959228515625 + }, + "17": { + "0": 0.01226043701171875, + "1": 0.019134521484375, + "2": 0.01690673828125, + "3": 0.01275634765625, + "4": 0.0161590576171875, + "5": 0.01525115966796875, + "6": 0.0185089111328125, + "7": 0.0167236328125 + }, + "18": { + "0": 0.012542724609375, + "1": 0.028045654296875, + "2": 0.01983642578125, + "3": 0.017364501953125, + "4": 0.0267333984375, + "5": 0.0253448486328125, + "6": 0.0228424072265625, + "7": 0.01751708984375 + }, + "19": { + "0": 0.017852783203125, + "1": 0.0216827392578125, + "2": 0.01611328125, + "3": 0.0212249755859375, + "4": 0.0166168212890625, + "5": 0.028289794921875, + "6": 0.0165252685546875, + "7": 0.0235595703125 + }, + "20": { + "0": 0.0236663818359375, + "1": 0.0242919921875, + "2": 0.0164794921875, + "3": 0.01617431640625, + "4": 0.0223846435546875, + "5": 0.0187530517578125, + "6": 0.019287109375, + "7": 0.02032470703125 + }, + "21": { + "0": 0.013427734375, + "1": 0.0261688232421875, + "2": 0.019439697265625, + "3": 0.0163726806640625, + "4": 0.0186614990234375, + "5": 
0.026153564453125, + "6": 0.0217437744140625, + "7": 0.02288818359375 + }, + "22": { + "0": 0.0301513671875, + "1": 0.01611328125, + "2": 0.01763916015625, + "3": 0.034423828125, + "4": 0.026214599609375, + "5": 0.016937255859375, + "6": 0.019134521484375, + "7": 0.0340576171875 + }, + "23": { + "0": 0.01947021484375, + "1": 0.02978515625, + "2": 0.01812744140625, + "3": 0.02069091796875, + "4": 0.01702880859375, + "5": 0.02423095703125, + "6": 0.019805908203125, + "7": 0.02557373046875 + }, + "24": { + "0": 0.0304107666015625, + "1": 0.0206451416015625, + "2": 0.0209197998046875, + "3": 0.0265350341796875, + "4": 0.0310821533203125, + "5": 0.028411865234375, + "6": 0.0433349609375, + "7": 0.028228759765625 + }, + "25": { + "0": 0.0247344970703125, + "1": 0.0333251953125, + "2": 0.0217742919921875, + "3": 0.034149169921875, + "4": 0.0210113525390625, + "5": 0.034942626953125, + "6": 0.0174713134765625, + "7": 0.0218963623046875 + }, + "26": { + "0": 0.022735595703125, + "1": 0.020111083984375, + "2": 0.01953125, + "3": 0.035980224609375, + "4": 0.02105712890625, + "5": 0.037017822265625, + "6": 0.0195159912109375, + "7": 0.026214599609375 + }, + "27": { + "0": 0.0242767333984375, + "1": 0.040924072265625, + "2": 0.0266571044921875, + "3": 0.0186920166015625, + "4": 0.0287017822265625, + "5": 0.04461669921875, + "6": 0.01934814453125, + "7": 0.0299835205078125 + }, + "28": { + "0": 0.023040771484375, + "1": 0.04510498046875, + "2": 0.028594970703125, + "3": 0.03839111328125, + "4": 0.0296630859375, + "5": 0.0262451171875, + "6": 0.028228759765625, + "7": 0.0162353515625 + }, + "29": { + "0": 0.0257568359375, + "1": 0.029205322265625, + "2": 0.0249481201171875, + "3": 0.0232086181640625, + "4": 0.0340576171875, + "5": 0.056243896484375, + "6": 0.04876708984375, + "7": 0.0292510986328125 + }, + "30": { + "0": 0.0253753662109375, + "1": 0.0239715576171875, + "2": 0.02923583984375, + "3": 0.0516357421875, + "4": 0.0257720947265625, + "5": 0.0286865234375, + "6": 
0.03289794921875, + "7": 0.04339599609375 + }, + "31": { + "0": 0.032928466796875, + "1": 0.0273284912109375, + "2": 0.02728271484375, + "3": 0.051513671875, + "4": 0.02545166015625, + "5": 0.04443359375, + "6": 0.0270538330078125, + "7": 0.0199432373046875 + } + }, + "k_zero_point": { + "0": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "1": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "2": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "3": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "4": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "5": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "6": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "7": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "8": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "9": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "10": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "11": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "12": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "13": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "14": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "15": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + 
"5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "16": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "17": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "18": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "19": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "20": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "21": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "22": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "23": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "24": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "25": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "26": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "27": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "28": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "29": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "30": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "31": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + } + }, + "v_zero_point": { + "0": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "1": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, 
+ "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "2": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "3": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "4": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "5": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "6": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "7": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "8": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "9": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "10": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "11": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "12": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "13": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "14": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "15": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "16": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "17": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "18": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "19": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + 
"7": 0.0 + }, + "20": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "21": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "22": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "23": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "24": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "25": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "26": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "27": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "28": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "29": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "30": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "31": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + } + } + } + } +} \ No newline at end of file diff --git a/examples/int8/work_dir/medmcqa/kv_cache_scales_layer_level.json b/examples/int8/work_dir/medmcqa/kv_cache_scales_layer_level.json new file mode 100644 index 000000000000..301f02d498b1 --- /dev/null +++ b/examples/int8/work_dir/medmcqa/kv_cache_scales_layer_level.json @@ -0,0 +1,400 @@ +{ + "model_type": "llama", + "kv_cache": { + "dtype": "int8", + "scaling_factor": { + "k_scale": { + "0": { + "0": 0.120849609375 + }, + "1": { + "0": 0.122802734375 + }, + "2": { + "0": 0.15283203125 + }, + "3": { + "0": 0.158935546875 + }, + "4": { + "0": 0.16845703125 + }, + "5": { + "0": 0.1571044921875 + }, + "6": { 
+ "0": 0.14501953125 + }, + "7": { + "0": 0.1658935546875 + }, + "8": { + "0": 0.177734375 + }, + "9": { + "0": 0.172119140625 + }, + "10": { + "0": 0.161865234375 + }, + "11": { + "0": 0.156005859375 + }, + "12": { + "0": 0.1600341796875 + }, + "13": { + "0": 0.1513671875 + }, + "14": { + "0": 0.154052734375 + }, + "15": { + "0": 0.148681640625 + }, + "16": { + "0": 0.1746826171875 + }, + "17": { + "0": 0.1436767578125 + }, + "18": { + "0": 0.1470947265625 + }, + "19": { + "0": 0.156494140625 + }, + "20": { + "0": 0.1448974609375 + }, + "21": { + "0": 0.1673583984375 + }, + "22": { + "0": 0.1522216796875 + }, + "23": { + "0": 0.1527099609375 + }, + "24": { + "0": 0.1624755859375 + }, + "25": { + "0": 0.1700439453125 + }, + "26": { + "0": 0.167236328125 + }, + "27": { + "0": 0.1580810546875 + }, + "28": { + "0": 0.1685791015625 + }, + "29": { + "0": 0.279296875 + }, + "30": { + "0": 0.1533203125 + }, + "31": { + "0": 0.168212890625 + } + }, + "v_scale": { + "0": { + "0": 0.0043487548828125 + }, + "1": { + "0": 0.031768798828125 + }, + "2": { + "0": 0.0150146484375 + }, + "3": { + "0": 0.018707275390625 + }, + "4": { + "0": 0.0187225341796875 + }, + "5": { + "0": 0.020416259765625 + }, + "6": { + "0": 0.032012939453125 + }, + "7": { + "0": 0.02264404296875 + }, + "8": { + "0": 0.02203369140625 + }, + "9": { + "0": 0.032928466796875 + }, + "10": { + "0": 0.0215606689453125 + }, + "11": { + "0": 0.025390625 + }, + "12": { + "0": 0.019378662109375 + }, + "13": { + "0": 0.0232696533203125 + }, + "14": { + "0": 0.0233917236328125 + }, + "15": { + "0": 0.033447265625 + }, + "16": { + "0": 0.0192108154296875 + }, + "17": { + "0": 0.022064208984375 + }, + "18": { + "0": 0.0280609130859375 + }, + "19": { + "0": 0.02813720703125 + }, + "20": { + "0": 0.0250091552734375 + }, + "21": { + "0": 0.0285491943359375 + }, + "22": { + "0": 0.0396728515625 + }, + "23": { + "0": 0.0294036865234375 + }, + "24": { + "0": 0.04754638671875 + }, + "25": { + "0": 0.034149169921875 + }, + 
"26": { + "0": 0.0340576171875 + }, + "27": { + "0": 0.044769287109375 + }, + "28": { + "0": 0.04180908203125 + }, + "29": { + "0": 0.058990478515625 + }, + "30": { + "0": 0.05853271484375 + }, + "31": { + "0": 0.06024169921875 + } + }, + "k_zero_point": { + "0": { + "0": 0.0 + }, + "1": { + "0": 0.0 + }, + "2": { + "0": 0.0 + }, + "3": { + "0": 0.0 + }, + "4": { + "0": 0.0 + }, + "5": { + "0": 0.0 + }, + "6": { + "0": 0.0 + }, + "7": { + "0": 0.0 + }, + "8": { + "0": 0.0 + }, + "9": { + "0": 0.0 + }, + "10": { + "0": 0.0 + }, + "11": { + "0": 0.0 + }, + "12": { + "0": 0.0 + }, + "13": { + "0": 0.0 + }, + "14": { + "0": 0.0 + }, + "15": { + "0": 0.0 + }, + "16": { + "0": 0.0 + }, + "17": { + "0": 0.0 + }, + "18": { + "0": 0.0 + }, + "19": { + "0": 0.0 + }, + "20": { + "0": 0.0 + }, + "21": { + "0": 0.0 + }, + "22": { + "0": 0.0 + }, + "23": { + "0": 0.0 + }, + "24": { + "0": 0.0 + }, + "25": { + "0": 0.0 + }, + "26": { + "0": 0.0 + }, + "27": { + "0": 0.0 + }, + "28": { + "0": 0.0 + }, + "29": { + "0": 0.0 + }, + "30": { + "0": 0.0 + }, + "31": { + "0": 0.0 + } + }, + "v_zero_point": { + "0": { + "0": 0.0 + }, + "1": { + "0": 0.0 + }, + "2": { + "0": 0.0 + }, + "3": { + "0": 0.0 + }, + "4": { + "0": 0.0 + }, + "5": { + "0": 0.0 + }, + "6": { + "0": 0.0 + }, + "7": { + "0": 0.0 + }, + "8": { + "0": 0.0 + }, + "9": { + "0": 0.0 + }, + "10": { + "0": 0.0 + }, + "11": { + "0": 0.0 + }, + "12": { + "0": 0.0 + }, + "13": { + "0": 0.0 + }, + "14": { + "0": 0.0 + }, + "15": { + "0": 0.0 + }, + "16": { + "0": 0.0 + }, + "17": { + "0": 0.0 + }, + "18": { + "0": 0.0 + }, + "19": { + "0": 0.0 + }, + "20": { + "0": 0.0 + }, + "21": { + "0": 0.0 + }, + "22": { + "0": 0.0 + }, + "23": { + "0": 0.0 + }, + "24": { + "0": 0.0 + }, + "25": { + "0": 0.0 + }, + "26": { + "0": 0.0 + }, + "27": { + "0": 0.0 + }, + "28": { + "0": 0.0 + }, + "29": { + "0": 0.0 + }, + "30": { + "0": 0.0 + }, + "31": { + "0": 0.0 + } + } + } + } +} \ No newline at end of file diff --git 
a/examples/int8/work_dir/medmcqa/kv_cache_scales_quant_group128.json b/examples/int8/work_dir/medmcqa/kv_cache_scales_quant_group128.json new file mode 100644 index 000000000000..8092192a384f --- /dev/null +++ b/examples/int8/work_dir/medmcqa/kv_cache_scales_quant_group128.json @@ -0,0 +1,1296 @@ +{ + "model_type": "llama", + "kv_cache": { + "dtype": "int8", + "scaling_factor": { + "k_scale": { + "0": { + "0": 0.08660888671875, + "1": 0.0772705078125, + "2": 0.08447265625, + "3": 0.060821533203125, + "4": 0.08056640625, + "5": 0.065185546875, + "6": 0.120849609375, + "7": 0.049957275390625 + }, + "1": { + "0": 0.09552001953125, + "1": 0.122802734375, + "2": 0.11395263671875, + "3": 0.07440185546875, + "4": 0.10003662109375, + "5": 0.07354736328125, + "6": 0.0953369140625, + "7": 0.0927734375 + }, + "2": { + "0": 0.1318359375, + "1": 0.14697265625, + "2": 0.1053466796875, + "3": 0.13427734375, + "4": 0.11004638671875, + "5": 0.15283203125, + "6": 0.1414794921875, + "7": 0.1259765625 + }, + "3": { + "0": 0.127685546875, + "1": 0.09637451171875, + "2": 0.148681640625, + "3": 0.1397705078125, + "4": 0.1314697265625, + "5": 0.11968994140625, + "6": 0.123779296875, + "7": 0.158935546875 + }, + "4": { + "0": 0.08221435546875, + "1": 0.1279296875, + "2": 0.1370849609375, + "3": 0.11151123046875, + "4": 0.16845703125, + "5": 0.115966796875, + "6": 0.12469482421875, + "7": 0.1265869140625 + }, + "5": { + "0": 0.1021728515625, + "1": 0.10986328125, + "2": 0.115234375, + "3": 0.10833740234375, + "4": 0.1046142578125, + "5": 0.1273193359375, + "6": 0.1571044921875, + "7": 0.10577392578125 + }, + "6": { + "0": 0.1220703125, + "1": 0.14453125, + "2": 0.1285400390625, + "3": 0.091552734375, + "4": 0.14501953125, + "5": 0.1336669921875, + "6": 0.11578369140625, + "7": 0.138916015625 + }, + "7": { + "0": 0.1014404296875, + "1": 0.160888671875, + "2": 0.1234130859375, + "3": 0.14306640625, + "4": 0.144287109375, + "5": 0.1239013671875, + "6": 0.11578369140625, + "7": 0.1658935546875 
+ }, + "8": { + "0": 0.121337890625, + "1": 0.114013671875, + "2": 0.177734375, + "3": 0.1527099609375, + "4": 0.1488037109375, + "5": 0.1312255859375, + "6": 0.12213134765625, + "7": 0.123046875 + }, + "9": { + "0": 0.134521484375, + "1": 0.114501953125, + "2": 0.172119140625, + "3": 0.12646484375, + "4": 0.1295166015625, + "5": 0.1241455078125, + "6": 0.097412109375, + "7": 0.1190185546875 + }, + "10": { + "0": 0.160888671875, + "1": 0.12335205078125, + "2": 0.102294921875, + "3": 0.10003662109375, + "4": 0.111328125, + "5": 0.161865234375, + "6": 0.1378173828125, + "7": 0.141845703125 + }, + "11": { + "0": 0.1453857421875, + "1": 0.12744140625, + "2": 0.13720703125, + "3": 0.125732421875, + "4": 0.11956787109375, + "5": 0.1168212890625, + "6": 0.156005859375, + "7": 0.11590576171875 + }, + "12": { + "0": 0.1376953125, + "1": 0.142578125, + "2": 0.1600341796875, + "3": 0.1107177734375, + "4": 0.142822265625, + "5": 0.1583251953125, + "6": 0.149658203125, + "7": 0.1480712890625 + }, + "13": { + "0": 0.1329345703125, + "1": 0.1513671875, + "2": 0.1290283203125, + "3": 0.1282958984375, + "4": 0.1409912109375, + "5": 0.134033203125, + "6": 0.1312255859375, + "7": 0.1298828125 + }, + "14": { + "0": 0.136474609375, + "1": 0.1319580078125, + "2": 0.154052734375, + "3": 0.148193359375, + "4": 0.151123046875, + "5": 0.148193359375, + "6": 0.11151123046875, + "7": 0.137939453125 + }, + "15": { + "0": 0.12890625, + "1": 0.145751953125, + "2": 0.1329345703125, + "3": 0.1441650390625, + "4": 0.09033203125, + "5": 0.148681640625, + "6": 0.1429443359375, + "7": 0.1087646484375 + }, + "16": { + "0": 0.1474609375, + "1": 0.156982421875, + "2": 0.139892578125, + "3": 0.1746826171875, + "4": 0.12445068359375, + "5": 0.1187744140625, + "6": 0.11993408203125, + "7": 0.146484375 + }, + "17": { + "0": 0.13134765625, + "1": 0.142578125, + "2": 0.09649658203125, + "3": 0.1300048828125, + "4": 0.1136474609375, + "5": 0.1436767578125, + "6": 0.134521484375, + "7": 0.1424560546875 + }, + 
"18": { + "0": 0.11810302734375, + "1": 0.1148681640625, + "2": 0.12347412109375, + "3": 0.1246337890625, + "4": 0.1368408203125, + "5": 0.12176513671875, + "6": 0.1470947265625, + "7": 0.1444091796875 + }, + "19": { + "0": 0.11029052734375, + "1": 0.13916015625, + "2": 0.156494140625, + "3": 0.1080322265625, + "4": 0.1002197265625, + "5": 0.11083984375, + "6": 0.1370849609375, + "7": 0.10711669921875 + }, + "20": { + "0": 0.09991455078125, + "1": 0.14306640625, + "2": 0.12371826171875, + "3": 0.10748291015625, + "4": 0.12396240234375, + "5": 0.1448974609375, + "6": 0.1243896484375, + "7": 0.11737060546875 + }, + "21": { + "0": 0.131591796875, + "1": 0.10723876953125, + "2": 0.162353515625, + "3": 0.1279296875, + "4": 0.1546630859375, + "5": 0.117431640625, + "6": 0.1673583984375, + "7": 0.11529541015625 + }, + "22": { + "0": 0.1201171875, + "1": 0.1522216796875, + "2": 0.1361083984375, + "3": 0.124755859375, + "4": 0.10540771484375, + "5": 0.1356201171875, + "6": 0.131103515625, + "7": 0.1365966796875 + }, + "23": { + "0": 0.1527099609375, + "1": 0.110107421875, + "2": 0.131103515625, + "3": 0.125244140625, + "4": 0.1494140625, + "5": 0.11151123046875, + "6": 0.1448974609375, + "7": 0.14697265625 + }, + "24": { + "0": 0.12890625, + "1": 0.1519775390625, + "2": 0.1624755859375, + "3": 0.144775390625, + "4": 0.109619140625, + "5": 0.1282958984375, + "6": 0.130615234375, + "7": 0.11474609375 + }, + "25": { + "0": 0.103515625, + "1": 0.11474609375, + "2": 0.1319580078125, + "3": 0.10357666015625, + "4": 0.1700439453125, + "5": 0.163330078125, + "6": 0.1607666015625, + "7": 0.156982421875 + }, + "26": { + "0": 0.167236328125, + "1": 0.1502685546875, + "2": 0.1112060546875, + "3": 0.10662841796875, + "4": 0.153076171875, + "5": 0.143310546875, + "6": 0.1522216796875, + "7": 0.1302490234375 + }, + "27": { + "0": 0.1573486328125, + "1": 0.1109619140625, + "2": 0.1575927734375, + "3": 0.132080078125, + "4": 0.14599609375, + "5": 0.118896484375, + "6": 0.1580810546875, + 
"7": 0.11529541015625 + }, + "28": { + "0": 0.1461181640625, + "1": 0.1492919921875, + "2": 0.1685791015625, + "3": 0.12249755859375, + "4": 0.1383056640625, + "5": 0.1396484375, + "6": 0.12164306640625, + "7": 0.13720703125 + }, + "29": { + "0": 0.1474609375, + "1": 0.1705322265625, + "2": 0.135498046875, + "3": 0.151611328125, + "4": 0.12054443359375, + "5": 0.101806640625, + "6": 0.279296875, + "7": 0.154541015625 + }, + "30": { + "0": 0.11895751953125, + "1": 0.1533203125, + "2": 0.133056640625, + "3": 0.126953125, + "4": 0.11529541015625, + "5": 0.1297607421875, + "6": 0.148193359375, + "7": 0.127197265625 + }, + "31": { + "0": 0.168212890625, + "1": 0.114013671875, + "2": 0.1439208984375, + "3": 0.136962890625, + "4": 0.1204833984375, + "5": 0.12396240234375, + "6": 0.13671875, + "7": 0.11614990234375 + } + }, + "v_scale": { + "0": { + "0": 0.0029659271240234375, + "1": 0.003971099853515625, + "2": 0.003414154052734375, + "3": 0.002643585205078125, + "4": 0.0043487548828125, + "5": 0.0025272369384765625, + "6": 0.0037136077880859375, + "7": 0.004016876220703125 + }, + "1": { + "0": 0.00847625732421875, + "1": 0.00461578369140625, + "2": 0.01242828369140625, + "3": 0.004978179931640625, + "4": 0.006168365478515625, + "5": 0.0064239501953125, + "6": 0.00742340087890625, + "7": 0.031768798828125 + }, + "2": { + "0": 0.01033782958984375, + "1": 0.01303863525390625, + "2": 0.0150146484375, + "3": 0.01058197021484375, + "4": 0.0097808837890625, + "5": 0.01116943359375, + "6": 0.013763427734375, + "7": 0.01071929931640625 + }, + "3": { + "0": 0.0109710693359375, + "1": 0.0140838623046875, + "2": 0.014129638671875, + "3": 0.0092926025390625, + "4": 0.018707275390625, + "5": 0.017669677734375, + "6": 0.01276397705078125, + "7": 0.0162200927734375 + }, + "4": { + "0": 0.0187225341796875, + "1": 0.01049041748046875, + "2": 0.0121612548828125, + "3": 0.01528167724609375, + "4": 0.01213836669921875, + "5": 0.0102386474609375, + "6": 0.01453399658203125, + "7": 
0.0114288330078125 + }, + "5": { + "0": 0.020416259765625, + "1": 0.01222991943359375, + "2": 0.01708984375, + "3": 0.013702392578125, + "4": 0.01094818115234375, + "5": 0.01456451416015625, + "6": 0.01474761962890625, + "7": 0.0168609619140625 + }, + "6": { + "0": 0.022308349609375, + "1": 0.01385498046875, + "2": 0.01641845703125, + "3": 0.0199127197265625, + "4": 0.0190277099609375, + "5": 0.0156402587890625, + "6": 0.032012939453125, + "7": 0.01316070556640625 + }, + "7": { + "0": 0.018829345703125, + "1": 0.02264404296875, + "2": 0.01436614990234375, + "3": 0.01154327392578125, + "4": 0.01702880859375, + "5": 0.01355743408203125, + "6": 0.0137176513671875, + "7": 0.020599365234375 + }, + "8": { + "0": 0.0136871337890625, + "1": 0.02203369140625, + "2": 0.01447296142578125, + "3": 0.01605224609375, + "4": 0.018310546875, + "5": 0.0153350830078125, + "6": 0.01393890380859375, + "7": 0.014129638671875 + }, + "9": { + "0": 0.0133209228515625, + "1": 0.032928466796875, + "2": 0.0215301513671875, + "3": 0.01788330078125, + "4": 0.0189208984375, + "5": 0.01776123046875, + "6": 0.015960693359375, + "7": 0.0211029052734375 + }, + "10": { + "0": 0.0215606689453125, + "1": 0.01837158203125, + "2": 0.0194854736328125, + "3": 0.01348114013671875, + "4": 0.015838623046875, + "5": 0.0142059326171875, + "6": 0.01328277587890625, + "7": 0.01287078857421875 + }, + "11": { + "0": 0.013916015625, + "1": 0.0149383544921875, + "2": 0.0225372314453125, + "3": 0.0179443359375, + "4": 0.0167083740234375, + "5": 0.025390625, + "6": 0.0232086181640625, + "7": 0.0167083740234375 + }, + "12": { + "0": 0.0174102783203125, + "1": 0.0187225341796875, + "2": 0.019378662109375, + "3": 0.016357421875, + "4": 0.01861572265625, + "5": 0.0160675048828125, + "6": 0.01629638671875, + "7": 0.0179901123046875 + }, + "13": { + "0": 0.0167694091796875, + "1": 0.0195465087890625, + "2": 0.01922607421875, + "3": 0.014068603515625, + "4": 0.0232696533203125, + "5": 0.0199127197265625, + "6": 
0.018402099609375, + "7": 0.0228424072265625 + }, + "14": { + "0": 0.014678955078125, + "1": 0.0179443359375, + "2": 0.0233917236328125, + "3": 0.01441192626953125, + "4": 0.0186309814453125, + "5": 0.0172119140625, + "6": 0.022064208984375, + "7": 0.0172271728515625 + }, + "15": { + "0": 0.0164031982421875, + "1": 0.0140380859375, + "2": 0.0165252685546875, + "3": 0.0141143798828125, + "4": 0.033447265625, + "5": 0.01549530029296875, + "6": 0.0190887451171875, + "7": 0.0178680419921875 + }, + "16": { + "0": 0.0174560546875, + "1": 0.01372528076171875, + "2": 0.01229095458984375, + "3": 0.0192108154296875, + "4": 0.0166778564453125, + "5": 0.0188446044921875, + "6": 0.0143585205078125, + "7": 0.0180206298828125 + }, + "17": { + "0": 0.014495849609375, + "1": 0.0176239013671875, + "2": 0.0178375244140625, + "3": 0.01160430908203125, + "4": 0.019195556640625, + "5": 0.0178070068359375, + "6": 0.022064208984375, + "7": 0.015655517578125 + }, + "18": { + "0": 0.01263427734375, + "1": 0.0273590087890625, + "2": 0.0196685791015625, + "3": 0.01497650146484375, + "4": 0.0278167724609375, + "5": 0.0280609130859375, + "6": 0.02093505859375, + "7": 0.0171966552734375 + }, + "19": { + "0": 0.0160369873046875, + "1": 0.0197296142578125, + "2": 0.015350341796875, + "3": 0.0191497802734375, + "4": 0.01480865478515625, + "5": 0.02813720703125, + "6": 0.0171661376953125, + "7": 0.024444580078125 + }, + "20": { + "0": 0.0200958251953125, + "1": 0.0250091552734375, + "2": 0.01971435546875, + "3": 0.017730712890625, + "4": 0.021484375, + "5": 0.02166748046875, + "6": 0.0182647705078125, + "7": 0.0196380615234375 + }, + "21": { + "0": 0.01885986328125, + "1": 0.0285491943359375, + "2": 0.019622802734375, + "3": 0.0174713134765625, + "4": 0.0200653076171875, + "5": 0.0234222412109375, + "6": 0.016998291015625, + "7": 0.0201263427734375 + }, + "22": { + "0": 0.027801513671875, + "1": 0.0157470703125, + "2": 0.0162506103515625, + "3": 0.0396728515625, + "4": 0.021392822265625, + "5": 
0.017303466796875, + "6": 0.0178680419921875, + "7": 0.0277862548828125 + }, + "23": { + "0": 0.01934814453125, + "1": 0.027618408203125, + "2": 0.02386474609375, + "3": 0.024139404296875, + "4": 0.0245361328125, + "5": 0.0228729248046875, + "6": 0.018524169921875, + "7": 0.0294036865234375 + }, + "24": { + "0": 0.02069091796875, + "1": 0.0185394287109375, + "2": 0.020965576171875, + "3": 0.0240631103515625, + "4": 0.0311737060546875, + "5": 0.028594970703125, + "6": 0.04754638671875, + "7": 0.0299072265625 + }, + "25": { + "0": 0.0215911865234375, + "1": 0.032440185546875, + "2": 0.023101806640625, + "3": 0.03173828125, + "4": 0.0209503173828125, + "5": 0.034149169921875, + "6": 0.0172271728515625, + "7": 0.023712158203125 + }, + "26": { + "0": 0.02093505859375, + "1": 0.0275115966796875, + "2": 0.0191192626953125, + "3": 0.0340576171875, + "4": 0.019775390625, + "5": 0.0281524658203125, + "6": 0.01824951171875, + "7": 0.027496337890625 + }, + "27": { + "0": 0.0250701904296875, + "1": 0.04193115234375, + "2": 0.031219482421875, + "3": 0.0193023681640625, + "4": 0.0240478515625, + "5": 0.044769287109375, + "6": 0.020904541015625, + "7": 0.0261688232421875 + }, + "28": { + "0": 0.02288818359375, + "1": 0.04180908203125, + "2": 0.035614013671875, + "3": 0.036590576171875, + "4": 0.032562255859375, + "5": 0.0239105224609375, + "6": 0.0291900634765625, + "7": 0.015716552734375 + }, + "29": { + "0": 0.0214691162109375, + "1": 0.029052734375, + "2": 0.03997802734375, + "3": 0.0264129638671875, + "4": 0.033935546875, + "5": 0.058990478515625, + "6": 0.04669189453125, + "7": 0.0278472900390625 + }, + "30": { + "0": 0.0240020751953125, + "1": 0.0245513916015625, + "2": 0.0372314453125, + "3": 0.05853271484375, + "4": 0.0262451171875, + "5": 0.027801513671875, + "6": 0.036346435546875, + "7": 0.04620361328125 + }, + "31": { + "0": 0.030059814453125, + "1": 0.0284576416015625, + "2": 0.0281982421875, + "3": 0.06024169921875, + "4": 0.0291595458984375, + "5": 
0.045074462890625, + "6": 0.03759765625, + "7": 0.0226287841796875 + } + }, + "k_zero_point": { + "0": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "1": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "2": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "3": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "4": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "5": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "6": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "7": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "8": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "9": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "10": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "11": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "12": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "13": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "14": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "15": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "16": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "17": { + "0": 0.0, + "1": 0.0, + "2": 
0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "18": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "19": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "20": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "21": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "22": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "23": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "24": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "25": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "26": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "27": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "28": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "29": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "30": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "31": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + } + }, + "v_zero_point": { + "0": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "1": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "2": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "3": { + "0": 0.0, + "1": 0.0, + "2": 
0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "4": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "5": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "6": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "7": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "8": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "9": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "10": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "11": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "12": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "13": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "14": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "15": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "16": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "17": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "18": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "19": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "20": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "21": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 
0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "22": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "23": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "24": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "25": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "26": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "27": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "28": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "29": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "30": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + }, + "31": { + "0": 0.0, + "1": 0.0, + "2": 0.0, + "3": 0.0, + "4": 0.0, + "5": 0.0, + "6": 0.0, + "7": 0.0 + } + } + } + } +} \ No newline at end of file diff --git a/examples/int8/work_dir/medqa/kv_cache_scales_layer_level.json b/examples/int8/work_dir/medqa/kv_cache_scales_layer_level.json new file mode 100644 index 000000000000..f8f5c28b64c6 --- /dev/null +++ b/examples/int8/work_dir/medqa/kv_cache_scales_layer_level.json @@ -0,0 +1,400 @@ +{ + "model_type": "llama", + "kv_cache": { + "dtype": "int8", + "scaling_factor": { + "k_scale": { + "0": { + "0": 0.1224365234375 + }, + "1": { + "0": 0.11419677734375 + }, + "2": { + "0": 0.1402587890625 + }, + "3": { + "0": 0.1531982421875 + }, + "4": { + "0": 0.1619873046875 + }, + "5": { + "0": 0.151123046875 + }, + "6": { + "0": 0.14501953125 + }, + "7": { + "0": 0.160400390625 + }, + "8": { + "0": 0.16259765625 + }, + "9": { + "0": 0.1693115234375 + }, + "10": { + "0": 0.1617431640625 + }, + "11": { + "0": 
0.157958984375 + }, + "12": { + "0": 0.1585693359375 + }, + "13": { + "0": 0.1397705078125 + }, + "14": { + "0": 0.1500244140625 + }, + "15": { + "0": 0.1456298828125 + }, + "16": { + "0": 0.1715087890625 + }, + "17": { + "0": 0.146240234375 + }, + "18": { + "0": 0.14599609375 + }, + "19": { + "0": 0.1573486328125 + }, + "20": { + "0": 0.1424560546875 + }, + "21": { + "0": 0.1644287109375 + }, + "22": { + "0": 0.1575927734375 + }, + "23": { + "0": 0.1531982421875 + }, + "24": { + "0": 0.158203125 + }, + "25": { + "0": 0.1683349609375 + }, + "26": { + "0": 0.17041015625 + }, + "27": { + "0": 0.158447265625 + }, + "28": { + "0": 0.165283203125 + }, + "29": { + "0": 0.271728515625 + }, + "30": { + "0": 0.1478271484375 + }, + "31": { + "0": 0.1695556640625 + } + }, + "v_scale": { + "0": { + "0": 0.003971099853515625 + }, + "1": { + "0": 0.026824951171875 + }, + "2": { + "0": 0.01462554931640625 + }, + "3": { + "0": 0.0189666748046875 + }, + "4": { + "0": 0.01531219482421875 + }, + "5": { + "0": 0.0182342529296875 + }, + "6": { + "0": 0.0256195068359375 + }, + "7": { + "0": 0.0216827392578125 + }, + "8": { + "0": 0.0185089111328125 + }, + "9": { + "0": 0.0261688232421875 + }, + "10": { + "0": 0.0210723876953125 + }, + "11": { + "0": 0.0270843505859375 + }, + "12": { + "0": 0.01995849609375 + }, + "13": { + "0": 0.0227203369140625 + }, + "14": { + "0": 0.02325439453125 + }, + "15": { + "0": 0.03546142578125 + }, + "16": { + "0": 0.0186614990234375 + }, + "17": { + "0": 0.0208587646484375 + }, + "18": { + "0": 0.0313720703125 + }, + "19": { + "0": 0.0240936279296875 + }, + "20": { + "0": 0.0233154296875 + }, + "21": { + "0": 0.02655029296875 + }, + "22": { + "0": 0.03662109375 + }, + "23": { + "0": 0.0283660888671875 + }, + "24": { + "0": 0.044097900390625 + }, + "25": { + "0": 0.03369140625 + }, + "26": { + "0": 0.036865234375 + }, + "27": { + "0": 0.03985595703125 + }, + "28": { + "0": 0.0416259765625 + }, + "29": { + "0": 0.05572509765625 + }, + "30": { + "0": 
0.049285888671875 + }, + "31": { + "0": 0.043701171875 + } + }, + "k_zero_point": { + "0": { + "0": 0.0 + }, + "1": { + "0": 0.0 + }, + "2": { + "0": 0.0 + }, + "3": { + "0": 0.0 + }, + "4": { + "0": 0.0 + }, + "5": { + "0": 0.0 + }, + "6": { + "0": 0.0 + }, + "7": { + "0": 0.0 + }, + "8": { + "0": 0.0 + }, + "9": { + "0": 0.0 + }, + "10": { + "0": 0.0 + }, + "11": { + "0": 0.0 + }, + "12": { + "0": 0.0 + }, + "13": { + "0": 0.0 + }, + "14": { + "0": 0.0 + }, + "15": { + "0": 0.0 + }, + "16": { + "0": 0.0 + }, + "17": { + "0": 0.0 + }, + "18": { + "0": 0.0 + }, + "19": { + "0": 0.0 + }, + "20": { + "0": 0.0 + }, + "21": { + "0": 0.0 + }, + "22": { + "0": 0.0 + }, + "23": { + "0": 0.0 + }, + "24": { + "0": 0.0 + }, + "25": { + "0": 0.0 + }, + "26": { + "0": 0.0 + }, + "27": { + "0": 0.0 + }, + "28": { + "0": 0.0 + }, + "29": { + "0": 0.0 + }, + "30": { + "0": 0.0 + }, + "31": { + "0": 0.0 + } + }, + "v_zero_point": { + "0": { + "0": 0.0 + }, + "1": { + "0": 0.0 + }, + "2": { + "0": 0.0 + }, + "3": { + "0": 0.0 + }, + "4": { + "0": 0.0 + }, + "5": { + "0": 0.0 + }, + "6": { + "0": 0.0 + }, + "7": { + "0": 0.0 + }, + "8": { + "0": 0.0 + }, + "9": { + "0": 0.0 + }, + "10": { + "0": 0.0 + }, + "11": { + "0": 0.0 + }, + "12": { + "0": 0.0 + }, + "13": { + "0": 0.0 + }, + "14": { + "0": 0.0 + }, + "15": { + "0": 0.0 + }, + "16": { + "0": 0.0 + }, + "17": { + "0": 0.0 + }, + "18": { + "0": 0.0 + }, + "19": { + "0": 0.0 + }, + "20": { + "0": 0.0 + }, + "21": { + "0": 0.0 + }, + "22": { + "0": 0.0 + }, + "23": { + "0": 0.0 + }, + "24": { + "0": 0.0 + }, + "25": { + "0": 0.0 + }, + "26": { + "0": 0.0 + }, + "27": { + "0": 0.0 + }, + "28": { + "0": 0.0 + }, + "29": { + "0": 0.0 + }, + "30": { + "0": 0.0 + }, + "31": { + "0": 0.0 + } + } + } + } +} \ No newline at end of file diff --git a/examples/int8/work_dir/mmlu/kv_cache_scales_layer_level.json b/examples/int8/work_dir/mmlu/kv_cache_scales_layer_level.json new file mode 100644 index 000000000000..b8710b4d49a1 --- /dev/null 
+++ b/examples/int8/work_dir/mmlu/kv_cache_scales_layer_level.json @@ -0,0 +1,400 @@ +{ + "model_type": "llama", + "kv_cache": { + "dtype": "int8", + "scaling_factor": { + "k_scale": { + "0": { + "0": 0.1231689453125 + }, + "1": { + "0": 0.1207275390625 + }, + "2": { + "0": 0.1434326171875 + }, + "3": { + "0": 0.164794921875 + }, + "4": { + "0": 0.1737060546875 + }, + "5": { + "0": 0.1551513671875 + }, + "6": { + "0": 0.1473388671875 + }, + "7": { + "0": 0.165283203125 + }, + "8": { + "0": 0.16162109375 + }, + "9": { + "0": 0.175537109375 + }, + "10": { + "0": 0.1663818359375 + }, + "11": { + "0": 0.16943359375 + }, + "12": { + "0": 0.166259765625 + }, + "13": { + "0": 0.158935546875 + }, + "14": { + "0": 0.159912109375 + }, + "15": { + "0": 0.159912109375 + }, + "16": { + "0": 0.176025390625 + }, + "17": { + "0": 0.1488037109375 + }, + "18": { + "0": 0.1595458984375 + }, + "19": { + "0": 0.15966796875 + }, + "20": { + "0": 0.1533203125 + }, + "21": { + "0": 0.1688232421875 + }, + "22": { + "0": 0.1658935546875 + }, + "23": { + "0": 0.162109375 + }, + "24": { + "0": 0.1641845703125 + }, + "25": { + "0": 0.173583984375 + }, + "26": { + "0": 0.1690673828125 + }, + "27": { + "0": 0.1658935546875 + }, + "28": { + "0": 0.17578125 + }, + "29": { + "0": 0.284423828125 + }, + "30": { + "0": 0.1527099609375 + }, + "31": { + "0": 0.1712646484375 + } + }, + "v_scale": { + "0": { + "0": 0.0043487548828125 + }, + "1": { + "0": 0.03277587890625 + }, + "2": { + "0": 0.01546478271484375 + }, + "3": { + "0": 0.020416259765625 + }, + "4": { + "0": 0.0179901123046875 + }, + "5": { + "0": 0.019989013671875 + }, + "6": { + "0": 0.026153564453125 + }, + "7": { + "0": 0.024322509765625 + }, + "8": { + "0": 0.0236053466796875 + }, + "9": { + "0": 0.03265380859375 + }, + "10": { + "0": 0.0229949951171875 + }, + "11": { + "0": 0.0252532958984375 + }, + "12": { + "0": 0.0211181640625 + }, + "13": { + "0": 0.0229339599609375 + }, + "14": { + "0": 0.023834228515625 + }, + "15": { + "0": 
0.032012939453125 + }, + "16": { + "0": 0.0228118896484375 + }, + "17": { + "0": 0.0221710205078125 + }, + "18": { + "0": 0.0302886962890625 + }, + "19": { + "0": 0.024627685546875 + }, + "20": { + "0": 0.0256195068359375 + }, + "21": { + "0": 0.027099609375 + }, + "22": { + "0": 0.039886474609375 + }, + "23": { + "0": 0.0297698974609375 + }, + "24": { + "0": 0.0445556640625 + }, + "25": { + "0": 0.033538818359375 + }, + "26": { + "0": 0.03857421875 + }, + "27": { + "0": 0.04254150390625 + }, + "28": { + "0": 0.04425048828125 + }, + "29": { + "0": 0.0560302734375 + }, + "30": { + "0": 0.059844970703125 + }, + "31": { + "0": 0.06732177734375 + } + }, + "k_zero_point": { + "0": { + "0": 0.0 + }, + "1": { + "0": 0.0 + }, + "2": { + "0": 0.0 + }, + "3": { + "0": 0.0 + }, + "4": { + "0": 0.0 + }, + "5": { + "0": 0.0 + }, + "6": { + "0": 0.0 + }, + "7": { + "0": 0.0 + }, + "8": { + "0": 0.0 + }, + "9": { + "0": 0.0 + }, + "10": { + "0": 0.0 + }, + "11": { + "0": 0.0 + }, + "12": { + "0": 0.0 + }, + "13": { + "0": 0.0 + }, + "14": { + "0": 0.0 + }, + "15": { + "0": 0.0 + }, + "16": { + "0": 0.0 + }, + "17": { + "0": 0.0 + }, + "18": { + "0": 0.0 + }, + "19": { + "0": 0.0 + }, + "20": { + "0": 0.0 + }, + "21": { + "0": 0.0 + }, + "22": { + "0": 0.0 + }, + "23": { + "0": 0.0 + }, + "24": { + "0": 0.0 + }, + "25": { + "0": 0.0 + }, + "26": { + "0": 0.0 + }, + "27": { + "0": 0.0 + }, + "28": { + "0": 0.0 + }, + "29": { + "0": 0.0 + }, + "30": { + "0": 0.0 + }, + "31": { + "0": 0.0 + } + }, + "v_zero_point": { + "0": { + "0": 0.0 + }, + "1": { + "0": 0.0 + }, + "2": { + "0": 0.0 + }, + "3": { + "0": 0.0 + }, + "4": { + "0": 0.0 + }, + "5": { + "0": 0.0 + }, + "6": { + "0": 0.0 + }, + "7": { + "0": 0.0 + }, + "8": { + "0": 0.0 + }, + "9": { + "0": 0.0 + }, + "10": { + "0": 0.0 + }, + "11": { + "0": 0.0 + }, + "12": { + "0": 0.0 + }, + "13": { + "0": 0.0 + }, + "14": { + "0": 0.0 + }, + "15": { + "0": 0.0 + }, + "16": { + "0": 0.0 + }, + "17": { + "0": 0.0 + }, + "18": { + "0": 
0.0 + }, + "19": { + "0": 0.0 + }, + "20": { + "0": 0.0 + }, + "21": { + "0": 0.0 + }, + "22": { + "0": 0.0 + }, + "23": { + "0": 0.0 + }, + "24": { + "0": 0.0 + }, + "25": { + "0": 0.0 + }, + "26": { + "0": 0.0 + }, + "27": { + "0": 0.0 + }, + "28": { + "0": 0.0 + }, + "29": { + "0": 0.0 + }, + "30": { + "0": 0.0 + }, + "31": { + "0": 0.0 + } + } + } + } +} \ No newline at end of file diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index d04cbbc0a9ee..82507baf6ac3 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -48,8 +48,9 @@ def paged_attention_v1( max_seq_len: int, alibi_slopes: Optional[torch.Tensor], kv_cache_dtype: str, - k_scale: float, - v_scale: float, + quant_group: Optional[int], + k_scales: torch.Tensor, + v_scales: torch.Tensor, tp_rank: int = 0, blocksparse_local_blocks: int = 0, blocksparse_vert_stride: int = 0, @@ -59,7 +60,8 @@ def paged_attention_v1( torch.ops._C.paged_attention_v1( out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, seq_lens, block_size, max_seq_len, alibi_slopes, kv_cache_dtype, - k_scale, v_scale, tp_rank, blocksparse_local_blocks, + quant_group, k_scales, v_scales, + tp_rank, blocksparse_local_blocks, blocksparse_vert_stride, blocksparse_block_size, blocksparse_head_sliding_step) @@ -80,8 +82,9 @@ def paged_attention_v2( max_seq_len: int, alibi_slopes: Optional[torch.Tensor], kv_cache_dtype: str, - k_scale: float, - v_scale: float, + quant_group: Optional[int], + k_scales: torch.Tensor, + v_scales: torch.Tensor, tp_rank: int = 0, blocksparse_local_blocks: int = 0, blocksparse_vert_stride: int = 0, @@ -91,8 +94,9 @@ def paged_attention_v2( torch.ops._C.paged_attention_v2( out, exp_sum, max_logits, tmp_out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, seq_lens, block_size, max_seq_len, - alibi_slopes, kv_cache_dtype, k_scale, v_scale, tp_rank, - blocksparse_local_blocks, blocksparse_vert_stride, + alibi_slopes, kv_cache_dtype, + quant_group, k_scales, v_scales, + 
tp_rank, blocksparse_local_blocks, blocksparse_vert_stride, blocksparse_block_size, blocksparse_head_sliding_step) @@ -956,12 +960,16 @@ def reshape_and_cache( value_cache: torch.Tensor, slot_mapping: torch.Tensor, kv_cache_dtype: str, - k_scale: float, - v_scale: float, + quant_group: Optional[int], + k_scales: torch.Tensor, + v_scales: torch.Tensor, ) -> None: torch.ops._C_cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, - kv_cache_dtype, k_scale, v_scale) + kv_cache_dtype, + quant_group, + k_scales, + v_scales) def reshape_and_cache_flash( @@ -971,13 +979,16 @@ def reshape_and_cache_flash( value_cache: torch.Tensor, slot_mapping: torch.Tensor, kv_cache_dtype: str, - k_scale: float, - v_scale: float, + quant_group: Optional[int], + k_scales: torch.Tensor, + v_scales: torch.Tensor, ) -> None: torch.ops._C_cache_ops.reshape_and_cache_flash(key, value, key_cache, value_cache, slot_mapping, - kv_cache_dtype, k_scale, - v_scale) + kv_cache_dtype, + quant_group, + k_scales, + v_scales,) def copy_blocks(key_caches: List[torch.Tensor], diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index 28b804f765a3..d97fcf29bde9 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -203,8 +203,9 @@ def reshape_and_cache( value_cache: torch.Tensor, slot_mapping: torch.Tensor, kv_cache_dtype: str, - k_scale: float, - v_scale: float, + quant_group: Optional[int], + k_scales: torch.Tensor, + v_scales: torch.Tensor, ) -> None: assert kv_cache_dtype == "auto" ipex.llm.modules.PagedAttention.reshape_and_cache( diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py index 9089db1126c9..042c387d8c99 100644 --- a/vllm/attention/backends/blocksparse_attn.py +++ b/vllm/attention/backends/blocksparse_attn.py @@ -401,8 +401,9 @@ def forward( value_cache, attn_metadata.slot_mapping, self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, + layer._quant_group, + layer._k_scales, + layer._v_scales, ) if prefill_meta := 
attn_metadata.prefill_metadata: @@ -439,8 +440,9 @@ def forward( self.num_kv_heads, self.scale, self.alibi_slopes, - layer._k_scale, - layer._v_scale, + layer._quant_group, + layer._k_scales, + layer._v_scales, tp_rank=self.tp_rank, blocksparse_local_blocks=self.local_blocks, blocksparse_vert_stride=self.vert_stride, diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 60ed09d0cc44..5b76aced2b8e 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -658,7 +658,7 @@ def forward( NOTE: It in-place updates the output tensor. """ # NOTE(woosuk): FlashAttention does not support FP8 KV cache. - assert layer._k_scale == 1.0 and layer._v_scale == 1.0, ( + assert layer._k_scales.shape != torch.Size([]) and layer._v_scales.shape != torch.Size([]), ( "key/v_scale is not supported in FlashAttention.") assert output is not None, "Output tensor must be provided." @@ -710,8 +710,9 @@ def forward( kv_cache[1], updated_slot_mapping.flatten(), # type: ignore[union-attr] kv_cache_dtype, - layer._k_scale, - layer._v_scale, + layer._quant_group, + layer._k_scales, + layer._v_scales, ) (num_prefill_query_tokens, num_prefill_kv_tokens, @@ -907,3 +908,4 @@ def _get_causal_option(attn_type: str) -> bool: return not (attn_type == AttentionType.ENCODER or attn_type == AttentionType.ENCODER_ONLY or attn_type == AttentionType.ENCODER_DECODER) + diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index b8ffbe6dd64d..36e69a625612 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -828,8 +828,9 @@ def forward( kv_cache[:, 1], attn_metadata.slot_mapping.flatten(), kv_cache_dtype, - layer._k_scale, - layer._v_scale, + layer._quant_group, + layer._k_scales, + layer._v_scales, ) # The FlashInfer api requires data to be in fp8_e4m3 or fp8_e5m2 # to process the cache when the kv_cache_dtype is fp8 @@ -888,8 +889,9 @@ def forward( 
kv_cache, logits_soft_cap=logits_soft_cap, causal=True, - k_scale=layer._k_scale, - v_scale=layer._v_scale, + quant_group=layer._quant_group, + k_scale=layer._k_scales, + v_scale=layer._v_scales, window_left=window_left) if decode_meta := attn_metadata.decode_metadata: assert decode_meta is not None @@ -899,8 +901,9 @@ def forward( kv_cache, sm_scale=softmax_scale, logits_soft_cap=logits_soft_cap, - k_scale=layer._k_scale, - v_scale=layer._v_scale, + quant_group=layer._quant_group, + k_scale=layer._k_scales, + v_scale=layer._v_scales, window_left=window_left) if prefill_output is None and decode_output is not None: diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py index cd729a1c8b27..c409c4f96de1 100644 --- a/vllm/attention/backends/ipex_attn.py +++ b/vllm/attention/backends/ipex_attn.py @@ -193,7 +193,9 @@ def forward( Returns: shape = [num_tokens, num_heads * head_size] """ - assert layer._k_scale == 1.0 and layer._v_scale == 1.0 + # assert layer._k_scales[0] == 1.0 and layer._v_scales[0] == 1.0 + assert layer._k_scales.shape != torch.Size([]) and layer._v_scales.shape != torch.Size([]) + num_tokens, hidden_size = query.shape # Reshape the query, key, and value tensors. query = query.view(-1, self.num_heads, self.head_size) @@ -210,8 +212,9 @@ def forward( value_cache, attn_metadata.slot_mapping.flatten(), self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, + layer._quant_group, + layer._k_scales, + layer._v_scales, ) if attn_metadata.is_prompt: @@ -296,8 +299,9 @@ def forward( max_seq_len, self.alibi_slopes, self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, + layer._quant_group, + layer._k_scales, + layer._v_scales, ) else: # Run PagedAttention V2. @@ -329,8 +333,9 @@ def forward( max_seq_len, self.alibi_slopes, self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, + layer._quant_group, + layer._k_scales, + layer._v_scales, ) # Reshape the output tensor. 
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index e9f2808ff167..ae8b1ac12232 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -457,9 +457,10 @@ def forward( key_cache, value_cache, attn_metadata.slot_mapping, - self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, + self.kv_cache_dtype,\ + layer._quant_group, + layer._k_scales, + layer._v_scales, ) num_prefill_tokens = attn_metadata.num_prefill_tokens @@ -567,8 +568,9 @@ def forward( prefill_meta.max_query_len, self.alibi_slopes, self.sliding_window[0], - layer._k_scale, - layer._v_scale, + layer._quant_group, + layer._k_scales, + layer._v_scales, ) if decode_meta := attn_metadata.decode_metadata: @@ -613,8 +615,9 @@ def forward( max_seq_len, self.alibi_slopes, self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, + layer._quant_group, + layer._k_scales, + layer._v_scales, ) else: output[num_prefill_tokens:] = PagedAttention.forward_decode( @@ -628,8 +631,9 @@ def forward( self.num_kv_heads, self.scale, self.alibi_slopes, - layer._k_scale, - layer._v_scale, + layer._quant_group, + layer._k_scales, + layer._v_scales, ) # Reshape the output tensor. 
diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index 8722d7376795..e5c8ed2008e7 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -454,7 +454,9 @@ def forward( Returns: shape = [num_tokens, num_heads * head_size] """ - assert layer._k_scale == 1.0 and layer._v_scale == 1.0 + # assert layer._k_scales[0] == 1.0 and layer._v_scales[0] == 1.0 + assert layer._k_scales.shape != torch.Size([]) and layer._v_scales.shape != torch.Size([]) + attn_type = self.attn_type if (attn_type == AttentionType.ENCODER and (not attn_metadata.is_all_encoder_attn_metadata_set)): @@ -498,7 +500,7 @@ def forward( PagedAttention.write_to_paged_cache( key, value, key_cache, value_cache, updated_slot_mapping, - self.kv_cache_dtype, layer._k_scale, layer._v_scale) + self.kv_cache_dtype, layer._quant_group, layer._k_scales, layer._v_scales) if attn_type != AttentionType.ENCODER: # Decoder self-attention supports chunked prefill. @@ -572,8 +574,9 @@ def forward( self.num_kv_heads, self.scale, self.alibi_slopes, - layer._k_scale, - layer._v_scale, + layer._quant_group, + layer._k_scales, + layer._v_scales, ) # Reshape the output tensor. diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 38e27434dab2..cfdbe911b58a 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -526,7 +526,7 @@ def forward( # profiling run. 
PagedAttention.write_to_paged_cache( key, value, key_cache, value_cache, updated_slot_mapping, - self.kv_cache_dtype, layer._k_scale, layer._v_scale) + self.kv_cache_dtype, layer._quant_group, layer._k_scales, layer._v_scales) (num_prefill_query_tokens, num_prefill_kv_tokens, num_decode_query_tokens) = \ get_num_prefill_decode_query_kv_tokens(attn_metadata, attn_type) @@ -578,8 +578,9 @@ def forward( prefill_meta.max_query_len, self.alibi_slopes, self.sliding_window, - layer._k_scale, - layer._v_scale, + layer._quant_group, + layer._k_scales, + layer._v_scales, ) assert output[:num_prefill_query_tokens].shape == out.shape output[:num_prefill_query_tokens] = out @@ -605,8 +606,9 @@ def forward( self.num_kv_heads, self.scale, self.alibi_slopes, - layer._k_scale, - layer._v_scale, + layer._quant_group, + layer._k_scales, + layer._v_scales, ) # Reshape the output tensor. diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index c36f8d08eb4a..122c0963e912 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -70,8 +70,21 @@ def __init__( # expect the pre-quantized k/v_scale to be loaded along # with the model weights. 
self.kv_cache_dtype = kv_cache_dtype - self._k_scale = 1.0 - self._v_scale = 1.0 + k_scales_lists = v_scales_lists = [1.0] + # k_scales_lists = [0.16] + # v_scales_lists = [0.005] + self._k_scales = torch.Tensor(k_scales_lists).type(torch.float32).to("cuda") + self._v_scales = torch.Tensor(v_scales_lists).type(torch.float32).to("cuda") + self._quant_group = cache_config.kv_quant_group + if cache_config.cache_dtype.startswith("int8"): + if cache_config.kv_quant_params_path is not None: + k_scales_lists = cache_config.kv_quant_params[0].pop(0) + v_scales_lists = cache_config.kv_quant_params[1].pop(0) + self._k_scales = torch.Tensor(k_scales_lists).type(torch.float32).to("cuda") + self._v_scales = torch.Tensor(v_scales_lists).type(torch.float32).to("cuda") + if self._quant_group !=0: + self._k_scales = self._k_scales.reshape((-1, num_kv_heads, head_size//self._quant_group)) + self._v_scales = self._v_scales.reshape((-1, num_kv_heads, head_size//self._quant_group)) quant_method = quant_config.get_quant_method( self, prefix=prefix) if quant_config else None if quant_method is not None: @@ -135,6 +148,7 @@ def forward( kv_cache: torch.Tensor, attn_metadata: AttentionMetadata, ) -> torch.Tensor: + if self.use_output: output = torch.empty_like(query) hidden_size = query.size(-1) diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py index 076f151ffcb6..e6ea368318fb 100644 --- a/vllm/attention/ops/paged_attn.py +++ b/vllm/attention/ops/paged_attn.py @@ -69,8 +69,9 @@ def write_to_paged_cache( value_cache: torch.Tensor, slot_mapping: torch.Tensor, kv_cache_dtype: str, - k_scale: float, - v_scale: float, + quant_group: Optional[int], + k_scales: torch.Tensor, + v_scales: torch.Tensor, ) -> None: ops.reshape_and_cache( key, @@ -79,8 +80,9 @@ def write_to_paged_cache( value_cache, slot_mapping.flatten(), kv_cache_dtype, - k_scale, - v_scale, + quant_group, + k_scales, + v_scales, ) @staticmethod @@ -95,8 +97,9 @@ def forward_decode( num_kv_heads: int, 
scale: float, alibi_slopes: Optional[torch.Tensor], - k_scale: float, - v_scale: float, + quant_group: Optional[int], + k_scales: torch.Tensor, + v_scales: torch.Tensor, tp_rank: int = 0, blocksparse_local_blocks: int = 0, blocksparse_vert_stride: int = 0, @@ -141,8 +144,9 @@ def forward_decode( max_seq_len, alibi_slopes, kv_cache_dtype, - k_scale, - v_scale, + quant_group, + k_scales, + v_scales, tp_rank, blocksparse_local_blocks, blocksparse_vert_stride, @@ -179,8 +183,9 @@ def forward_decode( max_seq_len, alibi_slopes, kv_cache_dtype, - k_scale, - v_scale, + quant_group, + k_scales, + v_scales, tp_rank, blocksparse_local_blocks, blocksparse_vert_stride, @@ -204,8 +209,9 @@ def forward_prefix( max_query_len: int, alibi_slopes: Optional[torch.Tensor], sliding_window: Optional[int], - k_scale: float, - v_scale: float, + quant_group: Optional[int], + k_scales: torch.Tensor, + v_scales: torch.Tensor, ) -> torch.Tensor: output = torch.empty_like(query) context_attention_fwd( @@ -222,8 +228,9 @@ def forward_prefix( seq_lens_tensor, context_lens, max_query_len, - k_scale, - v_scale, + quant_group, + k_scales, + v_scales, alibi_slopes, sliding_window, ) diff --git a/vllm/config.py b/vllm/config.py index 69577505fc9b..5f80f15cb9f5 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -997,6 +997,9 @@ def __init__( gpu_memory_utilization: float, swap_space: float, cache_dtype: str, + kv_quant_group: Optional[int] = None, + kv_quant_params: Optional[list[float]] = None, + kv_quant_params_path: Optional[str] = None, is_attention_free: bool = False, num_gpu_blocks_override: Optional[int] = None, sliding_window: Optional[int] = None, @@ -1008,6 +1011,9 @@ def __init__( self.swap_space_bytes = swap_space * GiB_bytes self.num_gpu_blocks_override = num_gpu_blocks_override self.cache_dtype = cache_dtype + self.kv_quant_group = kv_quant_group + self.kv_quant_params = kv_quant_params + self.kv_quant_params_path = kv_quant_params_path self.is_attention_free = is_attention_free 
self.sliding_window = sliding_window self.enable_prefix_caching = enable_prefix_caching @@ -1041,6 +1047,12 @@ def _verify_cache_dtype(self) -> None: "memory footprint and boosts the performance. " "Meanwhile, it may cause accuracy drop without a proper " "scaling factor") + elif self.cache_dtype in ("int8", "int8_group0", "int8_group128"): + logger.info( + "Using int8 data type to store kv cache. It reduces the GPU " + "memory footprint and boosts the performance. " + "Meanwhile, it may cause accuracy drop without a proper " + "scaling factor") else: raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}") diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ba58614bf8f9..a9bd9facdaad 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -98,6 +98,9 @@ class EngineArgs: config_format: ConfigFormat = ConfigFormat.AUTO dtype: str = 'auto' kv_cache_dtype: str = 'auto' + kv_quant_group: Optional[int] = 0 + kv_quant_params: Optional[list[float]] = None + kv_quant_params_path: Optional[str] = None quantization_param_path: Optional[str] = None seed: int = 0 max_model_len: Optional[int] = None @@ -345,11 +348,28 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument( '--kv-cache-dtype', type=str, - choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'], + choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3', 'int8'], default=EngineArgs.kv_cache_dtype, help='Data type for kv cache storage. If "auto", will use model ' 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. 
' 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)') + parser.add_argument( + '--kv-quant-group', + type=int, + default=EngineArgs.kv_quant_group, + help='kv cache quantizaiton group when kv cache dtype is int8.') + parser.add_argument( + '--kv-quant-params-path', + type=nullable_str, + default=EngineArgs.kv_quant_params_path, + help='Path to scales and zero points of kv cache quantizaiton ' + 'when kv cache dtype is int8.') + parser.add_argument( + '--kv-quant-params', + type=nullable_str, + default=EngineArgs.kv_quant_params, + help='scales and zero points of kv cache quantizaiton ' + 'when kv cache dtype is int8.') parser.add_argument( '--quantization-param-path', type=nullable_str, @@ -1063,6 +1083,9 @@ def create_engine_config(self, gpu_memory_utilization=self.gpu_memory_utilization, swap_space=self.swap_space, cache_dtype=self.kv_cache_dtype, + kv_quant_params = self.kv_quant_params, + kv_quant_params_path = self.kv_quant_params_path, + kv_quant_group = self.kv_quant_group, is_attention_free=model_config.is_attention_free, num_gpu_blocks_override=self.num_gpu_blocks_override, sliding_window=model_config.get_sliding_window(), diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 2587e3a11dde..365839717a13 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -189,6 +189,15 @@ def get_attn_backend_cls(cls, selected_backend, head_size, dtype, "better performance by setting environment variable " "VLLM_ATTENTION_BACKEND=FLASHINFER") target_backend = _Backend.XFORMERS + elif kv_cache_dtype is not None and \ + kv_cache_dtype.startswith("int8"): + logger.info( + "Cannot use FlashAttention-2 backend for INT8 KV cache.") + logger.warning( + "Please use FlashInfer backend with INT8 KV Cache for " + "better performance by setting environment variable " + "VLLM_ATTENTION_BACKEND=FLASHINFER") + target_backend = _Backend.XFORMERS elif block_size % 16 != 0: logger.info( "Cannot use FlashAttention-2 backend for block size not " diff --git a/vllm/utils.py 
b/vllm/utils.py index 17bffd2846b4..309804909199 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -146,6 +146,9 @@ "half": torch.half, "bfloat16": torch.bfloat16, "float": torch.float, + "int8": torch.uint8, + "int8_group0": torch.uint8, + "int8_group128": torch.uint8, "fp8": torch.uint8, "fp8_e4m3": torch.uint8, "fp8_e5m2": torch.uint8, @@ -581,6 +584,11 @@ def _generate_random_fp8( del tensor_tmp +def _generate_random_int8( + tensor: torch.Tensor, +) -> None: + tensor = torch.randint(-128, 128, tensor.size()) + def get_kv_cache_torch_dtype( cache_dtype: Optional[Union[str, torch.dtype]], model_dtype: Optional[Union[str, torch.dtype]] = None) -> torch.dtype: @@ -596,6 +604,8 @@ def get_kv_cache_torch_dtype( torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_dtype] elif cache_dtype == "fp8": torch_dtype = torch.uint8 + elif cache_dtype.startswith("int8"): + torch_dtype = torch.uint8 else: raise ValueError(f"Invalid kv cache dtype: {cache_dtype}") elif isinstance(cache_dtype, torch.dtype): @@ -634,6 +644,8 @@ def create_kv_caches_with_random_flash( key_value_cache.uniform_(-scale, scale) elif cache_dtype == 'fp8': _generate_random_fp8(key_value_cache, -scale, scale) + elif cache_dtype == 'int8': + _generate_random_int8(key_value_cache) else: raise ValueError( f"Does not support key cache of type {cache_dtype}") @@ -658,6 +670,11 @@ def create_kv_caches_with_random( raise ValueError( f"Does not support key cache of type fp8 with head_size {head_size}" ) + if cache_dtype.startswith("int8") and head_size % 16: + raise ValueError( + f"Does not support key cache of type int8 with head_size {head_size}" + ) + from vllm.platforms import current_platform current_platform.seed_everything(seed) @@ -675,6 +692,8 @@ def create_kv_caches_with_random( key_cache.uniform_(-scale, scale) elif cache_dtype == 'fp8': _generate_random_fp8(key_cache, -scale, scale) + elif cache_dtype == 'int8': + _generate_random_int8(key_cache) else: raise ValueError( f"Does not support key cache of type 
{cache_dtype}") @@ -690,6 +709,8 @@ def create_kv_caches_with_random( value_cache.uniform_(-scale, scale) elif cache_dtype == 'fp8': _generate_random_fp8(value_cache, -scale, scale) + elif cache_dtype == 'int8': + _generate_random_int8(value_cache) else: raise ValueError( f"Does not support value cache of type {cache_dtype}") diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index fd36ea8d8806..fd88b14483e6 100644 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -150,7 +150,8 @@ def forward( shape = [num_tokens, num_heads * head_size] """ # NOTE(woosuk): FlashAttention does not support FP8 KV cache. - assert layer._k_scale == 1.0 and layer._v_scale == 1.0, ( + # assert layer._k_scales[0] == 1.0 and layer._v_scales[0] == 1.0, ( + assert layer._k_scales.shape != torch.Size([]) and layer._v_scales.shape != torch.Size([]), ( "key/v_scale is not supported in FlashAttention.") assert output is not None, "Output tensor must be provided." @@ -182,8 +183,9 @@ def forward( value_cache, attn_metadata.slot_mapping, self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, + layer._quant_group, + layer._k_scales, + layer._v_scales, ) # Compute attention and update output up to `num_actual_tokens`. 
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index e311c14111d4..58877db269f9 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -14,6 +14,7 @@ import torch import torch.distributed import torch.nn as nn +import json from tqdm import tqdm import vllm.envs as envs @@ -1023,6 +1024,10 @@ def __init__( self.pin_memory = is_pin_memory_available() self.kv_cache_dtype = kv_cache_dtype + self.kv_quant_params = self.load_kv_quant_params( + model_config, self.cache_config.kv_quant_params_path + ) if self.kv_cache_dtype.startswith("int8") else None + self.cache_config.kv_quant_params = self.kv_quant_params self.sliding_window = model_config.get_sliding_window() self.block_size = cache_config.block_size self.max_seq_len_to_capture = self.model_config.max_seq_len_to_capture @@ -1100,11 +1105,85 @@ def __init__( self.sampling_metadata_cache: SamplingMetadataCache = \ SamplingMetadataCache() \ if self.parallel_config.pipeline_parallel_size == 1 else None - if hasattr(self, "_builder_cls"): # multi-step model runner does not have `_builder_cls` self.builder = self._builder_cls(weakref.proxy(self)) + def load_kv_quant_params(self, model_config, + kv_quant_params_path: str) -> List[List[float]]: + if model_config is None: + return None + # Remove it when all models support kv cache int8. + architectures = model_config.hf_config.architectures + for arch in architectures: + if arch not in ["LlamaForCausalLM", "LLaMAForCausalLM","ChatGLMModel"]: + raise ValueError( + "KV CACHE INT8 is not supported for model " + f"architectures {arch} for now. 
Supported architectures: " + "LlamaForCausalLM, LLaMAForCausalLM.") + num_layers = model_config.hf_config.num_hidden_layers + kv_quant_params = [] + if kv_quant_params_path is not None: + k_scale: Dict[int, Dict[int, float]] + v_scale: Dict[int, Dict[int, float]] + k_zero_point: Dict[int, Dict[int, float]] + v_zero_point: Dict[int, Dict[int, float]] + with open(kv_quant_params_path) as f: + context = { + "model_type": model_config.hf_text_config.model_type, + "num_hidden_layers": num_layers, + } + schema_dct = json.load(f) + if context: + model_type = context.get("model_type", None) + model_type_schema = schema_dct["model_type"] + if model_type is not None: + assert model_type == schema_dct["model_type"], ( + f"Model type is {model_type} but loaded " + f"scaling factors belonging to different " + f"model type {model_type_schema}!") + k_scale = schema_dct["kv_cache"]["scaling_factor"]["k_scale"] + v_scale = schema_dct["kv_cache"]["scaling_factor"]["v_scale"] + k_zero_point = schema_dct["kv_cache"]["scaling_factor"]["k_zero_point"] + v_zero_point = schema_dct["kv_cache"]["scaling_factor"]["v_zero_point"] + if type(k_scale["0"]) == float: + k_scale_param = list(k_scale.values()) + kv_quant_params.append(k_scale_param) + v_scale_param = list(v_scale.values()) + kv_quant_params.append(v_scale_param) + k_zero_point_param = list(k_zero_point.values()) + kv_quant_params.append(k_zero_point_param) + v_zero_point_param = list(v_zero_point.values()) + kv_quant_params.append(v_zero_point_param) + elif type(k_scale["0"]) == dict: + k_scale_param = [] + for key in k_scale: + k_scale_param.append(list(k_scale[key].values())) + # for n in list(k_scale[key].values()): + # k_scale_param.append(n) + # print("k_scale_param ", k_scale_param) + kv_quant_params.append(k_scale_param) + v_scale_param = [] + for key in v_scale: + v_scale_param.append(list(v_scale[key].values())) + # for n in list(v_scale[key].values()): + # v_scale_param.append(n) + kv_quant_params.append(v_scale_param) + 
k_zero_point_param = [] + for key in k_zero_point: + k_zero_point_param.append(list(k_zero_point[key].values())) + # for n in list(k_zero_point[key].values()): + # k_zero_point_param.append(n) + kv_quant_params.append(k_zero_point_param) + v_zero_point_param = [] + for key in v_zero_point: + v_zero_point_param.append(list(v_zero_point[key].values())) + # for n in list(v_zero_point[key].values()): + # v_zero_point_param.append(n) + kv_quant_params.append(v_zero_point_param) + # print("kv_quant_params ", len(kv_quant_params)) + return kv_quant_params + def load_model(self) -> None: logger.info("Starting to load model %s...", self.model_config.model) with DeviceMemoryProfiler() as m: @@ -1179,6 +1258,34 @@ def load_model(self) -> None: "provided. Defaulting to scaling factors of 1.0. " "This may lead to less accurate results!") + if self.kv_cache_dtype.startswith("int8") and current_platform.is_rocm(): + # Currently only ROCm accepts kv-cache scaling factors + # via quantization_param_path and this will be deprecated + # in the future. + if self.model_config.quantization_param_path is not None: + if callable(getattr(self.model, "load_kv_cache_scales", None)): + warnings.warn( + "Loading kv cache scaling factor from JSON is " + "deprecated and will be removed. Please include " + "kv cache scaling factors in the model checkpoint.", + FutureWarning, + stacklevel=2) + self.model.load_kv_cache_scales( + self.model_config.quantization_param_path) + logger.info("Loaded KV cache scaling factors from %s", + self.model_config.quantization_param_path) + else: + raise RuntimeError( + "Using int8 KV cache and scaling factors provided but " + "model %s does not support loading scaling factors.", + self.model.__class__) + else: + logger.warning( + "Using int8 KV cache but no scaling factors " + "provided. Defaulting to scaling factors of 1.0. 
" + "This may lead to less accurate results!") + + if self.vllm_config.compilation_config.level ==\ CompilationLevel.DYNAMO_AS_IS and supports_dynamo(): backend = self.vllm_config.compilation_config.init_backend(