diff --git a/onnxruntime/core/framework/print_tensor_statistics_utils.h b/onnxruntime/core/framework/print_tensor_statistics_utils.h index 64d60e048a112..0f524f231f13d 100644 --- a/onnxruntime/core/framework/print_tensor_statistics_utils.h +++ b/onnxruntime/core/framework/print_tensor_statistics_utils.h @@ -4,6 +4,7 @@ #include #include "core/framework/print_tensor_utils.h" +#include "core/framework/int2.h" namespace onnxruntime { namespace utils { @@ -94,36 +95,38 @@ void PrintCommonStats(const T* data, size_t count, TensorStatisticsData& tensor_ } } -#define DEF_PRINT_COMMON_STATS_4BIT(FOUR_BIT_TYPE) \ - template <> \ - inline void PrintCommonStats( \ - const FOUR_BIT_TYPE* data, size_t count, TensorStatisticsData&) { \ - using UnpackedType = typename FOUR_BIT_TYPE::UnpackedType; \ - UnpackedType min = data[0].GetElem(0); \ - UnpackedType max = min; \ - for (size_t i = 1; i < count; i++) { \ - auto indices = FOUR_BIT_TYPE::GetTensorElemIndices(i); \ - auto value = data[indices.first].GetElem(indices.second); \ - if (value > max) { \ - max = value; \ - } \ - if (value < min) { \ - min = value; \ - } \ - } \ - \ - std::cout << "Min="; \ - PrintValue(min); \ - \ - std::cout << ",Max="; \ - PrintValue(max); \ +#define DEF_PRINT_COMMON_STATS_PACKED(PACKED_TYPE) \ + template <> \ + inline void PrintCommonStats( \ + const PACKED_TYPE* data, size_t count, TensorStatisticsData&) { \ + using UnpackedType = typename PACKED_TYPE::UnpackedType; \ + UnpackedType min = data[0].GetElem(0); \ + UnpackedType max = min; \ + for (size_t i = 1; i < count; i++) { \ + auto indices = PACKED_TYPE::GetTensorElemIndices(i); \ + auto value = data[indices.first].GetElem(indices.second); \ + if (value > max) { \ + max = value; \ + } \ + if (value < min) { \ + min = value; \ + } \ + } \ + \ + std::cout << "Min="; \ + PrintValue(min); \ + \ + std::cout << ",Max="; \ + PrintValue(max); \ } -DEF_PRINT_COMMON_STATS_4BIT(Int4x2) -DEF_PRINT_COMMON_STATS_4BIT(UInt4x2) +DEF_PRINT_COMMON_STATS_PACKED(Int4x2) +DEF_PRINT_COMMON_STATS_PACKED(UInt4x2) #if !defined(DISABLE_FLOAT4_TYPES) -DEF_PRINT_COMMON_STATS_4BIT(Float4E2M1x2) +DEF_PRINT_COMMON_STATS_PACKED(Float4E2M1x2) #endif +DEF_PRINT_COMMON_STATS_PACKED(Int2x4) +DEF_PRINT_COMMON_STATS_PACKED(UInt2x4) template void PrintHalfStats(const T* data, size_t count) { diff --git a/onnxruntime/core/framework/print_tensor_utils.h b/onnxruntime/core/framework/print_tensor_utils.h index 47be8b8dc2057..0c0f9e2a13cbb 100644 --- a/onnxruntime/core/framework/print_tensor_utils.h +++ b/onnxruntime/core/framework/print_tensor_utils.h @@ -5,6 +5,7 @@ #include #include #include +#include "core/framework/int2.h" namespace onnxruntime { namespace utils { @@ -74,31 +75,33 @@ void PrintCpuTensorSnippet(const T* tensor, int64_t dim0, int64_t dim1, int64_t std::cout << std::endl; } -// 4 BIT TYPE - Print snippet of 2D tensor with shape (dim0, dim1) -#define DEF_PRINT_CPU_TENSOR_SNIPPET_2D_4BIT(FOUR_BIT_TYPE) \ - template <> \ - inline void PrintCpuTensorSnippet(const FOUR_BIT_TYPE* tensor, int64_t dim0, int64_t dim1, \ - int64_t edge_items) { \ - for (int64_t i = 0; i < dim0; i++) { \ - SKIP_NON_EDGE_ITEMS(dim0, i, edge_items); \ - auto indices = FOUR_BIT_TYPE::GetTensorElemIndices(static_cast(i * dim1)); \ - PrintValue(tensor[indices.first].GetElem(indices.second)); \ - for (int64_t j = 1; j < dim1; j++) { \ - SKIP_NON_EDGE_ITEMS_LAST_DIM(dim1, j, edge_items); \ - std::cout << ", "; \ - indices = FOUR_BIT_TYPE::GetTensorElemIndices(static_cast(i * dim1 + j)); \ - PrintValue(tensor[indices.first].GetElem(indices.second)); \ - } \ - std::cout << std::endl; \ - } \ - std::cout << std::endl; \ +// PACKED TYPE - Print snippet of 2D tensor with shape (dim0, dim1) +#define DEF_PRINT_CPU_TENSOR_SNIPPET_2D_PACKED(PACKED_TYPE) \ + template <> \ + inline void PrintCpuTensorSnippet(const PACKED_TYPE* tensor, int64_t dim0, int64_t dim1, \ + int64_t edge_items) { \ + for (int64_t i = 0; i < dim0; i++) { \ + SKIP_NON_EDGE_ITEMS(dim0, i, edge_items); \ + auto indices = PACKED_TYPE::GetTensorElemIndices(static_cast(i * dim1)); \ + PrintValue(tensor[indices.first].GetElem(indices.second)); \ + for (int64_t j = 1; j < dim1; j++) { \ + SKIP_NON_EDGE_ITEMS_LAST_DIM(dim1, j, edge_items); \ + std::cout << ", "; \ + indices = PACKED_TYPE::GetTensorElemIndices(static_cast(i * dim1 + j)); \ + PrintValue(tensor[indices.first].GetElem(indices.second)); \ + } \ + std::cout << std::endl; \ + } \ + std::cout << std::endl; \ } -DEF_PRINT_CPU_TENSOR_SNIPPET_2D_4BIT(Int4x2) -DEF_PRINT_CPU_TENSOR_SNIPPET_2D_4BIT(UInt4x2) +DEF_PRINT_CPU_TENSOR_SNIPPET_2D_PACKED(Int4x2) +DEF_PRINT_CPU_TENSOR_SNIPPET_2D_PACKED(UInt4x2) #if !defined(DISABLE_FLOAT4_TYPES) -DEF_PRINT_CPU_TENSOR_SNIPPET_2D_4BIT(Float4E2M1x2) +DEF_PRINT_CPU_TENSOR_SNIPPET_2D_PACKED(Float4E2M1x2) #endif +DEF_PRINT_CPU_TENSOR_SNIPPET_2D_PACKED(Int2x4) +DEF_PRINT_CPU_TENSOR_SNIPPET_2D_PACKED(UInt2x4) // Print snippet of 3D tensor with shape (dim0, dim1, dim2) template @@ -120,35 +123,37 @@ void PrintCpuTensorSnippet(const T* tensor, int64_t dim0, int64_t dim1, int64_t std::cout << std::endl; } -// 4 BIT TYPE - Print snippet of 3D tensor with shape (dim0, dim1, dim2) -#define DEF_PRINT_CPU_TENSOR_SNIPPET_3D_4BIT(FOUR_BIT_TYPE) \ - template <> \ - inline void PrintCpuTensorSnippet(const FOUR_BIT_TYPE* tensor, int64_t dim0, int64_t dim1, int64_t dim2, \ - int64_t edge_items) { \ - for (int64_t i = 0; i < dim0; i++) { \ - SKIP_NON_EDGE_ITEMS(dim0, i, edge_items); \ - for (int64_t j = 0; j < dim1; j++) { \ - SKIP_NON_EDGE_ITEMS(dim1, j, edge_items); \ - auto indices = FOUR_BIT_TYPE::GetTensorElemIndices(static_cast(i * dim1 * dim2 + j * dim2)); \ - PrintValue(tensor[indices.first].GetElem(indices.second)); \ - for (int64_t k = 1; k < dim2; k++) { \ - SKIP_NON_EDGE_ITEMS_LAST_DIM(dim2, k, edge_items); \ - std::cout << ", "; \ - indices = FOUR_BIT_TYPE::GetTensorElemIndices(static_cast(i * dim1 * dim2 + j * dim2 + k)); \ - PrintValue(tensor[indices.first].GetElem(indices.second)); \ - } \ - std::cout << std::endl; \ - } \ - std::cout << std::endl; \ - } \ - std::cout << std::endl; \ +// PACKED TYPE - Print snippet of 3D tensor with shape (dim0, dim1, dim2) +#define DEF_PRINT_CPU_TENSOR_SNIPPET_3D_PACKED(PACKED_TYPE) \ + template <> \ + inline void PrintCpuTensorSnippet(const PACKED_TYPE* tensor, int64_t dim0, int64_t dim1, int64_t dim2, \ + int64_t edge_items) { \ + for (int64_t i = 0; i < dim0; i++) { \ + SKIP_NON_EDGE_ITEMS(dim0, i, edge_items); \ + for (int64_t j = 0; j < dim1; j++) { \ + SKIP_NON_EDGE_ITEMS(dim1, j, edge_items); \ + auto indices = PACKED_TYPE::GetTensorElemIndices(static_cast(i * dim1 * dim2 + j * dim2)); \ + PrintValue(tensor[indices.first].GetElem(indices.second)); \ + for (int64_t k = 1; k < dim2; k++) { \ + SKIP_NON_EDGE_ITEMS_LAST_DIM(dim2, k, edge_items); \ + std::cout << ", "; \ + indices = PACKED_TYPE::GetTensorElemIndices(static_cast(i * dim1 * dim2 + j * dim2 + k)); \ + PrintValue(tensor[indices.first].GetElem(indices.second)); \ + } \ + std::cout << std::endl; \ + } \ + std::cout << std::endl; \ + } \ + std::cout << std::endl; \ } -DEF_PRINT_CPU_TENSOR_SNIPPET_3D_4BIT(Int4x2) -DEF_PRINT_CPU_TENSOR_SNIPPET_3D_4BIT(UInt4x2) +DEF_PRINT_CPU_TENSOR_SNIPPET_3D_PACKED(Int4x2) +DEF_PRINT_CPU_TENSOR_SNIPPET_3D_PACKED(UInt4x2) #if !defined(DISABLE_FLOAT4_TYPES) -DEF_PRINT_CPU_TENSOR_SNIPPET_3D_4BIT(Float4E2M1x2) +DEF_PRINT_CPU_TENSOR_SNIPPET_3D_PACKED(Float4E2M1x2) #endif +DEF_PRINT_CPU_TENSOR_SNIPPET_3D_PACKED(Int2x4) +DEF_PRINT_CPU_TENSOR_SNIPPET_3D_PACKED(UInt2x4) // Print 2D tensor template @@ -164,28 +169,30 @@ void PrintCpuTensorFull(const T* tensor, int64_t dim0, int64_t dim1) { std::cout << std::endl; } -// 4 BIT TYPE - Print 2D tensor -#define DEF_PRINT_CPU_TENSOR_FULL_2D_4BIT(FOUR_BIT_TYPE) \ - template <> \ - inline void PrintCpuTensorFull(const FOUR_BIT_TYPE* tensor, int64_t dim0, int64_t dim1) { \ - for (int64_t i = 0; i < dim0; i++) { \ - auto indices = FOUR_BIT_TYPE::GetTensorElemIndices(static_cast(i * dim1)); \ - PrintValue(tensor[indices.first].GetElem(indices.second)); \ - for (int64_t j = 1; j < dim1; j++) { \ - std::cout << ", "; \ - indices = FOUR_BIT_TYPE::GetTensorElemIndices(static_cast(i * dim1 + j)); \ - PrintValue(tensor[indices.first].GetElem(indices.second)); \ - } \ - std::cout << std::endl; \ - } \ - std::cout << std::endl; \ +// PACKED TYPE - Print 2D tensor +#define DEF_PRINT_CPU_TENSOR_FULL_2D_PACKED(PACKED_TYPE) \ + template <> \ + inline void PrintCpuTensorFull(const PACKED_TYPE* tensor, int64_t dim0, int64_t dim1) { \ + for (int64_t i = 0; i < dim0; i++) { \ + auto indices = PACKED_TYPE::GetTensorElemIndices(static_cast(i * dim1)); \ + PrintValue(tensor[indices.first].GetElem(indices.second)); \ + for (int64_t j = 1; j < dim1; j++) { \ + std::cout << ", "; \ + indices = PACKED_TYPE::GetTensorElemIndices(static_cast(i * dim1 + j)); \ + PrintValue(tensor[indices.first].GetElem(indices.second)); \ + } \ + std::cout << std::endl; \ + } \ + std::cout << std::endl; \ } -DEF_PRINT_CPU_TENSOR_FULL_2D_4BIT(Int4x2) -DEF_PRINT_CPU_TENSOR_FULL_2D_4BIT(UInt4x2) +DEF_PRINT_CPU_TENSOR_FULL_2D_PACKED(Int4x2) +DEF_PRINT_CPU_TENSOR_FULL_2D_PACKED(UInt4x2) #if !defined(DISABLE_FLOAT4_TYPES) -DEF_PRINT_CPU_TENSOR_FULL_2D_4BIT(Float4E2M1x2) +DEF_PRINT_CPU_TENSOR_FULL_2D_PACKED(Float4E2M1x2) #endif +DEF_PRINT_CPU_TENSOR_FULL_2D_PACKED(Int2x4) +DEF_PRINT_CPU_TENSOR_FULL_2D_PACKED(UInt2x4) // Print 3D tensor template @@ -204,31 +211,33 @@ void PrintCpuTensorFull(const T* tensor, int64_t dim0, int64_t dim1, int64_t dim std::cout << std::endl; } -// 4 BIT TYPE - Print 3D tensor -#define DEF_PRINT_CPU_TENSOR_FULL_3D_4BIT(FOUR_BIT_TYPE) \ - template <> \ - inline void PrintCpuTensorFull(const FOUR_BIT_TYPE* tensor, int64_t dim0, int64_t dim1, int64_t dim2) { \ - for (int64_t i = 0; i < dim0; i++) { \ - for (int64_t j = 0; j < dim1; j++) { \ - auto indices = FOUR_BIT_TYPE::GetTensorElemIndices(static_cast(i * dim1 * dim2 + j * dim2)); \ - PrintValue(tensor[indices.first].GetElem(indices.second)); \ - for (int64_t k = 1; k < dim2; k++) { \ - std::cout << ", "; \ - indices = FOUR_BIT_TYPE::GetTensorElemIndices(static_cast(i * dim1 * dim2 + j * dim2 + k)); \ - PrintValue(tensor[indices.first].GetElem(indices.second)); \ - } \ - std::cout << std::endl; \ - } \ - std::cout << std::endl; \ - } \ - std::cout << std::endl; \ +// PACKED TYPE - Print 3D tensor +#define DEF_PRINT_CPU_TENSOR_FULL_3D_PACKED(PACKED_TYPE) \ + template <> \ + inline void PrintCpuTensorFull(const PACKED_TYPE* tensor, int64_t dim0, int64_t dim1, int64_t dim2) { \ + for (int64_t i = 0; i < dim0; i++) { \ + for (int64_t j = 0; j < dim1; j++) { \ + auto indices = PACKED_TYPE::GetTensorElemIndices(static_cast(i * dim1 * dim2 + j * dim2)); \ + PrintValue(tensor[indices.first].GetElem(indices.second)); \ + for (int64_t k = 1; k < dim2; k++) { \ + std::cout << ", "; \ + indices = PACKED_TYPE::GetTensorElemIndices(static_cast(i * dim1 * dim2 + j * dim2 + k)); \ + PrintValue(tensor[indices.first].GetElem(indices.second)); \ + } \ + std::cout << std::endl; \ + } \ + std::cout << std::endl; \ + } \ + std::cout << std::endl; \ } -DEF_PRINT_CPU_TENSOR_FULL_3D_4BIT(Int4x2) -DEF_PRINT_CPU_TENSOR_FULL_3D_4BIT(UInt4x2) +DEF_PRINT_CPU_TENSOR_FULL_3D_PACKED(Int4x2) +DEF_PRINT_CPU_TENSOR_FULL_3D_PACKED(UInt4x2) #if !defined(DISABLE_FLOAT4_TYPES) -DEF_PRINT_CPU_TENSOR_FULL_3D_4BIT(Float4E2M1x2) +DEF_PRINT_CPU_TENSOR_FULL_3D_PACKED(Float4E2M1x2) #endif +DEF_PRINT_CPU_TENSOR_FULL_3D_PACKED(Int2x4) +DEF_PRINT_CPU_TENSOR_FULL_3D_PACKED(UInt2x4) template void PrintCpuTensor(const onnxruntime::Tensor& tensor,