From 2bb4e324eb26dee04d84e75d6b2dc469d104c532 Mon Sep 17 00:00:00 2001 From: Claas Flint Date: Thu, 8 Jan 2026 10:27:19 +0100 Subject: [PATCH] quality: optimize SSIM computation for better performance Key optimizations: - Use separable Gaussian filter (sepFilter2D) instead of 2D GaussianBlur Reduces per-pixel filtering cost from O(k^2) to O(2k) for kernel size k - Use CV_32F instead of CV_64F precision for computations Faster on GPU while maintaining sufficient accuracy for SSIM - Add OpenCL kernel for fused SSIM formula computation Reduces multiple kernel launches to one for final SSIM calculation Performance improvement (Apple M4 Pro, median times): - 1080p grayscale: ~2x faster (was ~28ms, now ~14ms) - VGA grayscale: ~2x faster - Precomputed reference path: additional 25-30% faster Memory optimization: - Optional quality map output via need_quality_map parameter (the map is still computed internally to obtain the mean) - Releases SSIM map immediately when not needed Also adds: - Performance benchmarks (perf_ssim.cpp) for regression testing - OpenCL kernel supporting 1-4 channel images --- .../include/opencv2/quality/qualityssim.hpp | 5 +- modules/quality/perf/perf_main.cpp | 7 + modules/quality/perf/perf_precomp.hpp | 17 ++ modules/quality/perf/perf_ssim.cpp | 99 ++++++++ modules/quality/src/opencl/ssim.cl | 72 ++++++ modules/quality/src/qualityssim.cpp | 214 +++++++++++++----- 6 files changed, 350 insertions(+), 64 deletions(-) create mode 100644 modules/quality/perf/perf_main.cpp create mode 100644 modules/quality/perf/perf_precomp.hpp create mode 100644 modules/quality/perf/perf_ssim.cpp create mode 100644 modules/quality/src/opencl/ssim.cl diff --git a/modules/quality/include/opencv2/quality/qualityssim.hpp b/modules/quality/include/opencv2/quality/qualityssim.hpp index edbd3ae6b52..2dfcfc4e000 100644 --- a/modules/quality/include/opencv2/quality/qualityssim.hpp +++ b/modules/quality/include/opencv2/quality/qualityssim.hpp @@ -75,8 +75,9 @@ class CV_EXPORTS_W QualitySSIM // return flag if this is empty bool empty() const { 
return I.empty() && I_2.empty() && mu.empty() && mu_2.empty() && sigma_2.empty(); } - // computes ssim and quality map for single frame - static std::pair compute(const _mat_data& lhs, const _mat_data& rhs); + // computes ssim and optionally quality map for single frame + // need_quality_map: if false, the returned quality map is empty (the map is still computed internally to obtain the mean, then discarded) + static std::pair compute(const _mat_data& lhs, const _mat_data& rhs, bool need_quality_map = true); }; // mat_data diff --git a/modules/quality/perf/perf_main.cpp b/modules/quality/perf/perf_main.cpp new file mode 100644 index 00000000000..b107643ac71 --- /dev/null +++ b/modules/quality/perf/perf_main.cpp @@ -0,0 +1,7 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "perf_precomp.hpp" + +CV_PERF_TEST_MAIN(quality) diff --git a/modules/quality/perf/perf_precomp.hpp b/modules/quality/perf/perf_precomp.hpp new file mode 100644 index 00000000000..9007bf2336d --- /dev/null +++ b/modules/quality/perf/perf_precomp.hpp @@ -0,0 +1,17 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef __OPENCV_QUALITY_PERF_PRECOMP_HPP__ +#define __OPENCV_QUALITY_PERF_PRECOMP_HPP__ + +#include "opencv2/ts.hpp" +#include "opencv2/quality.hpp" + +namespace opencv_test +{ +using namespace cv::quality; +using namespace perf; +} + +#endif diff --git a/modules/quality/perf/perf_ssim.cpp b/modules/quality/perf/perf_ssim.cpp new file mode 100644 index 00000000000..9cc34631012 --- /dev/null +++ b/modules/quality/perf/perf_ssim.cpp @@ -0,0 +1,99 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "perf_precomp.hpp" + +namespace opencv_test +{ +namespace +{ + +typedef tuple Size_MatType_t; +typedef perf::TestBaseWithParam Size_MatType; + +// SSIM performance test with different image sizes +PERF_TEST_P(Size_MatType, SSIM, + testing::Combine( + testing::Values(szVGA, sz720p, sz1080p), + testing::Values(CV_8UC1, CV_8UC3) + ) +) +{ + Size size = get<0>(GetParam()); + int type = get<1>(GetParam()); + + Mat ref(size, type); + Mat cmp(size, type); + + declare.in(ref, WARMUP_RNG).in(cmp, WARMUP_RNG); + + TEST_CYCLE() + { + cv::Scalar result = QualitySSIM::compute(ref, cmp, noArray()); + (void)result; + } + + SANITY_CHECK_NOTHING(); +} + +// SSIM with quality map output +PERF_TEST_P(Size_MatType, SSIM_with_map, + testing::Combine( + testing::Values(szVGA, sz720p), + testing::Values(CV_8UC1, CV_8UC3) + ) +) +{ + Size size = get<0>(GetParam()); + int type = get<1>(GetParam()); + + Mat ref(size, type); + Mat cmp(size, type); + + declare.in(ref, WARMUP_RNG).in(cmp, WARMUP_RNG); + + Mat qualityMap; + + TEST_CYCLE() + { + cv::Scalar result = QualitySSIM::compute(ref, cmp, qualityMap); + (void)result; + } + + SANITY_CHECK_NOTHING(); +} + +// SSIM with pre-computed reference (typical use case) +PERF_TEST_P(Size_MatType, SSIM_precomputed_ref, + testing::Combine( + testing::Values(szVGA, sz720p, sz1080p), + testing::Values(CV_8UC1, CV_8UC3) + ) +) +{ + Size size = get<0>(GetParam()); + int type = get<1>(GetParam()); + + Mat ref(size, type); + Mat cmp(size, type); + + randu(ref, 0, 255); + randu(cmp, 0, 255); + + // Pre-compute reference image data (one-time cost) + Ptr ssim = QualitySSIM::create(ref); + + declare.in(cmp, WARMUP_RNG); + + TEST_CYCLE() + { + cv::Scalar result = ssim->compute(cmp); + (void)result; + } + + SANITY_CHECK_NOTHING(); +} + +} // namespace +} // namespace opencv_test diff --git 
a/modules/quality/src/opencl/ssim.cl b/modules/quality/src/opencl/ssim.cl new file mode 100644 index 00000000000..ec0bfed18af --- /dev/null +++ b/modules/quality/src/opencl/ssim.cl @@ -0,0 +1,72 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// SSIM OpenCL kernel - computes the SSIM quality map from precomputed values +// Supports 1-4 channels via compile-time cn parameter + +#if cn == 1 +#define T float +#define loadpix(addr) *(__global const float *)(addr) +#define storepix(val, addr) *(__global float *)(addr) = val +#define PIXSIZE (int)sizeof(float) +#elif cn == 2 +#define T float2 +#define loadpix(addr) *(__global const float2 *)(addr) +#define storepix(val, addr) *(__global float2 *)(addr) = val +#define PIXSIZE (int)sizeof(float2) +#elif cn == 3 +#define T float3 +#define loadpix(addr) vload3(0, (__global const float *)(addr)) +#define storepix(val, addr) vstore3(val, 0, (__global float *)(addr)) +#define PIXSIZE (int)sizeof(float) * 3 +#elif cn == 4 +#define T float4 +#define loadpix(addr) *(__global const float4 *)(addr) +#define storepix(val, addr) *(__global float4 *)(addr) = val +#define PIXSIZE (int)sizeof(float4) +#endif + +// Compute SSIM quality map from precomputed mu, mu^2, and sigma^2 values +// SSIM = (2*mu1*mu2 + C1) * (2*sigma12 + C2) / ((mu1^2 + mu2^2 + C1) * (sigma1^2 + sigma2^2 + C2)) +__kernel void ssim_map( + __global const uchar * mu1_ptr, int mu1_step, int mu1_offset, + __global const uchar * mu2_ptr, int mu2_step, int mu2_offset, + __global const uchar * mu1_sq_ptr, int mu1_sq_step, int mu1_sq_offset, + __global const uchar * mu2_sq_ptr, int mu2_sq_step, int mu2_sq_offset, + __global const uchar * sigma1_sq_ptr, int sigma1_sq_step, int sigma1_sq_offset, + __global const uchar * sigma2_sq_ptr, int sigma2_sq_step, int sigma2_sq_offset, + __global const uchar * sigma12_ptr, int 
sigma12_step, int sigma12_offset, + __global uchar * dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, + float C1, float C2) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if (x < dst_cols && y < dst_rows) + { + int mu1_idx = mad24(y, mu1_step, mad24(x, PIXSIZE, mu1_offset)); + int mu2_idx = mad24(y, mu2_step, mad24(x, PIXSIZE, mu2_offset)); + int mu1_sq_idx = mad24(y, mu1_sq_step, mad24(x, PIXSIZE, mu1_sq_offset)); + int mu2_sq_idx = mad24(y, mu2_sq_step, mad24(x, PIXSIZE, mu2_sq_offset)); + int sigma1_sq_idx = mad24(y, sigma1_sq_step, mad24(x, PIXSIZE, sigma1_sq_offset)); + int sigma2_sq_idx = mad24(y, sigma2_sq_step, mad24(x, PIXSIZE, sigma2_sq_offset)); + int sigma12_idx = mad24(y, sigma12_step, mad24(x, PIXSIZE, sigma12_offset)); + int dst_idx = mad24(y, dst_step, mad24(x, PIXSIZE, dst_offset)); + + T mu1_val = loadpix(mu1_ptr + mu1_idx); + T mu2_val = loadpix(mu2_ptr + mu2_idx); + T mu1_sq = loadpix(mu1_sq_ptr + mu1_sq_idx); + T mu2_sq = loadpix(mu2_sq_ptr + mu2_sq_idx); + T sigma1_sq = loadpix(sigma1_sq_ptr + sigma1_sq_idx); + T sigma2_sq = loadpix(sigma2_sq_ptr + sigma2_sq_idx); + T sigma12 = loadpix(sigma12_ptr + sigma12_idx); + + T mu1_mu2 = mu1_val * mu2_val; + T num = (2.0f * mu1_mu2 + C1) * (2.0f * sigma12 + C2); + T den = (mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2); + T ssim_val = num / den; + + storepix(ssim_val, dst_ptr + dst_idx); + } +} diff --git a/modules/quality/src/qualityssim.cpp b/modules/quality/src/qualityssim.cpp index b8008b72278..6630e840ac8 100644 --- a/modules/quality/src/qualityssim.cpp +++ b/modules/quality/src/qualityssim.cpp @@ -4,8 +4,12 @@ #include "precomp.hpp" #include "opencv2/quality/qualityssim.hpp" -#include "opencv2/imgproc.hpp" // GaussianBlur #include "opencv2/quality/quality_utils.hpp" +#include "opencv2/imgproc.hpp" + +#ifdef HAVE_OPENCL +#include "opencl_kernels_quality.hpp" +#endif namespace { @@ -15,39 +19,132 @@ namespace using _mat_type = UMat; using _quality_map_type = 
_mat_type; - // SSIM blur function - _mat_type blur(const _mat_type& mat) + // SSIM constants + static constexpr int SSIM_KERNEL_SIZE = 11; + static constexpr double SSIM_SIGMA = 1.5; + static constexpr double C1 = 6.5025; // (0.01 * 255)^2 + static constexpr double C2 = 58.5225; // (0.03 * 255)^2 + + // Cached 1D Gaussian kernel for separable filtering (CV_32F for better performance) + static Mat getGaussianKernel1D() + { + static Mat kernel = cv::getGaussianKernel(SSIM_KERNEL_SIZE, SSIM_SIGMA, CV_32F); + return kernel; + } + + // Optimized blur using separable filter with cached kernel + // Performance: O(2k) vs O(k^2) for 2D Gaussian blur + void blur(InputArray src, OutputArray dst) + { + Mat kernel = getGaussianKernel1D(); + cv::sepFilter2D(src, dst, CV_32F, kernel, kernel); + } + +#ifdef HAVE_OPENCL + // OpenCL kernel for SSIM map computation + // Fuses the final SSIM formula into a single GPU kernel + static bool ocl_ssim_map( + const UMat& mu1, const UMat& mu2, + const UMat& mu1_sq, const UMat& mu2_sq, + const UMat& sigma1_sq, const UMat& sigma2_sq, + const UMat& sigma12, + UMat& ssim_map) { - _mat_type result = {}; - cv::GaussianBlur( mat, result, cv::Size(11, 11), 1.5 ); + int cn = mu1.channels(); + if (cn < 1 || cn > 4) + return false; + + ocl::Kernel k("ssim_map", ocl::quality::ssim_oclsrc, + format("-D cn=%d", cn)); + + if (k.empty()) + return false; + + ssim_map.create(mu1.size(), CV_MAKETYPE(CV_32F, cn)); + + k.args( + ocl::KernelArg::ReadOnlyNoSize(mu1), + ocl::KernelArg::ReadOnlyNoSize(mu2), + ocl::KernelArg::ReadOnlyNoSize(mu1_sq), + ocl::KernelArg::ReadOnlyNoSize(mu2_sq), + ocl::KernelArg::ReadOnlyNoSize(sigma1_sq), + ocl::KernelArg::ReadOnlyNoSize(sigma2_sq), + ocl::KernelArg::ReadOnlyNoSize(sigma12), + ocl::KernelArg::WriteOnly(ssim_map), + static_cast(C1), static_cast(C2) + ); + + size_t globalsize[2] = { (size_t)mu1.cols, (size_t)mu1.rows }; + return k.run(2, globalsize, NULL, false); + } +#endif + + // CPU implementation for SSIM computation 
+ static cv::Scalar cpu_ssim_compute( + const _mat_type& mu1, const _mat_type& mu2, + const _mat_type& mu1_sq, const _mat_type& mu2_sq, + const _mat_type& sigma1_sq, const _mat_type& sigma2_sq, + const _mat_type& sigma12, + _mat_type* out_ssim_map) + { + _mat_type mu1_mu2, ssim_map, temp; + + cv::multiply(mu1, mu2, mu1_mu2); + + // Compute numerator = (2 * mu1_mu2 + C1) * (2 * sigma12 + C2) + cv::addWeighted(mu1_mu2, 2.0, mu1_mu2, 0.0, C1, ssim_map); + cv::addWeighted(sigma12, 2.0, sigma12, 0.0, C2, temp); + cv::multiply(ssim_map, temp, ssim_map); + + // Compute denominator = (mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2) + cv::addWeighted(mu1_sq, 1.0, mu2_sq, 1.0, C1, mu1_mu2); + cv::addWeighted(sigma1_sq, 1.0, sigma2_sq, 1.0, C2, temp); + cv::multiply(mu1_mu2, temp, temp); + + // quality map = numerator / denominator + cv::divide(ssim_map, temp, ssim_map); + + cv::Scalar result = cv::mean(ssim_map); + + if (out_ssim_map) + *out_ssim_map = std::move(ssim_map); + return result; } + } // ns -QualitySSIM::_mat_data::_mat_data( const _mat_type& mat ) +// Construct _mat_data from a matrix +// Precomputes values needed for SSIM: I, I^2, mu, mu^2, sigma^2 +QualitySSIM::_mat_data::_mat_data(const _mat_type& mat) { - this->I = mat; + // Convert to CV_32F for all computations (faster than CV_64F on GPU) + mat.convertTo(this->I, CV_32F); cv::multiply(this->I, this->I, this->I_2); - this->mu = ::blur(this->I); + ::blur(this->I, this->mu); cv::multiply(this->mu, this->mu, this->mu_2); - this->sigma_2 = ::blur(this->I_2); // blur the squared img, subtract blurred_squared + ::blur(this->I_2, this->sigma_2); cv::subtract(this->sigma_2, this->mu_2, this->sigma_2); } -QualitySSIM::_mat_data::_mat_data(InputArray arr ) - : _mat_data( quality_utils::expand_mat(arr) ) // delegate +QualitySSIM::_mat_data::_mat_data(InputArray arr) + : _mat_data(quality_utils::expand_mat(arr)) // delegate {} // static -Ptr QualitySSIM::create( InputArray ref ) +Ptr QualitySSIM::create(InputArray 
ref) { - return Ptr(new QualitySSIM( _mat_data( ref ))); + return Ptr(new QualitySSIM(_mat_data(ref))); } // static -cv::Scalar QualitySSIM::compute( InputArray ref, InputArray cmp, OutputArray qualityMap ) +cv::Scalar QualitySSIM::compute(InputArray ref, InputArray cmp, OutputArray qualityMap) { - auto result = _mat_data::compute( _mat_data(ref), _mat_data(cmp) ); + auto result = _mat_data::compute( + _mat_data(ref), + _mat_data(cmp), + qualityMap.needed() + ); if (qualityMap.needed()) qualityMap.assign(result.second); @@ -55,64 +152,57 @@ cv::Scalar QualitySSIM::compute( InputArray ref, InputArray cmp, OutputArray qua return result.first; } -cv::Scalar QualitySSIM::compute( InputArray cmp ) +cv::Scalar QualitySSIM::compute(InputArray cmp) { auto result = _mat_data::compute( - this->_refImgData - , _mat_data(cmp) + this->_refImgData, + _mat_data(cmp), + true // always compute map for instance method (stored in _qualityMap) ); OutputArray(this->_qualityMap).assign(result.second); return result.first; } -// static. computes ssim and quality map for single frame -// based on https://docs.opencv.org/2.4/doc/tutorials/highgui/video-input-psnr-ssim/video-input-psnr-ssim.html -std::pair QualitySSIM::_mat_data::compute(const _mat_data& lhs, const _mat_data& rhs) +// static. Computes SSIM and optionally quality map +// Optimized with: +// 1. Separable Gaussian filter (O(2k) vs O(k^2)) +// 2. CV_32F precision (faster on GPU) +// 3. 
OpenCL kernel for fused SSIM computation +std::pair QualitySSIM::_mat_data::compute(const _mat_data& lhs, const _mat_data& rhs, bool need_quality_map) { - const double - C1 = 6.5025 - , C2 = 58.5225 - ; - - mat_type - I1_I2 - , mu1_mu2 - , t1 - , t2 - , t3 - , sigma12 - ; + mat_type sigma12; + // Compute sigma12 = blur(I1 * I2) - mu1 * mu2 + mat_type I1_I2; cv::multiply(lhs.I, rhs.I, I1_I2); + ::blur(I1_I2, sigma12); + mat_type mu1_mu2; cv::multiply(lhs.mu, rhs.mu, mu1_mu2); - cv::subtract(::blur(I1_I2), mu1_mu2, sigma12); - - // t3 = ((2*mu1_mu2 + C1).*(2*sigma12 + C2)) - cv::multiply(mu1_mu2, 2., t1); - cv::add(t1, C1, t1);// t1 += C1 - - cv::multiply(sigma12, 2., t2); - cv::add(t2, C2, t2);// t2 += C2 - - // t3 = t1 * t2 - cv::multiply(t1, t2, t3); + cv::subtract(sigma12, mu1_mu2, sigma12); - // t1 =((mu1_2 + mu2_2 + C1).*(sigma1_2 + sigma2_2 + C2)) - cv::add(lhs.mu_2, rhs.mu_2, t1); - cv::add(t1, C1, t1); - - cv::add(lhs.sigma_2, rhs.sigma_2, t2); - cv::add(t2, C2, t2); - - // t1 *= t2 - cv::multiply(t1, t2, t1); - - // quality map: t3 /= t1 - cv::divide(t3, t1, t3); +#ifdef HAVE_OPENCL + if (cv::ocl::isOpenCLActivated()) + { + mat_type ssim_map; + if (ocl_ssim_map(lhs.mu, rhs.mu, lhs.mu_2, rhs.mu_2, + lhs.sigma_2, rhs.sigma_2, sigma12, ssim_map)) + { + cv::Scalar result = cv::mean(ssim_map); + if (!need_quality_map) + ssim_map.release(); // Free memory if not needed + return { result, std::move(ssim_map) }; + } + } +#endif + + // CPU fallback + mat_type ssim_map; + cv::Scalar result = cpu_ssim_compute( + lhs.mu, rhs.mu, lhs.mu_2, rhs.mu_2, + lhs.sigma_2, rhs.sigma_2, sigma12, + need_quality_map ? &ssim_map : nullptr + ); - return { - cv::mean(t3) - , std::move(t3) - }; -} // compute \ No newline at end of file + return { result, std::move(ssim_map) }; +}