From e17f4f80055009bf45b37e1b6d1790bd1c1a93b2 Mon Sep 17 00:00:00 2001 From: Pinterf Date: Mon, 17 Feb 2020 14:21:39 +0100 Subject: [PATCH] Fix: clamping ConvertBits 32->8 for extreme out-of-range pixel values --- avs_core/convert/convert.cpp | 11 ++++++----- avs_core/convert/convert_avx2.cpp | 13 +++++++------ distrib/Readme/readme.txt | 26 ++++++++++++++------------ distrib/Readme/readme_history.txt | 2 ++ 4 files changed, 29 insertions(+), 23 deletions(-) diff --git a/avs_core/convert/convert.cpp b/avs_core/convert/convert.cpp index 8caf41dee..b8b0b11dc 100644 --- a/avs_core/convert/convert.cpp +++ b/avs_core/convert/convert.cpp @@ -1592,7 +1592,6 @@ void convert_32_to_uintN_sse41(const BYTE *srcp8, BYTE *dstp8, int src_rowsize, int src_width = src_rowsize / sizeof(float); constexpr int max_pixel_value = (1 << targetbits) - 1; - const __m128i max_pixel_value_128 = _mm_set1_epi16(max_pixel_value); constexpr int limit_lo_d = (fulld ? 0 : 16) << (targetbits - 8); constexpr int limit_hi_d = fulld ? ((1 << targetbits) - 1) : ((chroma ? 
240 : 235) << (targetbits - 8)); @@ -1616,6 +1615,8 @@ void convert_32_to_uintN_sse41(const BYTE *srcp8, BYTE *dstp8, int src_rowsize, const __m128 halfint_plus_rounder_ps = _mm_set1_ps(half_i + 0.5f); const __m128 limit_lo_s_ps = _mm_set1_ps(limit_lo_s / 255.0f); const __m128 limit_lo_plus_rounder_ps = _mm_set1_ps(limit_lo_d + 0.5f); + const __m128 max_dst_pixelvalue = _mm_set1_ps((float)max_pixel_value); // 255, 1023, 4095, 16383, 65535.0 + const __m128 zero = _mm_setzero_ps(); __m128 factor_ps = _mm_set1_ps(factor); // 0-1.0 -> 0..max_pixel_value @@ -1648,17 +1649,17 @@ void convert_32_to_uintN_sse41(const BYTE *srcp8, BYTE *dstp8, int src_rowsize, src_1 = _mm_add_ps(_mm_mul_ps(src_1, factor_ps), limit_lo_plus_rounder_ps); //pixel = (srcp0[x] - limit_lo_s_ps) * factor + half + limit_lo + 0.5f; } + + src_0 = _mm_max_ps(_mm_min_ps(src_0, max_dst_pixelvalue), zero); + src_1 = _mm_max_ps(_mm_min_ps(src_1, max_dst_pixelvalue), zero); result_0 = _mm_cvttps_epi32(src_0); // truncate result_1 = _mm_cvttps_epi32(src_1); if constexpr(sizeof(pixel_t) == 2) { result = _mm_packus_epi32(result_0, result_1); // sse41 - if constexpr(targetbits > 8 && targetbits < 16) { - result = _mm_min_epu16(result, max_pixel_value_128); // sse41, extra clamp for 10, 12, 14 bits - } _mm_store_si128(reinterpret_cast<__m128i *>(dstp + x), result); } else { - result = _mm_packus_epi32(result_0, result_1); + result = _mm_packs_epi32(result_0, result_1); result = _mm_packus_epi16(result, result); // lo 8 byte _mm_storel_epi64(reinterpret_cast<__m128i *>(dstp + x), result); } diff --git a/avs_core/convert/convert_avx2.cpp b/avs_core/convert/convert_avx2.cpp index 5fbda715a..5ba518cb9 100644 --- a/avs_core/convert/convert_avx2.cpp +++ b/avs_core/convert/convert_avx2.cpp @@ -70,7 +70,6 @@ void convert_32_to_uintN_avx2(const BYTE *srcp8, BYTE *dstp8, int src_rowsize, i int src_width = src_rowsize / sizeof(float); const int max_pixel_value = (1 << targetbits) - 1; - const __m256i max_pixel_value_256 = 
_mm256_set1_epi16(max_pixel_value); const int limit_lo_d = (fulld ? 0 : 16) << (targetbits - 8); const int limit_hi_d = fulld ? ((1 << targetbits) - 1) : ((chroma ? 240 : 235) << (targetbits - 8)); @@ -94,6 +93,8 @@ void convert_32_to_uintN_avx2(const BYTE *srcp8, BYTE *dstp8, int src_rowsize, i const __m256 halfint_plus_rounder_ps = _mm256_set1_ps(half_i + 0.5f); const __m256 limit_lo_s_ps = _mm256_set1_ps(limit_lo_s / 255.0f); const __m256 limit_lo_plus_rounder_ps = _mm256_set1_ps(limit_lo_d + 0.5f); + const __m256 max_dst_pixelvalue = _mm256_set1_ps((float)max_pixel_value); // 255, 1023, 4095, 16383, 65535.0 + const __m256 zero = _mm256_setzero_ps(); __m256 factor_ps = _mm256_set1_ps(factor); @@ -126,18 +127,18 @@ void convert_32_to_uintN_avx2(const BYTE *srcp8, BYTE *dstp8, int src_rowsize, i src_1 = _mm256_fmadd_ps(src_1, factor_ps, limit_lo_plus_rounder_ps); //pixel = (srcp0[x] - limit_lo_s_ps) * factor + half + limit_lo + 0.5f; } + + src_0 = _mm256_max_ps(_mm256_min_ps(src_0, max_dst_pixelvalue), zero); + src_1 = _mm256_max_ps(_mm256_min_ps(src_1, max_dst_pixelvalue), zero); result_0 = _mm256_cvttps_epi32(src_0); // truncate result_1 = _mm256_cvttps_epi32(src_1); if constexpr(sizeof(pixel_t) == 2) { result = _mm256_packus_epi32(result_0, result_1); result = _mm256_permute4x64_epi64(result, (0 << 0) | (2 << 2) | (1 << 4) | (3 << 6)); - if (targetbits > 8 && targetbits < 16) { - result = _mm256_min_epu16(result, max_pixel_value_256); // extra clamp for 10, 12, 14 bits - } - _mm256_store_si256(reinterpret_cast<__m256i *>(dstp + x), result); + _mm256_store_si256(reinterpret_cast<__m256i*>(dstp + x), result); } else { - result = _mm256_packus_epi32(result_0, result_1); + result = _mm256_packs_epi32(result_0, result_1); result = _mm256_permute4x64_epi64(result, (0 << 0) | (2 << 2) | (1 << 4) | (3 << 6)); __m128i result128_lo = _mm256_castsi256_si128(result); __m128i result128_hi = _mm256_extractf128_si256(result, 1); diff --git a/distrib/Readme/readme.txt 
b/distrib/Readme/readme.txt index d8e9c15ed..135f79cbe 100644 --- a/distrib/Readme/readme.txt +++ b/distrib/Readme/readme.txt @@ -34,6 +34,8 @@ Short info for plugin writers 20200302 3.5.0 -------------- - New: Native Linux, macOS, and BSD support. +- Fix: ConvertBits 32->8 for extremely out of range float pixel values. + When pixel value in a 32 bit float format video was way out of range and greater than 128 (e.g. instead of 0 to 1.0 for Y plane) then the ConvertBits(8) had artifacts. - Fix potential crash on exit or cache shrink (linux/gcc only?) - Layer: support RGB24 and RGB48 (internally processed as Planar RGB - lossless pre and post conversion) - Fix: RGBP to 444 8-14bit right side artifacts at specific widths @@ -942,14 +944,14 @@ Conditional runtime functions have 10-16 bit/float support for YUV, PlanarRGB an Since RGB is also available as a planar colorspace, the plane statistics functions logically were expanded. New functions -• AverageR, AverageG AverageB like AverageLuma -• RDifference, GDifference, BDifference like LumaDifference(clip1, clip2) -• RDifferenceFromPrevious, GDifferenceFromPrevious, BDifferenceFromPrevious -• RDifferenceToNext, GDifferenceToNext, BDifferenceToNext -• RPlaneMin, GPlaneMin BPlaneMin like YPlaneMin(clip [, float threshold = 0, int offset = 0]) -• RPlaneMax, GPlaneMax BPlaneMax like YPlaneMax(clip [, float threshold = 0, int offset = 0]) -• RPlaneMinMaxDifference, GPlaneMinMaxDifference BPlaneMinMaxDifference like YPlaneMinMaxDifference(clip [, float threshold = 0, int offset = 0]) -• RPlaneMedian, GPlaneMedian, BPlaneMedian like YPlaneMedian(clip [, int offset = 0]) +• AverageR, AverageG AverageB like AverageLuma +• RDifference, GDifference, BDifference like LumaDifference(clip1, clip2) +• RDifferenceFromPrevious, GDifferenceFromPrevious, BDifferenceFromPrevious +• RDifferenceToNext, GDifferenceToNext, BDifferenceToNext +• RPlaneMin, GPlaneMin BPlaneMin like YPlaneMin(clip [, float threshold = 0, int offset = 0]) +• 
RPlaneMax, GPlaneMax BPlaneMax like YPlaneMax(clip [, float threshold = 0, int offset = 0]) +• RPlaneMinMaxDifference, GPlaneMinMaxDifference BPlaneMinMaxDifference like YPlaneMinMaxDifference(clip [, float threshold = 0, int offset = 0]) +• RPlaneMedian, GPlaneMedian, BPlaneMedian like YPlaneMedian(clip [, int offset = 0]) For float colorspaces the Min, Max, MinMaxDifference and Median functions populate pixel counts for the internal statistics at a 16 bit resolution internally. @@ -1044,12 +1046,12 @@ stackvertical(clip8.ConvertToYUV444().Histogram("levels"), Clip16.ConvertBits(8) [ColorYUV] Now it works for 10-16 bit clips -• Slightly modified "demo" mode when using ColorYUV(showyuv=true) +• Slightly modified "demo" mode when using ColorYUV(showyuv=true) #old: draws YV12 with 16-239 U/V image (448x448) #new: draws YV12 with 16-240 U/V image (450x450) -• New options for "demo" mode when using ColorYUV(showyuv=true) +• New options for "demo" mode when using ColorYUV(showyuv=true) New parameter: bool showyuv_fullrange. Description: Draws YV12 with 0-255 U/V image (512x512) Usage: ColorYUV(showyuv=true, showyuv_fullrange=true) @@ -1062,7 +1064,7 @@ ColorYUV(showyuv=true, bits=10).Info() Luma steps are 16-235-16../0-255-0.. up to 0-65535-0... when bits=16 -• Additional infos for ColorYUV +• Additional infos for ColorYUV - Fixed an uninitialized internal variable regarding pc<->tv conversion, resulting in clips sometimes were expanding to pc range when it wasn't asked. @@ -1079,4 +1081,4 @@ Source filters are automatically detected, specifying MT_SERIALIZED is not neces [Known issues/things] GRunT in MT modes (Avs+ specific) [done: v2502] Overlay blend with fully transparent mask is incorrect, overlaying pixel=0 becomes 1, overlaying pixel=255 becomes 254. 
-[done: v2676-] Float-type clips: chroma should be zero based: +/-0.5 instead of 0..1 \ No newline at end of file +[done: v2676-] Float-type clips: chroma should be zero based: +/-0.5 instead of 0..1 diff --git a/distrib/Readme/readme_history.txt b/distrib/Readme/readme_history.txt index 83b28f12e..71345f933 100644 --- a/distrib/Readme/readme_history.txt +++ b/distrib/Readme/readme_history.txt @@ -7,6 +7,8 @@ For a more logical (non-historical) arrangement of changes see readme.txt 20200302 3.5.0 -------------- - New: Native Linux, macOS, and BSD support. +- Fix: ConvertBits 32->8 for extremely out-of-range float pixel values. + When a pixel value in a 32-bit float video was far out of range (greater than 128, where e.g. 0 to 1.0 is expected for the Y plane), ConvertBits(8) produced artifacts. - Fix potential crash on exit or cache shrink (linux/gcc only?) - Layer: support RGB24 and RGB48 (internally processed as Planar RGB - lossless pre and post conversion) - Fix: RGBP to 444 8-14bit right side artifacts at specific widths