From e17f4f80055009bf45b37e1b6d1790bd1c1a93b2 Mon Sep 17 00:00:00 2001
From: Pinterf <pinterfer@gmail.com>
Date: Mon, 17 Feb 2020 14:21:39 +0100
Subject: [PATCH] Fix: clamping ConvertBits 32->8 for extreme out-of-range
 pixel values

---
 avs_core/convert/convert.cpp      | 11 ++++++-----
 avs_core/convert/convert_avx2.cpp | 13 +++++++------
 distrib/Readme/readme.txt         | 26 ++++++++++++++------------
 distrib/Readme/readme_history.txt |  2 ++
 4 files changed, 29 insertions(+), 23 deletions(-)

diff --git a/avs_core/convert/convert.cpp b/avs_core/convert/convert.cpp
index 8caf41dee..b8b0b11dc 100644
--- a/avs_core/convert/convert.cpp
+++ b/avs_core/convert/convert.cpp
@@ -1592,7 +1592,6 @@ void convert_32_to_uintN_sse41(const BYTE *srcp8, BYTE *dstp8, int src_rowsize,
   int src_width = src_rowsize / sizeof(float);
 
   constexpr int max_pixel_value = (1 << targetbits) - 1;
-  const __m128i max_pixel_value_128 = _mm_set1_epi16(max_pixel_value);
 
   constexpr int limit_lo_d = (fulld ? 0 : 16) << (targetbits - 8);
   constexpr int limit_hi_d = fulld ? ((1 << targetbits) - 1) : ((chroma ? 240 : 235) << (targetbits - 8));
@@ -1616,6 +1615,8 @@ void convert_32_to_uintN_sse41(const BYTE *srcp8, BYTE *dstp8, int src_rowsize,
   const __m128 halfint_plus_rounder_ps = _mm_set1_ps(half_i + 0.5f);
   const __m128 limit_lo_s_ps = _mm_set1_ps(limit_lo_s / 255.0f);
   const __m128 limit_lo_plus_rounder_ps = _mm_set1_ps(limit_lo_d + 0.5f);
+  const __m128 max_dst_pixelvalue = _mm_set1_ps((float)max_pixel_value); // 255, 1023, 4095, 16383, 65535.0
+  const __m128 zero = _mm_setzero_ps();
 
   __m128 factor_ps = _mm_set1_ps(factor); // 0-1.0 -> 0..max_pixel_value
 
@@ -1648,17 +1649,17 @@ void convert_32_to_uintN_sse41(const BYTE *srcp8, BYTE *dstp8, int src_rowsize,
         src_1 = _mm_add_ps(_mm_mul_ps(src_1, factor_ps), limit_lo_plus_rounder_ps);
         //pixel = (srcp0[x] - limit_lo_s_ps) * factor + half + limit_lo + 0.5f;
       }
+
+      src_0 = _mm_max_ps(_mm_min_ps(src_0, max_dst_pixelvalue), zero);
+      src_1 = _mm_max_ps(_mm_min_ps(src_1, max_dst_pixelvalue), zero);
       result_0 = _mm_cvttps_epi32(src_0); // truncate
       result_1 = _mm_cvttps_epi32(src_1);
       if constexpr(sizeof(pixel_t) == 2) {
         result = _mm_packus_epi32(result_0, result_1); // sse41
-        if constexpr(targetbits > 8 && targetbits < 16) {
-          result = _mm_min_epu16(result, max_pixel_value_128); // sse41, extra clamp for 10, 12, 14 bits
-        }
         _mm_store_si128(reinterpret_cast<__m128i *>(dstp + x), result);
       }
       else {
-        result = _mm_packus_epi32(result_0, result_1);
+        result = _mm_packs_epi32(result_0, result_1);
         result = _mm_packus_epi16(result, result); // lo 8 byte
         _mm_storel_epi64(reinterpret_cast<__m128i *>(dstp + x), result);
       }
diff --git a/avs_core/convert/convert_avx2.cpp b/avs_core/convert/convert_avx2.cpp
index 5fbda715a..5ba518cb9 100644
--- a/avs_core/convert/convert_avx2.cpp
+++ b/avs_core/convert/convert_avx2.cpp
@@ -70,7 +70,6 @@ void convert_32_to_uintN_avx2(const BYTE *srcp8, BYTE *dstp8, int src_rowsize, i
   int src_width = src_rowsize / sizeof(float);
 
   const int max_pixel_value = (1 << targetbits) - 1;
-  const __m256i max_pixel_value_256 = _mm256_set1_epi16(max_pixel_value);
 
   const int limit_lo_d = (fulld ? 0 : 16) << (targetbits - 8);
   const int limit_hi_d = fulld ? ((1 << targetbits) - 1) : ((chroma ? 240 : 235) << (targetbits - 8));
@@ -94,6 +93,8 @@ void convert_32_to_uintN_avx2(const BYTE *srcp8, BYTE *dstp8, int src_rowsize, i
   const __m256 halfint_plus_rounder_ps = _mm256_set1_ps(half_i + 0.5f);
   const __m256 limit_lo_s_ps = _mm256_set1_ps(limit_lo_s / 255.0f);
   const __m256 limit_lo_plus_rounder_ps = _mm256_set1_ps(limit_lo_d + 0.5f);
+  const __m256 max_dst_pixelvalue = _mm256_set1_ps((float)max_pixel_value); // 255, 1023, 4095, 16383, 65535.0
+  const __m256 zero = _mm256_setzero_ps();
 
   __m256 factor_ps = _mm256_set1_ps(factor);
 
@@ -126,18 +127,18 @@ void convert_32_to_uintN_avx2(const BYTE *srcp8, BYTE *dstp8, int src_rowsize, i
         src_1 = _mm256_fmadd_ps(src_1, factor_ps, limit_lo_plus_rounder_ps);
         //pixel = (srcp0[x] - limit_lo_s_ps) * factor + half + limit_lo + 0.5f;
       }
+
+      src_0 = _mm256_max_ps(_mm256_min_ps(src_0, max_dst_pixelvalue), zero);
+      src_1 = _mm256_max_ps(_mm256_min_ps(src_1, max_dst_pixelvalue), zero);
       result_0 = _mm256_cvttps_epi32(src_0); // truncate
       result_1 = _mm256_cvttps_epi32(src_1);
       if constexpr(sizeof(pixel_t) == 2) {
         result = _mm256_packus_epi32(result_0, result_1);
         result = _mm256_permute4x64_epi64(result, (0 << 0) | (2 << 2) | (1 << 4) | (3 << 6));
-        if (targetbits > 8 && targetbits < 16) {
-          result = _mm256_min_epu16(result, max_pixel_value_256); // extra clamp for 10, 12, 14 bits
-        }
-          _mm256_store_si256(reinterpret_cast<__m256i *>(dstp + x), result);
+        _mm256_store_si256(reinterpret_cast<__m256i*>(dstp + x), result);
       }
       else {
-        result = _mm256_packus_epi32(result_0, result_1);
+        result = _mm256_packs_epi32(result_0, result_1);
         result = _mm256_permute4x64_epi64(result, (0 << 0) | (2 << 2) | (1 << 4) | (3 << 6));
         __m128i result128_lo = _mm256_castsi256_si128(result);
         __m128i result128_hi = _mm256_extractf128_si256(result, 1);
diff --git a/distrib/Readme/readme.txt b/distrib/Readme/readme.txt
index d8e9c15ed..135f79cbe 100644
--- a/distrib/Readme/readme.txt
+++ b/distrib/Readme/readme.txt
@@ -34,6 +34,8 @@ Short info for plugin writers
 20200302 3.5.0
 --------------
 - New: Native Linux, macOS, and BSD support.
+- Fix: ConvertBits 32->8 for extremely out of range float pixel values.
+  When a pixel value in a 32-bit float video was far out of its nominal range (greater than 128, where e.g. the Y plane normally spans 0 to 1.0), ConvertBits(8) produced artifacts.
 - Fix potential crash on exit or cache shrink (linux/gcc only?)
 - Layer: support RGB24 and RGB48 (internally processed as Planar RGB - lossless pre and post conversion)
 - Fix: RGBP to 444 8-14bit right side artifacts at specific widths
@@ -942,14 +944,14 @@ Conditional runtime functions have 10-16 bit/float support for YUV, PlanarRGB an
 Since RGB is also available as a planar colorspace, the plane statistics functions logically were expanded.
 
 New functions
-• AverageR, AverageG AverageB like AverageLuma
-• RDifference, GDifference, BDifference like LumaDifference(clip1, clip2)
-• RDifferenceFromPrevious, GDifferenceFromPrevious, BDifferenceFromPrevious
-• RDifferenceToNext, GDifferenceToNext, BDifferenceToNext
-• RPlaneMin, GPlaneMin BPlaneMin like YPlaneMin(clip [, float threshold = 0, int offset = 0])
-• RPlaneMax, GPlaneMax BPlaneMax like YPlaneMax(clip [, float threshold = 0, int offset = 0])
-• RPlaneMinMaxDifference, GPlaneMinMaxDifference BPlaneMinMaxDifference like YPlaneMinMaxDifference(clip [, float threshold = 0, int offset = 0])
-• RPlaneMedian, GPlaneMedian, BPlaneMedian like YPlaneMedian(clip [, int offset = 0])
+• AverageR, AverageG, AverageB like AverageLuma
+• RDifference, GDifference, BDifference like LumaDifference(clip1, clip2)
+• RDifferenceFromPrevious, GDifferenceFromPrevious, BDifferenceFromPrevious
+• RDifferenceToNext, GDifferenceToNext, BDifferenceToNext
+• RPlaneMin, GPlaneMin, BPlaneMin like YPlaneMin(clip [, float threshold = 0, int offset = 0])
+• RPlaneMax, GPlaneMax, BPlaneMax like YPlaneMax(clip [, float threshold = 0, int offset = 0])
+• RPlaneMinMaxDifference, GPlaneMinMaxDifference, BPlaneMinMaxDifference like YPlaneMinMaxDifference(clip [, float threshold = 0, int offset = 0])
+• RPlaneMedian, GPlaneMedian, BPlaneMedian like YPlaneMedian(clip [, int offset = 0])
 
 For float colorspaces the Min, Max, MinMaxDifference and Median functions populate pixel counts for the internal statistics at a 16 bit resolution internally. 
 
@@ -1044,12 +1046,12 @@ stackvertical(clip8.ConvertToYUV444().Histogram("levels"), Clip16.ConvertBits(8)
 [ColorYUV] 
 Now it works for 10-16 bit clips
 
-• Slightly modified "demo" mode when using ColorYUV(showyuv=true) 
+• Slightly modified "demo" mode when using ColorYUV(showyuv=true)
 
 #old: draws YV12 with 16-239 U/V image (448x448)
 #new: draws YV12 with 16-240 U/V image (450x450)
  
-• New options for "demo" mode when using ColorYUV(showyuv=true) 
+• New options for "demo" mode when using ColorYUV(showyuv=true)
 New parameter: bool showyuv_fullrange.
 Description: Draws YV12 with 0-255 U/V image (512x512)
 Usage: ColorYUV(showyuv=true, showyuv_fullrange=true)
@@ -1062,7 +1064,7 @@ ColorYUV(showyuv=true, bits=10).Info()
  
 Luma steps are 16-235-16../0-255-0.. up to 0-65535-0... when bits=16
  
-• Additional infos for ColorYUV
+• Additional infos for ColorYUV
 
 - Fixed an uninitialized internal variable regarding pc<->tv conversion, 
   resulting in clips sometimes were expanding to pc range when it wasn't asked.
@@ -1079,4 +1081,4 @@ Source filters are automatically detected, specifying MT_SERIALIZED is not neces
 [Known issues/things]
 GRunT in MT modes (Avs+ specific)
 [done: v2502] Overlay blend with fully transparent mask is incorrect, overlaying pixel=0 becomes 1, overlaying pixel=255 becomes 254.
-[done: v2676-] Float-type clips: chroma should be zero based: +/-0.5 instead of 0..1
\ No newline at end of file
+[done: v2676-] Float-type clips: chroma should be zero based: +/-0.5 instead of 0..1
diff --git a/distrib/Readme/readme_history.txt b/distrib/Readme/readme_history.txt
index 83b28f12e..71345f933 100644
--- a/distrib/Readme/readme_history.txt
+++ b/distrib/Readme/readme_history.txt
@@ -7,6 +7,8 @@ For a more logical (non-historical) arrangement of changes see readme.txt
 20200302 3.5.0
 --------------
 - New: Native Linux, macOS, and BSD support.
+- Fix: ConvertBits 32->8 for extremely out of range float pixel values.
+  When a pixel value in a 32-bit float video was far out of its nominal range (greater than 128, where e.g. the Y plane normally spans 0 to 1.0), ConvertBits(8) produced artifacts.
 - Fix potential crash on exit or cache shrink (linux/gcc only?)
 - Layer: support RGB24 and RGB48 (internally processed as Planar RGB - lossless pre and post conversion)
 - Fix: RGBP to 444 8-14bit right side artifacts at specific widths