Merge pull request #189 from CVCUDA/feat/milesp/release_cvcuda_0110

feat: adding release v0.11.0 of cvcuda
CVCUDA · Sep 5, 2024 · 84e3dcd · 84e3dcd
2 parents f769fe4 + 8890637
commit 84e3dcd
Show file tree

Hide file tree

Showing 79 changed files with 968 additions and 500 deletions.
diff --git a/3rdparty/pybind11 b/3rdparty/pybind11
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -17,13 +17,13 @@ cmake_minimum_required(VERSION 3.20.1)
 
 # We need to check this variable before starting a CUDA project - otherwise it will appear
 # as set, with the default value pointing to the oldest supported architecture (52 as of CUDA 11.8)
-if(DEFINED CMAKE_CUDA_ARCHITECTURES)
+if(CMAKE_CUDA_ARCHITECTURES)
     set(USE_CMAKE_CUDA_ARCHITECTURES TRUE)
 endif()
 
 project(cvcuda
         LANGUAGES C CXX
-        VERSION 0.10.0
+        VERSION 0.11.0
         DESCRIPTION "CUDA-accelerated Computer Vision algorithms"
 )
 
@@ -49,6 +49,7 @@ endif()
 option(BUILD_TESTS "Enable testsuite" OFF)
 option(BUILD_PYTHON "Build python bindings" OFF)
 option(BUILD_BENCH "Build benchmark" OFF)
+option(BUILD_DOCS "Build documentation" OFF)
 option(ENABLE_SANITIZER "Enabled sanitized build" OFF)
 
 # Configure build tree ======================

diff --git a/README.md b/README.md
@@ -18,7 +18,7 @@
 
 [![License](https://img.shields.io/badge/License-Apache_2.0-yellogreen.svg)](https://opensource.org/licenses/Apache-2.0)
 
-![Version](https://img.shields.io/badge/Version-v0.10.1--beta-blue)
+![Version](https://img.shields.io/badge/Version-v0.11.0--beta-blue)
 
 ![Platform](https://img.shields.io/badge/Platform-linux--64_%7C_win--64_wsl2%7C_aarch64-gray)
 
@@ -60,10 +60,28 @@ To get a local copy up and running follow these steps.
 - [CV-CUDA Samples] require driver r535 or later to run and are only officially supported with CUDA 12.
 - Only one CUDA version (CUDA 11.x or CUDA 12.x) of CV-CUDA packages (Debian packages, tarballs, Python Wheels) can be installed at a time. Please uninstall all packages from a given CUDA version before installing packages from a different version.
 - Documentation built on Ubuntu 20.04 needs an up-to-date version of sphinx (`pip install --upgrade sphinx`) as well as explicitly parsing the system's default python version ` ./ci/build_docs path/to/build -DPYTHON_VERSIONS="<py_ver>"`.
-- Python bindings installed via Debian packages and Python tests fail with Numpy 2.0. We recommend using an older version of Numpy (e.g. 1.26) until we have implemented a fix.
 - The Resize and RandomResizedCrop operators incorrectly interpolate pixel values near the boundary of an image or tensor when using cubic interpolation. This will be fixed in an upcoming release.
-- Cache/resource management introduced in v0.10 add micro-second-level overhead to Python operator calls. Based on the performance analysis of our Python samples, we expect the production- and pipeline-level impact to be negligible. CUDA kernel and C++ call performance is not affected. We aim to investigate and reduce this overhead further in a future release.
-- Sporadic Pybind11-deallocation crashes have been reported in long-lasting multi-threaded Python pipelines with externally allocated memory (eg wrapped Pytorch buffers). We are evaluating an upgrade of Pybind11 (currently using 2.10) as a potential fix in an upcoming release.
+- The CvtColor operator incorrectly computes the data location of the second chromaticity channel for conversions that involve YUV(420) semi-planar formats. This issue persists through the current release and we intend to address this bug in CV-CUDA v0.12. We do not recommend using these formats.
+  - Known affected formats:
+    - NVCV_COLOR_YUV2RGB_I420
+    - NVCV_COLOR_RGB2YUV_I420
+    - NVCV_COLOR_YUV2BGR_I420
+    - NVCV_COLOR_BGR2YUV_I420
+    - NVCV_COLOR_YUV2RGBA_I420
+    - NVCV_COLOR_RGBA2YUV_I420
+    - NVCV_COLOR_YUV2BGRA_I420
+    - NVCV_COLOR_BGRA2YUV_I420
+    - NVCV_COLOR_RGB2YUV_I420
+    - NVCV_COLOR_YUV2RGB_YV12
+    - NVCV_COLOR_RGB2YUV_YV12
+    - NVCV_COLOR_YUV2BGR_YV12
+    - NVCV_COLOR_BGR2YUV_YV12
+    - NVCV_COLOR_YUV2RGBA_YV12
+    - NVCV_COLOR_RGBA2YUV_YV12
+    - NVCV_COLOR_YUV2BGRA_YV12
+    - NVCV_COLOR_BGRA2YUV_YV12
+    - NVCV_COLOR_RGB2YUV_YV12
+    - NVCV_COLOR_YUV2GRAY_420
 
 ### Installation
 
@@ -209,7 +227,6 @@ For instructions on how to build samples from source and run them, see the [Samp
 Install the dependencies required for running the tests:
 - python3, python3-pip: to run python bindings tests
 - torch: dependencies needed by python bindings tests
-- numpy: known limitation: Python tests fail with numpy 2.0. We recommend using an older version (eg 1.26) until we have implemented a fix.
 
 On Ubuntu >= 20.04, install the following packages using `apt` and `pip`:
 ```shell

diff --git a/bench/python/all_ops/op_adaptivethreshold.py b/bench/python/all_ops/op_adaptivethreshold.py
@@ -23,6 +23,7 @@
 
 class OpAdaptiveThreshold(AbstractOpBase):
     def setup(self, input):
+        super().setup(input)
         self.maxval = 255.0
         self.adaptive_method = cvcuda.AdaptiveThresholdType.GAUSSIAN_C
         self.threshold_type = cvcuda.ThresholdType.BINARY

diff --git a/bench/python/all_ops/op_as_image.py b/bench/python/all_ops/op_as_image.py
@@ -23,6 +23,7 @@
 
 class OpAsImageFromNVCVImage(AbstractOpBase):
     def setup(self, input):
+        super().setup(input)
         # dummy run that does not use cache
         img = nvcv.Image((128, 128), nvcv.Format.RGBA8)
 

diff --git a/bench/python/all_ops/op_as_images.py b/bench/python/all_ops/op_as_images.py
@@ -23,6 +23,7 @@
 
 class OpAsImagesFromNVCVImage(AbstractOpBase):
     def setup(self, input):
+        super().setup(input)
         # dummy run that does not use cache
         nvcv.ImageBatchVarShape(100)
         img = nvcv.Image((128, 128), nvcv.Format.RGBA8)

diff --git a/bench/python/all_ops/op_averageblur.py b/bench/python/all_ops/op_averageblur.py
@@ -23,6 +23,7 @@
 
 class OpAverageBlur(AbstractOpBase):
     def setup(self, input):
+        super().setup(input)
         self.kernel_size = (3, 3)
         self.kernel_anchor = (-1, -1)
 

diff --git a/bench/python/all_ops/op_blurbox.py b/bench/python/all_ops/op_blurbox.py
@@ -26,6 +26,7 @@
 
 class OpBlurBox(AbstractOpBase):
     def setup(self, input):
+        super().setup(input)
         self.kernel_size = 5
 
         data = read_image(os.path.join(self.assets_dir, "brooklyn.jpg"))

diff --git a/bench/python/all_ops/op_boundingbox.py b/bench/python/all_ops/op_boundingbox.py
@@ -26,6 +26,7 @@
 
 class OpBoundingBox(AbstractOpBase):
     def setup(self, input):
+        super().setup(input)
         self.border_color = (0, 255, 0, 255)
         self.fill_color = (0, 0, 255, 0)
         self.thickness = 5

diff --git a/bench/python/all_ops/op_brightnesscontrast.py b/bench/python/all_ops/op_brightnesscontrast.py
@@ -24,6 +24,7 @@
 
 class OpBrightnessContrast(AbstractOpBase):
     def setup(self, input):
+        super().setup(input)
         brightness = torch.tensor([1.2]).cuda(self.device_id)
         self.brightness = cvcuda.as_tensor(brightness, "N")
 

diff --git a/bench/python/all_ops/op_cache_limit.py b/bench/python/all_ops/op_cache_limit.py
@@ -0,0 +1,114 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import nvcv
+import torch
+
+# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise
+# things may throw unexpected errors.
+import pycuda.driver as cuda  # noqa: F401
+from bench_utils import AbstractOpBase
+
+# For the following setup depicted in the table, we have to repeatedly call the functions: cudaMalloc and/or
+# cudaFree.
+#
+# ---------------------------------------------------------------------
+# | shape\cache limit  |   small                         large        |
+# |-------------------------------------------------------------------|
+# | non-random         |   cudaMalloc + cudaFree       - (best-case)  |
+# | random             |   cudaMalloc + cudaFree       cudaMalloc     |
+# ---------------------------------------------------------------------
+#
+# Due to this table, we benchmark three scenarios: {non-random, small}, {non-random, large},
+# {random, large}
+
+
+# Base class for cache limit benchmarks, to ensure all three classes have the same overhead, leading to
+# consistent numbers.
+class BaseOpCacheLimit(AbstractOpBase):
+    def setup(self, input, new_cache_limit, low, high):
+        super().setup(input)
+
+        # make this benchmark compatible with older cvcuda/nvcv versions
+        if hasattr(nvcv, "set_cache_limit_inbytes"):
+            nvcv.set_cache_limit_inbytes(new_cache_limit)
+
+        # We don't have access to the outer benchmark iterations (default=10), so we have to create our own
+        # counter.
+        self.max_iter_outer = 10
+        self.iter_outer = 0
+
+        # Number of "random" tensors created per benchmarked run
+        self.n_tensors = 20
+        self.hw = torch.randint(
+            low=low, high=high, size=(self.max_iter_outer, 2, self.n_tensors)
+        )
+
+    def run(self, input):
+        # If we exceed the outer bench iterations, we return.
+        # If we didn't return, we might re-use the cache, which we specifically don't want for
+        # "OpCacheLimitLargeAndRandom".
+        # For the other classes (OpCacheLimitZero, OpCacheLimitLarge), we could continue running the
+        # benchmarks, but then we would not get comparable numbers between all three classes
+        if self.iter_outer >= self.max_iter_outer:
+            return
+
+        for ii in range(self.n_tensors):
+            shape = (
+                self.hw[self.iter_outer, 0, ii].item(),
+                self.hw[self.iter_outer, 1, ii].item(),
+                3,
+            )
+            _ = nvcv.Tensor(shape, nvcv.Type.F32, nvcv.TensorLayout.HWC)
+
+        self.iter_outer += 1
+        return
+
+
+# This is the {non-random, small} case. The smallest we can choose is 0, so we set the cache limit to 0 and
+# effectively disable the cache
+class OpCacheLimitZero(BaseOpCacheLimit):
+    def setup(self, input):
+        # Set the cache limit to 0 for this benchmark
+        # low=1000, high=1001 results in always creating tensors of shape (1000,1000,3)
+        super().setup(input, 0, low=1000, high=1001)
+
+    def run(self, input):
+        super().run(input)
+
+
+# This is the {non-random, large} case. This is the best case scenario, always re-using the cache
+class OpCacheLimitLarge(BaseOpCacheLimit):
+    def setup(self, input):
+        # Set the cache limit to the total gpu memory for this benchmark
+        # low=1000, high=1001 results in always creating tensors of shape (1000,1000,3)
+        total = torch.cuda.mem_get_info()[1]
+        super().setup(input, total, low=1000, high=1001)
+
+    def run(self, input):
+        super().run(input)
+
+
+# This is the {random, large} case. This is the worst case scenario, never re-using the cache
+class OpCacheLimitLargeAndRandom(BaseOpCacheLimit):
+    def setup(self, input):
+        # Set the cache limit to the total gpu memory for this benchmark
+        # low=1000, high=2000 results in always creating tensors of random shape
+        # between [(1000,1000,3), (1999,1999,3)]
+        total = torch.cuda.mem_get_info()[1]
+        super().setup(input, total, low=1000, high=2000)
+
+    def run(self, input):
+        super().run(input)
diff --git a/bench/python/all_ops/op_centercrop.py b/bench/python/all_ops/op_centercrop.py
@@ -23,6 +23,7 @@
 
 class OpCenterCrop(AbstractOpBase):
     def setup(self, input):
+        super().setup(input)
         width, height = input.shape[2], input.shape[1]
         self.crop_size = [width // 2, height // 2]
 

diff --git a/bench/python/all_ops/op_composite.py b/bench/python/all_ops/op_composite.py
@@ -26,6 +26,7 @@
 
 class OpComposite(AbstractOpBase):
     def setup(self, input):
+        super().setup(input)
         data = read_image(os.path.join(self.assets_dir, "brooklyn.jpg"))
         data = data.moveaxis(0, -1).contiguous()  # From CHW to HWC
         data = data.cuda(self.device_id)

diff --git a/bench/python/all_ops/op_convertto.py b/bench/python/all_ops/op_convertto.py
@@ -24,6 +24,7 @@
 
 class OpConvertTo(AbstractOpBase):
     def setup(self, input):
+        super().setup(input)
         self.target_dtype = nvcv.Type.F32
         self.offset = 10.2
         self.scale = 1 / 255.0

diff --git a/bench/python/all_ops/op_copymakeborder.py b/bench/python/all_ops/op_copymakeborder.py
@@ -23,6 +23,7 @@
 
 class OpCopyMakeBorder(AbstractOpBase):
     def setup(self, input):
+        super().setup(input)
         self.border_mode = cvcuda.Border.CONSTANT
         self.border_values = [255, 0, 0]  # Border values for 3 channel RGB input.
         self.top = 30

diff --git a/bench/python/all_ops/op_customcrop.py b/bench/python/all_ops/op_customcrop.py
@@ -24,6 +24,7 @@
 
 class OpCustomCrop(AbstractOpBase):
     def setup(self, input):
+        super().setup(input)
         self.rectI = nvcv.RectI(x=30, y=40, width=420, height=390)
 
     def run(self, input):

diff --git a/bench/python/all_ops/op_cvtcolor.py b/bench/python/all_ops/op_cvtcolor.py
@@ -23,15 +23,15 @@
 
 class OpCvtColorRGB2GRAY(AbstractOpBase):
     def setup(self, input):
-        pass
+        super().setup(input)
 
     def run(self, input):
         return cvcuda.cvtcolor(input, cvcuda.ColorConversion.RGB2GRAY)
 
 
 class OpCvtColorRGB2BGR(AbstractOpBase):
     def setup(self, input):
-        pass
+        super().setup(input)
 
     def run(self, input):
         return cvcuda.cvtcolor(input, cvcuda.ColorConversion.RGB2BGR)
diff --git a/bench/python/all_ops/op_flip.py b/bench/python/all_ops/op_flip.py
@@ -23,6 +23,7 @@
 
 class OpFlipX(AbstractOpBase):
     def setup(self, input):
+        super().setup(input)
         self.flip_code = 0  # means flipping around x axis.
 
     def run(self, input):
@@ -31,6 +32,7 @@ def run(self, input):
 
 class OpFlipY(AbstractOpBase):
     def setup(self, input):
+        super().setup(input)
         self.flip_code = 1  # means flipping around y axis.
 
     def run(self, input):
@@ -39,6 +41,7 @@ def run(self, input):
 
 class OpFlipXY(AbstractOpBase):
     def setup(self, input):
+        super().setup(input)
         self.flip_code = -1  # means flipping around x and y axis.
 
     def run(self, input):

diff --git a/bench/python/all_ops/op_gaussianblur.py b/bench/python/all_ops/op_gaussianblur.py
@@ -23,6 +23,7 @@
 
 class OpGaussianBlur(AbstractOpBase):
     def setup(self, input):
+        super().setup(input)
         self.kernel_size = (3, 3)
         self.sigma = (5, 5)
 

diff --git a/bench/python/all_ops/op_hqresize.py b/bench/python/all_ops/op_hqresize.py
@@ -23,6 +23,7 @@
 
 class OpHqResizeDown(AbstractOpBase):
     def setup(self, input):
+        super().setup(input)
         self.resize_width = 640
         self.resize_height = 420
 
@@ -39,6 +40,7 @@ def run(self, input):
 
 class OpHqResizeUp(AbstractOpBase):
     def setup(self, input):
+        super().setup(input)
         self.resize_width = 1920
         self.resize_height = 1280
 

diff --git a/bench/python/all_ops/op_inpaint.py b/bench/python/all_ops/op_inpaint.py
@@ -26,6 +26,7 @@
 
 class OpInpaint(AbstractOpBase):
     def setup(self, input):
+        super().setup(input)
         data = read_image(os.path.join(self.assets_dir, "brooklyn.jpg"))
         mask = read_image(os.path.join(self.assets_dir, "countour_lines.jpg"))
         # Binarize the mask

diff --git a/bench/python/all_ops/op_jointbilateral.py b/bench/python/all_ops/op_jointbilateral.py
@@ -26,6 +26,7 @@
 
 class OpJointBilateral(AbstractOpBase):
     def setup(self, input):
+        super().setup(input)
         self.diameter = 5
         self.sigma_color = 50
         self.sigma_space = 1

diff --git a/bench/python/all_ops/op_laplacian.py b/bench/python/all_ops/op_laplacian.py
@@ -23,6 +23,7 @@
 
 class OpLaplacian(AbstractOpBase):
     def setup(self, input):
+        super().setup(input)
         self.kernel_size = 3
         self.scale = 2.0