From 8890637a1ff3cd290f139b2a3130b47bd83eb10d Mon Sep 17 00:00:00 2001 From: Miles Price Date: Wed, 4 Sep 2024 16:46:46 -0700 Subject: [PATCH] feat: adding release v0.11.0 of cvcuda --- 3rdparty/pybind11 | 2 +- CMakeLists.txt | 5 +- README.md | 27 +- bench/python/all_ops/op_adaptivethreshold.py | 1 + bench/python/all_ops/op_as_image.py | 1 + bench/python/all_ops/op_as_images.py | 1 + bench/python/all_ops/op_averageblur.py | 1 + bench/python/all_ops/op_blurbox.py | 1 + bench/python/all_ops/op_boundingbox.py | 1 + bench/python/all_ops/op_brightnesscontrast.py | 1 + bench/python/all_ops/op_cache_limit.py | 114 +++++ bench/python/all_ops/op_centercrop.py | 1 + bench/python/all_ops/op_composite.py | 1 + bench/python/all_ops/op_convertto.py | 1 + bench/python/all_ops/op_copymakeborder.py | 1 + bench/python/all_ops/op_customcrop.py | 1 + bench/python/all_ops/op_cvtcolor.py | 4 +- bench/python/all_ops/op_flip.py | 3 + bench/python/all_ops/op_gaussianblur.py | 1 + bench/python/all_ops/op_hqresize.py | 2 + bench/python/all_ops/op_inpaint.py | 1 + bench/python/all_ops/op_jointbilateral.py | 1 + bench/python/all_ops/op_laplacian.py | 1 + bench/python/all_ops/op_morphology.py | 4 + bench/python/all_ops/op_nms.py | 1 + bench/python/all_ops/op_normalize.py | 1 + bench/python/all_ops/op_randomresizedcrop.py | 1 + bench/python/all_ops/op_reformat.py | 3 +- bench/python/all_ops/op_remap.py | 1 + bench/python/all_ops/op_reshape.py | 1 + bench/python/all_ops/op_resize.py | 2 + .../op_resize_crop_convert_reformat.py | 1 + bench/python/all_ops/op_rotate.py | 1 + bench/python/all_ops/op_sift.py | 1 + bench/python/all_ops/op_threshold.py | 1 + bench/python/all_ops/op_warpaffine.py | 2 + bench/python/all_ops/op_warpperspective.py | 2 + bench/python/bench_utils.py | 4 +- bench/python/run_bench.py | 7 + cmake/PrintConfig.cmake | 8 +- docker/build20.04/Dockerfile | 5 +- docker/build22.04/Dockerfile | 4 +- docker/config | 1 + docker/devel20.04/Dockerfile | 14 +- docker/devel22.04/Dockerfile | 16 +- docker/env_devel_linux.sh | 2 +- docker/update_build_image.sh | 3 +- docker/update_devel_image.sh | 7 +- docs/CMakeLists.txt | 9 +- docs/sphinx/gen_py_doc_rsts.py | 231 ---------- docs/sphinx/index.rst | 2 +- docs/sphinx/modules/c_algos.rst | 6 +- docs/sphinx/modules/c_modules.rst | 4 +- docs/sphinx/modules/cpp_algos.rst | 6 +- docs/sphinx/modules/cpp_modules.rst | 2 +- docs/sphinx/modules/python_modules.rst | 3 +- .../{python_algos.rst => python_ops.rst} | 7 +- docs/sphinx/modules/python_ops_aux.rst | 25 ++ docs/sphinx/relnotes/v0.10.0-beta.rst | 8 +- docs/sphinx/relnotes/v0.11.0-beta.rst | 84 ++++ python/mod_cvcuda/CvtColorUtil.cpp | 2 +- python/mod_cvcuda/CvtColorUtil.hpp | 2 +- python/mod_nvcv/Cache.cpp | 39 +- python/mod_nvcv/Cache.hpp | 2 +- python/setup.py.in | 2 +- samples/scripts/run_samples.sh | 1 + src/CMakeLists.txt | 3 + .../priv/legacy/CvCudaLegacyHelpers.cpp | 4 +- src/nvcv/CMakeLists.txt | 17 +- src/nvcv/cmake/ConfigBuildTree.cmake | 8 + src/nvcv/src/CMakeLists.txt | 10 +- src/nvcv/src/include/nvcv/Export.h | 6 +- src/nvcv/util/CMakeLists.txt | 4 +- tests/cvcuda/python/test_opmatch.py | 18 +- tests/cvcuda/system/TestOpHQResize.cpp | 416 +++++++++++------- tests/cvcuda/system/TestOpOSD.cpp | 89 ++++ tests/nvcv_types/CMakeLists.txt | 47 +- .../cudatools_unit/TestLegacyHelpers.cpp | 137 +++++- tests/nvcv_types/standalone/CMakeLists.txt | 8 +- 79 files changed, 968 insertions(+), 500 deletions(-) create mode 100644 bench/python/all_ops/op_cache_limit.py delete mode 100644 docs/sphinx/gen_py_doc_rsts.py rename 
docs/sphinx/modules/{python_algos.rst => python_ops.rst} (88%) create mode 100644 docs/sphinx/modules/python_ops_aux.rst create mode 100644 docs/sphinx/relnotes/v0.11.0-beta.rst diff --git a/3rdparty/pybind11 b/3rdparty/pybind11 index 0694ec6a1..941f45bcb 160000 --- a/3rdparty/pybind11 +++ b/3rdparty/pybind11 @@ -1 +1 @@ -Subproject commit 0694ec6a15863bff2e0ea5efe07c78de39b9a33c +Subproject commit 941f45bcb51457884fa1afd6e24a67377d70f75c diff --git a/CMakeLists.txt b/CMakeLists.txt index 94260676e..13378bf1e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,13 +17,13 @@ cmake_minimum_required(VERSION 3.20.1) # We need to check this variable before starting a CUDA project - otherwise it will appear # as set, with the default value pointing to the oldest supported architecture (52 as of CUDA 11.8) -if(DEFINED CMAKE_CUDA_ARCHITECTURES) +if(CMAKE_CUDA_ARCHITECTURES) set(USE_CMAKE_CUDA_ARCHITECTURES TRUE) endif() project(cvcuda LANGUAGES C CXX - VERSION 0.10.0 + VERSION 0.11.0 DESCRIPTION "CUDA-accelerated Computer Vision algorithms" ) @@ -49,6 +49,7 @@ endif() option(BUILD_TESTS "Enable testsuite" OFF) option(BUILD_PYTHON "Build python bindings" OFF) option(BUILD_BENCH "Build benchmark" OFF) +option(BUILD_DOCS "Build documentation" OFF) option(ENABLE_SANITIZER "Enabled sanitized build" OFF) # Configure build tree ====================== diff --git a/README.md b/README.md index 3e232d8bf..07f84fec1 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ [![License](https://img.shields.io/badge/License-Apache_2.0-yellogreen.svg)](https://opensource.org/licenses/Apache-2.0) -![Version](https://img.shields.io/badge/Version-v0.10.1--beta-blue) +![Version](https://img.shields.io/badge/Version-v0.11.0--beta-blue) ![Platform](https://img.shields.io/badge/Platform-linux--64_%7C_win--64_wsl2%7C_aarch64-gray) @@ -60,10 +60,28 @@ To get a local copy up and running follow these steps. - [CV-CUDA Samples] require driver r535 or later to run and are only officially supported with CUDA 12. - Only one CUDA version (CUDA 11.x or CUDA 12.x) of CV-CUDA packages (Debian packages, tarballs, Python Wheels) can be installed at a time. Please uninstall all packages from a given CUDA version before installing packages from a different version. - Documentation built on Ubuntu 20.04 needs an up-to-date version of sphinx (`pip install --upgrade sphinx`) as well as explicitly parsing the system's default python version ` ./ci/build_docs path/to/build -DPYTHON_VERSIONS=""`. -- Python bindings installed via Debian packages and Python tests fail with Numpy 2.0. We recommend using an older version of Numpy (e.g. 1.26) until we have implemented a fix. - The Resize and RandomResizedCrop operators incorrectly interpolate pixel values near the boundary of an image or tensor when using cubic interpolation. This will be fixed in an upcoming release. -- Cache/resource management introduced in v0.10 add micro-second-level overhead to Python operator calls. Based on the performance analysis of our Python samples, we expect the production- and pipeline-level impact to be negligible. CUDA kernel and C++ call performance is not affected. We aim to investigate and reduce this overhead further in a future release.​ -- Sporadic Pybind11-deallocation crashes have been reported in long-lasting multi-threaded Python pipelines with externally allocated memory (eg wrapped Pytorch buffers). We are evaluating an upgrade of Pybind11 (currently using 2.10) as a potential fix in an upcoming release. 
+- The CvtColor operator incorrectly computes the data location of the second chromaticity channel for conversions that involve YUV(420) semi-planar formats. This issue persists through the current release and we intend to address this bug in CV-CUDA v0.12. We do not recommend using these formats.​ + - Known affected formats:​ + - NVCV_COLOR_YUV2RGB_I420​ + - NVCV_COLOR_RGB2YUV_I420​ + - NVCV_COLOR_YUV2BGR_I420​ + - NVCV_COLOR_BGR2YUV_I420​ + - NVCV_COLOR_YUV2RGBA_I420​ + - NVCV_COLOR_RGBA2YUV_I420​ + - NVCV_COLOR_YUV2BGRA_I420​ + - NVCV_COLOR_BGRA2YUV_I420​ + - NVCV_COLOR_RGB2YUV_I420​ + - NVCV_COLOR_YUV2RGB_YV12​ + - NVCV_COLOR_RGB2YUV_YV12​ + - NVCV_COLOR_YUV2BGR_YV12​ + - NVCV_COLOR_BGR2YUV_YV12​ + - NVCV_COLOR_YUV2RGBA_YV12​ + - NVCV_COLOR_RGBA2YUV_YV12​ + - NVCV_COLOR_YUV2BGRA_YV12​ + - NVCV_COLOR_BGRA2YUV_YV12​ + - NVCV_COLOR_RGB2YUV_YV12​ + - NVCV_COLOR_YUV2GRAY_420​ ### Installation @@ -209,7 +227,6 @@ For instructions on how to build samples from source and run them, see the [Samp Install the dependencies required for running the tests: - python3, python3-pip: to run python bindings tests - torch: dependencies needed by python bindings tests -- numpy: known limitation: Python tests fail with numpy 2.0. We recommend using an older version (eg 1.26) until we have implemented a fix. On Ubuntu >= 20.04, install the following packages using `apt` and `pip`: ```shell diff --git a/bench/python/all_ops/op_adaptivethreshold.py b/bench/python/all_ops/op_adaptivethreshold.py index 1d7b09fbc..a38728382 100644 --- a/bench/python/all_ops/op_adaptivethreshold.py +++ b/bench/python/all_ops/op_adaptivethreshold.py @@ -23,6 +23,7 @@ class OpAdaptiveThreshold(AbstractOpBase): def setup(self, input): + super().setup(input) self.maxval = 255.0 self.adaptive_method = cvcuda.AdaptiveThresholdType.GAUSSIAN_C self.threshold_type = cvcuda.ThresholdType.BINARY diff --git a/bench/python/all_ops/op_as_image.py b/bench/python/all_ops/op_as_image.py index bae98afa5..993ed1841 100644 --- a/bench/python/all_ops/op_as_image.py +++ b/bench/python/all_ops/op_as_image.py @@ -23,6 +23,7 @@ class OpAsImageFromNVCVImage(AbstractOpBase): def setup(self, input): + super().setup(input) # dummy run that does not use cache img = nvcv.Image((128, 128), nvcv.Format.RGBA8) diff --git a/bench/python/all_ops/op_as_images.py b/bench/python/all_ops/op_as_images.py index d5022cf23..ed5132a08 100644 --- a/bench/python/all_ops/op_as_images.py +++ b/bench/python/all_ops/op_as_images.py @@ -23,6 +23,7 @@ class OpAsImagesFromNVCVImage(AbstractOpBase): def setup(self, input): + super().setup(input) # dummy run that does not use cache nvcv.ImageBatchVarShape(100) img = nvcv.Image((128, 128), nvcv.Format.RGBA8) diff --git a/bench/python/all_ops/op_averageblur.py b/bench/python/all_ops/op_averageblur.py index e6dee83eb..444a65b64 100644 --- a/bench/python/all_ops/op_averageblur.py +++ b/bench/python/all_ops/op_averageblur.py @@ -23,6 +23,7 @@ class OpAverageBlur(AbstractOpBase): def setup(self, input): + super().setup(input) self.kernel_size = (3, 3) self.kernel_anchor = (-1, -1) diff --git a/bench/python/all_ops/op_blurbox.py b/bench/python/all_ops/op_blurbox.py index ea37e00ba..ba32fccfe 100644 --- a/bench/python/all_ops/op_blurbox.py +++ b/bench/python/all_ops/op_blurbox.py @@ -26,6 +26,7 @@ class OpBlurBox(AbstractOpBase): def setup(self, input): + super().setup(input) self.kernel_size = 5 data = read_image(os.path.join(self.assets_dir, "brooklyn.jpg")) diff --git a/bench/python/all_ops/op_boundingbox.py 
b/bench/python/all_ops/op_boundingbox.py index cc08e9703..4faedfe37 100644 --- a/bench/python/all_ops/op_boundingbox.py +++ b/bench/python/all_ops/op_boundingbox.py @@ -26,6 +26,7 @@ class OpBoundingBox(AbstractOpBase): def setup(self, input): + super().setup(input) self.border_color = (0, 255, 0, 255) self.fill_color = (0, 0, 255, 0) self.thickness = 5 diff --git a/bench/python/all_ops/op_brightnesscontrast.py b/bench/python/all_ops/op_brightnesscontrast.py index 58d9cb833..efe825e0e 100644 --- a/bench/python/all_ops/op_brightnesscontrast.py +++ b/bench/python/all_ops/op_brightnesscontrast.py @@ -24,6 +24,7 @@ class OpBrightnessContrast(AbstractOpBase): def setup(self, input): + super().setup(input) brightness = torch.tensor([1.2]).cuda(self.device_id) self.brightness = cvcuda.as_tensor(brightness, "N") diff --git a/bench/python/all_ops/op_cache_limit.py b/bench/python/all_ops/op_cache_limit.py new file mode 100644 index 000000000..6c49c7add --- /dev/null +++ b/bench/python/all_ops/op_cache_limit.py @@ -0,0 +1,114 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import nvcv +import torch + +# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise +# things may throw unexpected errors. +import pycuda.driver as cuda # noqa: F401 +from bench_utils import AbstractOpBase + +# For the following setup depicted in the table, we have to repeatedly call the functions: cudaMalloc and/or +# cudaFree. +# +# --------------------------------------------------------------------- +# | shape\cache limit | small large | +# |-------------------------------------------------------------------| +# | non-random | cudaMalloc + cudaFree - (best-case) | +# | random | cudaMalloc + cudaFree cudaMalloc | +# --------------------------------------------------------------------- +# +# Due to the this table, we benchmark three scenarios: {non-random, small}, {non-random, large}, +# {random, large} + + +# Base class for cache limit benchmarks, to ensure all three classes have the same overhead, leading to +# consistent numbers. +class BaseOpCacheLimit(AbstractOpBase): + def setup(self, input, new_cache_limit, low, high): + super().setup(input) + + # make this benchmark compatible with older cvcuda/nvncv versions + if hasattr(nvcv, "set_cache_limit_inbytes"): + nvcv.set_cache_limit_inbytes(new_cache_limit) + + # We don't have access to the outer benchmark iterations (default=10), so we have to create our own + # counter. + self.max_iter_outer = 10 + self.iter_outer = 0 + + # Number of "random" tensors created per benchmarked run + self.n_tensors = 20 + self.hw = torch.randint( + low=low, high=high, size=(self.max_iter_outer, 2, self.n_tensors) + ) + + def run(self, input): + # If we exceed the outer bench iterations, we return. + # If we didn't return, we might re-use the cache, which we specifically don't want for + # "OpCacheLimitLargeAndRandom". 
+ # For the other classes (OpCacheLimitZero, OpCacheLimitLarge), we could continue running the + # benchmarks, but then we would not get comparable numbers between all three classes + if self.iter_outer >= self.max_iter_outer: + return + + for ii in range(self.n_tensors): + shape = ( + self.hw[self.iter_outer, 0, ii].item(), + self.hw[self.iter_outer, 1, ii].item(), + 3, + ) + _ = nvcv.Tensor(shape, nvcv.Type.F32, nvcv.TensorLayout.HWC) + + self.iter_outer += 1 + return + + +# This is the {non-random, small} case. The smallest we can choose is 0, so we set the cache limit to 0 and +# effectively disable the cache +class OpCacheLimitZero(BaseOpCacheLimit): + def setup(self, input): + # Set the cache limit to 0 for this benchmark + # low=1000, high=1001 results in always creating tensor's of shape (1000,1000,3) + super().setup(input, 0, low=1000, high=1001) + + def run(self, input): + super().run(input) + + +# This is the {non-random, large} case. This is the best case scenario, always re-using the cache +class OpCacheLimitLarge(BaseOpCacheLimit): + def setup(self, input): + # Set the cache limit to the total gpu memory for this benchmark + # low=1000, high=1001 results in always creating tensor's of shape (1000,1000,3) + total = torch.cuda.mem_get_info()[1] + super().setup(input, total, low=1000, high=1001) + + def run(self, input): + super().run(input) + + +# This is the {random, large} case. This is the worst case scenario, never re-using the cache +class OpCacheLimitLargeAndRandom(BaseOpCacheLimit): + def setup(self, input): + # Set the cache limit to the total gpu memory for this benchmark + # low=1000, high=2000 results in always creating tensor's of random shape + # between [(1000,1000,3), (1999,1999,3)] + total = torch.cuda.mem_get_info()[1] + super().setup(input, total, low=1000, high=2000) + + def run(self, input): + super().run(input) diff --git a/bench/python/all_ops/op_centercrop.py b/bench/python/all_ops/op_centercrop.py index e70e964fc..9de2cb38f 100644 --- a/bench/python/all_ops/op_centercrop.py +++ b/bench/python/all_ops/op_centercrop.py @@ -23,6 +23,7 @@ class OpCenterCrop(AbstractOpBase): def setup(self, input): + super().setup(input) width, height = input.shape[2], input.shape[1] self.crop_size = [width // 2, height // 2] diff --git a/bench/python/all_ops/op_composite.py b/bench/python/all_ops/op_composite.py index 2ef027469..dc306c18e 100644 --- a/bench/python/all_ops/op_composite.py +++ b/bench/python/all_ops/op_composite.py @@ -26,6 +26,7 @@ class OpComposite(AbstractOpBase): def setup(self, input): + super().setup(input) data = read_image(os.path.join(self.assets_dir, "brooklyn.jpg")) data = data.moveaxis(0, -1).contiguous() # From CHW to HWC data = data.cuda(self.device_id) diff --git a/bench/python/all_ops/op_convertto.py b/bench/python/all_ops/op_convertto.py index 443db43e9..89bce7a21 100644 --- a/bench/python/all_ops/op_convertto.py +++ b/bench/python/all_ops/op_convertto.py @@ -24,6 +24,7 @@ class OpConvertTo(AbstractOpBase): def setup(self, input): + super().setup(input) self.target_dtype = nvcv.Type.F32 self.offset = 10.2 self.scale = 1 / 255.0 diff --git a/bench/python/all_ops/op_copymakeborder.py b/bench/python/all_ops/op_copymakeborder.py index 39c79d5e8..b8fac51f9 100644 --- a/bench/python/all_ops/op_copymakeborder.py +++ b/bench/python/all_ops/op_copymakeborder.py @@ -23,6 +23,7 @@ class OpCopyMakeBorder(AbstractOpBase): def setup(self, input): + super().setup(input) self.border_mode = cvcuda.Border.CONSTANT self.border_values = [255, 0, 0] # Border 
values for 3 channel RGB input. self.top = 30 diff --git a/bench/python/all_ops/op_customcrop.py b/bench/python/all_ops/op_customcrop.py index 4cb31c2fd..91f2133d6 100644 --- a/bench/python/all_ops/op_customcrop.py +++ b/bench/python/all_ops/op_customcrop.py @@ -24,6 +24,7 @@ class OpCustomCrop(AbstractOpBase): def setup(self, input): + super().setup(input) self.rectI = nvcv.RectI(x=30, y=40, width=420, height=390) def run(self, input): diff --git a/bench/python/all_ops/op_cvtcolor.py b/bench/python/all_ops/op_cvtcolor.py index 35a7322fe..a48a5a2c2 100644 --- a/bench/python/all_ops/op_cvtcolor.py +++ b/bench/python/all_ops/op_cvtcolor.py @@ -23,7 +23,7 @@ class OpCvtColorRGB2GRAY(AbstractOpBase): def setup(self, input): - pass + super().setup(input) def run(self, input): return cvcuda.cvtcolor(input, cvcuda.ColorConversion.RGB2GRAY) @@ -31,7 +31,7 @@ def run(self, input): class OpCvtColorRGB2BGR(AbstractOpBase): def setup(self, input): - pass + super().setup(input) def run(self, input): return cvcuda.cvtcolor(input, cvcuda.ColorConversion.RGB2BGR) diff --git a/bench/python/all_ops/op_flip.py b/bench/python/all_ops/op_flip.py index 0e0a1eb51..9f47c1b01 100644 --- a/bench/python/all_ops/op_flip.py +++ b/bench/python/all_ops/op_flip.py @@ -23,6 +23,7 @@ class OpFlipX(AbstractOpBase): def setup(self, input): + super().setup(input) self.flip_code = 0 # means flipping around x axis. def run(self, input): @@ -31,6 +32,7 @@ def run(self, input): class OpFlipY(AbstractOpBase): def setup(self, input): + super().setup(input) self.flip_code = 1 # means flipping around y axis. def run(self, input): @@ -39,6 +41,7 @@ def run(self, input): class OpFlipXY(AbstractOpBase): def setup(self, input): + super().setup(input) self.flip_code = -1 # means flipping around x and y axis. 
def run(self, input): diff --git a/bench/python/all_ops/op_gaussianblur.py b/bench/python/all_ops/op_gaussianblur.py index 45f57eda3..918caf4b0 100644 --- a/bench/python/all_ops/op_gaussianblur.py +++ b/bench/python/all_ops/op_gaussianblur.py @@ -23,6 +23,7 @@ class OpGaussianBlur(AbstractOpBase): def setup(self, input): + super().setup(input) self.kernel_size = (3, 3) self.sigma = (5, 5) diff --git a/bench/python/all_ops/op_hqresize.py b/bench/python/all_ops/op_hqresize.py index 765256873..0296eecad 100644 --- a/bench/python/all_ops/op_hqresize.py +++ b/bench/python/all_ops/op_hqresize.py @@ -23,6 +23,7 @@ class OpHqResizeDown(AbstractOpBase): def setup(self, input): + super().setup(input) self.resize_width = 640 self.resize_height = 420 @@ -39,6 +40,7 @@ def run(self, input): class OpHqResizeUp(AbstractOpBase): def setup(self, input): + super().setup(input) self.resize_width = 1920 self.resize_height = 1280 diff --git a/bench/python/all_ops/op_inpaint.py b/bench/python/all_ops/op_inpaint.py index 6fa8af553..72da50131 100644 --- a/bench/python/all_ops/op_inpaint.py +++ b/bench/python/all_ops/op_inpaint.py @@ -26,6 +26,7 @@ class OpInpaint(AbstractOpBase): def setup(self, input): + super().setup(input) data = read_image(os.path.join(self.assets_dir, "brooklyn.jpg")) mask = read_image(os.path.join(self.assets_dir, "countour_lines.jpg")) # Binarize the mask diff --git a/bench/python/all_ops/op_jointbilateral.py b/bench/python/all_ops/op_jointbilateral.py index 9b9e694e3..a2a2b74f8 100644 --- a/bench/python/all_ops/op_jointbilateral.py +++ b/bench/python/all_ops/op_jointbilateral.py @@ -26,6 +26,7 @@ class OpJointBilateral(AbstractOpBase): def setup(self, input): + super().setup(input) self.diameter = 5 self.sigma_color = 50 self.sigma_space = 1 diff --git a/bench/python/all_ops/op_laplacian.py b/bench/python/all_ops/op_laplacian.py index 829268aed..440910629 100644 --- a/bench/python/all_ops/op_laplacian.py +++ b/bench/python/all_ops/op_laplacian.py @@ -23,6 +23,7 @@ class OpLaplacian(AbstractOpBase): def setup(self, input): + super().setup(input) self.kernel_size = 3 self.scale = 2.0 diff --git a/bench/python/all_ops/op_morphology.py b/bench/python/all_ops/op_morphology.py index 10744b3d1..674ea549d 100644 --- a/bench/python/all_ops/op_morphology.py +++ b/bench/python/all_ops/op_morphology.py @@ -64,6 +64,7 @@ def __call__(self): class OpMorphologyOpen(AbstractOpBase): def setup(self, input): + super().setup(input) self.MorphologyBase = MorphologyBase( self.device_id, input, cvcuda.MorphologyType.OPEN ) @@ -74,6 +75,7 @@ def run(self, input): class OpMorphologyClose(AbstractOpBase): def setup(self, input): + super().setup(input) self.MorphologyBase = MorphologyBase( self.device_id, input, cvcuda.MorphologyType.CLOSE ) @@ -84,6 +86,7 @@ def run(self, input): class OpMorphologyDilate(AbstractOpBase): def setup(self, input): + super().setup(input) self.MorphologyBase = MorphologyBase( self.device_id, input, cvcuda.MorphologyType.DILATE ) @@ -94,6 +97,7 @@ def run(self, input): class OpMorphologyErode(AbstractOpBase): def setup(self, input): + super().setup(input) self.MorphologyBase = MorphologyBase( self.device_id, input, cvcuda.MorphologyType.ERODE ) diff --git a/bench/python/all_ops/op_nms.py b/bench/python/all_ops/op_nms.py index c73def6c5..9b72df5b4 100644 --- a/bench/python/all_ops/op_nms.py +++ b/bench/python/all_ops/op_nms.py @@ -25,6 +25,7 @@ class OpNMS(AbstractOpBase): def setup(self, input): + super().setup(input) bboxes = torch.load( os.path.join(self.assets_dir, 
"brooklyn_bboxes.pt"), map_location="cuda:%d" % self.device_id, diff --git a/bench/python/all_ops/op_normalize.py b/bench/python/all_ops/op_normalize.py index fa7b3eddf..7ebed99e9 100644 --- a/bench/python/all_ops/op_normalize.py +++ b/bench/python/all_ops/op_normalize.py @@ -24,6 +24,7 @@ class OpNormalize(AbstractOpBase): def setup(self, input): + super().setup(input) mean_tensor = ( torch.Tensor([0.485, 0.456, 0.406]).reshape(1, 1, 1, 3).cuda(self.device_id) ) diff --git a/bench/python/all_ops/op_randomresizedcrop.py b/bench/python/all_ops/op_randomresizedcrop.py index 7da248a42..8ea99d734 100644 --- a/bench/python/all_ops/op_randomresizedcrop.py +++ b/bench/python/all_ops/op_randomresizedcrop.py @@ -23,6 +23,7 @@ class OpRandomResizedCrop(AbstractOpBase): def setup(self, input): + super().setup(input) self.resized_shape = (input.shape[0], 320, 580, 3) self.min_scale = 0.08 self.max_scale = 1.0 diff --git a/bench/python/all_ops/op_reformat.py b/bench/python/all_ops/op_reformat.py index 016b4acb7..c1c9bfa20 100644 --- a/bench/python/all_ops/op_reformat.py +++ b/bench/python/all_ops/op_reformat.py @@ -23,6 +23,7 @@ class OpReformatNCHWToNHWC(AbstractOpBase): def setup(self, input): + super().setup(input) self.input_nchw = cvcuda.reformat(input, "NCHW") def run(self, input): @@ -34,7 +35,7 @@ def visualize(self): class OpReformatNHWCToNCHW(AbstractOpBase): def setup(self, input): - pass + super().setup(input) def run(self, input): return cvcuda.reformat(input, "NCHW") diff --git a/bench/python/all_ops/op_remap.py b/bench/python/all_ops/op_remap.py index 9e6b94d29..15fc585e1 100644 --- a/bench/python/all_ops/op_remap.py +++ b/bench/python/all_ops/op_remap.py @@ -25,6 +25,7 @@ class OpRemap(AbstractOpBase): def setup(self, input): + super().setup(input) batch_size, width, height = input.shape[0], input.shape[2], input.shape[1] batch_map = np.stack([self.flipH(w=width, h=height) for _ in range(batch_size)]) batch_map = torch.as_tensor(batch_map, device="cuda") diff --git a/bench/python/all_ops/op_reshape.py b/bench/python/all_ops/op_reshape.py index 438aa1af6..d4d01e4c1 100644 --- a/bench/python/all_ops/op_reshape.py +++ b/bench/python/all_ops/op_reshape.py @@ -23,6 +23,7 @@ class OpReshape(AbstractOpBase): def setup(self, input): + super().setup(input) self.shape = input.shape[::-1] # Reverse everything out def run(self, input): diff --git a/bench/python/all_ops/op_resize.py b/bench/python/all_ops/op_resize.py index b60abe92f..08046da53 100644 --- a/bench/python/all_ops/op_resize.py +++ b/bench/python/all_ops/op_resize.py @@ -23,6 +23,7 @@ class OpResizeDown(AbstractOpBase): def setup(self, input): + super().setup(input) self.resize_width = 640 self.resize_height = 420 @@ -41,6 +42,7 @@ def run(self, input): class OpResizeUp(AbstractOpBase): def setup(self, input): + super().setup(input) self.resize_width = 1920 self.resize_height = 1280 diff --git a/bench/python/all_ops/op_resize_crop_convert_reformat.py b/bench/python/all_ops/op_resize_crop_convert_reformat.py index 32dccfbc7..52d0083ec 100644 --- a/bench/python/all_ops/op_resize_crop_convert_reformat.py +++ b/bench/python/all_ops/op_resize_crop_convert_reformat.py @@ -24,6 +24,7 @@ class OpResizeCropConvertReformat(AbstractOpBase): def setup(self, input): + super().setup(input) resize = 256 crop = 224 delta_shape = resize - crop diff --git a/bench/python/all_ops/op_rotate.py b/bench/python/all_ops/op_rotate.py index 9681f67e1..f542c0da9 100644 --- a/bench/python/all_ops/op_rotate.py +++ b/bench/python/all_ops/op_rotate.py @@ -23,6 +23,7 
@@ class OpRotate(AbstractOpBase): def setup(self, input): + super().setup(input) self.angle_deg = 40 self.shift = [input.shape[2] // 4, input.shape[1] // 4] self.interpolation_type = cvcuda.Interp.LINEAR diff --git a/bench/python/all_ops/op_sift.py b/bench/python/all_ops/op_sift.py index 724cdd2af..98e61ecae 100644 --- a/bench/python/all_ops/op_sift.py +++ b/bench/python/all_ops/op_sift.py @@ -23,6 +23,7 @@ class OpSIFT(AbstractOpBase): def setup(self, input): + super().setup(input) self.max_features = 100 self.num_octave_layers = 3 self.contrast_threshold = 0.04 diff --git a/bench/python/all_ops/op_threshold.py b/bench/python/all_ops/op_threshold.py index 70208114b..300f9c9f9 100644 --- a/bench/python/all_ops/op_threshold.py +++ b/bench/python/all_ops/op_threshold.py @@ -24,6 +24,7 @@ class OpThreshold(AbstractOpBase): def setup(self, input): + super().setup(input) threshold = torch.tensor([150.0] * input.shape[0]) threshold = threshold.type(torch.float64) threshold = threshold.cuda(self.device_id) diff --git a/bench/python/all_ops/op_warpaffine.py b/bench/python/all_ops/op_warpaffine.py index 5130e3953..a983c4aa3 100644 --- a/bench/python/all_ops/op_warpaffine.py +++ b/bench/python/all_ops/op_warpaffine.py @@ -24,6 +24,7 @@ class OpWarpAffine(AbstractOpBase): def setup(self, input): + super().setup(input) self.xform = np.array( [[1.26666667, 0.6, -83.33333333], [-0.33333333, 1.0, 66.66666667]] ) @@ -43,6 +44,7 @@ def run(self, input): class OpWarpAffineInverse(AbstractOpBase): def setup(self, input): + super().setup(input) self.xform = np.array( [[1.26666667, 0.6, -83.33333333], [-0.33333333, 1.0, 66.66666667]] ) diff --git a/bench/python/all_ops/op_warpperspective.py b/bench/python/all_ops/op_warpperspective.py index af3bced1f..528b82df6 100644 --- a/bench/python/all_ops/op_warpperspective.py +++ b/bench/python/all_ops/op_warpperspective.py @@ -24,6 +24,7 @@ class OpWarpPerspective(AbstractOpBase): def setup(self, input): + super().setup(input) self.xform = np.array( [ [3.46153846e-01, 3.33031674e-01, 1.28000000e02], @@ -48,6 +49,7 @@ def run(self, input): class OpWarpPerspectiveInverse(AbstractOpBase): def setup(self, input): + super().setup(input) self.xform = np.array( [ [3.46153846e-01, 3.33031674e-01, 1.28000000e02], diff --git a/bench/python/bench_utils.py b/bench/python/bench_utils.py index 36874890b..c0b194544 100644 --- a/bench/python/bench_utils.py +++ b/bench/python/bench_utils.py @@ -30,6 +30,7 @@ import numpy as np import json import pandas +import nvcv logger = logging.getLogger(__name__) logging.basicConfig( @@ -96,7 +97,8 @@ def setup(self, input): Performs various setup activities to set this operator before it can be run. :param input: The input tensor to run the operator on. 
""" - pass + if hasattr(nvcv, "clear_cache"): + nvcv.clear_cache() @abstractmethod def run(self, input): diff --git a/bench/python/run_bench.py b/bench/python/run_bench.py index 9bab6c1e7..2ca39326d 100644 --- a/bench/python/run_bench.py +++ b/bench/python/run_bench.py @@ -20,6 +20,7 @@ import sys import logging import cvcuda +import nvcv import torch from pathlib import Path @@ -134,6 +135,7 @@ def run_bench( output_dir=output_dir, should_visualize=should_visualize, ) + torch.cuda.current_stream().synchronize() cvcuda_perf.pop_range() # For init_op except Exception as e: logger.error( @@ -164,6 +166,11 @@ def run_bench( break cvcuda_perf.pop_range(delete_range=not success) # For the run_op + # reset the cache limit to not affect other operator benchmarks, in case a benchmark test + # changed it + if hasattr(nvcv, "set_cache_limit_inbytes"): + total = torch.cuda.mem_get_info()[1] + nvcv.set_cache_limit_inbytes(total // 2) # Step 3: log the parameters used by the operator, initialized during the setup call. if success: diff --git a/cmake/PrintConfig.cmake b/cmake/PrintConfig.cmake index 4c58c7e89..728dfdaa3 100644 --- a/cmake/PrintConfig.cmake +++ b/cmake/PrintConfig.cmake @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -55,6 +55,12 @@ else() message(STATUS " BUILD_BENCH : off") endif() +if(BUILD_DOCS) + message(STATUS " BUILD_DOCS : ON") +else() + message(STATUS " BUILD_DOCS : off") +endif() + if(ENABLE_TEGRA) message(STATUS " ENABLE_TEGRA : ON") else() diff --git a/docker/build20.04/Dockerfile b/docker/build20.04/Dockerfile index d99223968..8db65d0d1 100644 --- a/docker/build20.04/Dockerfile +++ b/docker/build20.04/Dockerfile @@ -17,6 +17,7 @@ ARG VER_CUDA=? ARG VER_UBUNTU=? +ARG VER_NUMPY=? FROM nvidia/cuda:$VER_CUDA-devel-ubuntu$VER_UBUNTU @@ -63,7 +64,7 @@ RUN python3 -m pip install pre-commit RUN python3 -m pip install sphinx-rtd-theme sphinx==4.5.0 RUN python3 -m pip install breathe exhale recommonmark graphviz # Needed for python sphinx docs and Python wheels -RUN python3 -m pip install numpy==1.24.1 patchelf==0.17.2.1 +RUN python3 -m pip install numpy patchelf==0.17.2.1 # Python bindings ====================================== @@ -71,7 +72,7 @@ RUN python3 -m pip install numpy==1.24.1 patchelf==0.17.2.1 ADD deadsnakes-ubuntu-ppa-focal.list /etc/apt/sources.list.d RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys BA6932366A755776 -RUN for PYTHON_VERSION in 3.7 3.8 3.9 3.10 3.11; do \ +RUN for PYTHON_VERSION in 3.8 3.9 3.10 3.11; do \ apt-get update \ && apt-get install -y --no-install-recommends \ python$PYTHON_VERSION-dev python$PYTHON_VERSION-distutils; \ diff --git a/docker/build22.04/Dockerfile b/docker/build22.04/Dockerfile index 216860511..84cfc2453 100644 --- a/docker/build22.04/Dockerfile +++ b/docker/build22.04/Dockerfile @@ -17,6 +17,7 @@ ARG VER_CUDA=? ARG VER_UBUNTU=? +ARG VER_NUMPY=? 
FROM nvidia/cuda:$VER_CUDA-devel-ubuntu$VER_UBUNTU @@ -59,7 +60,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ RUN python3 -m pip install sphinx-rtd-theme sphinx==4.5.0 RUN python3 -m pip install breathe exhale recommonmark graphviz # Needed for python sphinx docs and Python wheels -RUN python3 -m pip install numpy==1.24.1 patchelf==0.17.2.1 +RUN python3 -m pip install numpy patchelf==0.17.2.1 # Python bindings ====================================== @@ -72,7 +73,6 @@ RUN ln -sf /usr/share/zoneinfo/US/Pacific /etc/localtime RUN apt-get update \ && apt-get install -y --no-install-recommends \ - python3.7-dev python3.7-distutils \ python3.8-dev python3.8-distutils \ python3.9-dev python3.9-distutils \ python3.10-dev python3.10-distutils \ diff --git a/docker/config b/docker/config index d597b477b..85e3a6f9e 100644 --- a/docker/config +++ b/docker/config @@ -29,3 +29,4 @@ TAG_IMAGE_TEST=5 VER_CUDA=11.7.1 VER_UBUNTU=20.04 VER_TRT=24.01 +VER_NUMPY=1.24.1 diff --git a/docker/devel20.04/Dockerfile b/docker/devel20.04/Dockerfile index d71475ded..0865b5014 100644 --- a/docker/devel20.04/Dockerfile +++ b/docker/devel20.04/Dockerfile @@ -17,6 +17,7 @@ ARG BASE_IMAGE=? ARG TAG_IMAGE=? +ARG VER_NUMPY=? FROM $BASE_IMAGE:$TAG_IMAGE @@ -50,14 +51,19 @@ RUN apt-get update \ && rm -rf /var/lib/apt/lists/* # needed by tests -RUN for PYTHON_VERSION in 3.7 3.8 3.9 3.10 3.11; do \ +RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.8 && \ + python3.8 -m pip install --upgrade pip && \ + python3.8 -m pip install --upgrade \ + pytest torch==2.4.0 numpy typing-extensions && \ + rm -rf /root/.cache/pip; + +# Debug: Print the VER_NUMPY value to ensure it is passed correctly +RUN for PYTHON_VERSION in 3.9 3.10 3.11; do \ curl -sS https://bootstrap.pypa.io/get-pip.py | python$PYTHON_VERSION && \ python$PYTHON_VERSION -m pip install --upgrade pip && \ python$PYTHON_VERSION -m pip install --upgrade \ - pytest torch==1.13.0 numpy typing-extensions && \ + pytest torch==2.4.0 numpy==$VER_NUMPY typing-extensions && \ rm -rf /root/.cache/pip; \ done - - WORKDIR /cvcuda diff --git a/docker/devel22.04/Dockerfile b/docker/devel22.04/Dockerfile index 51f9c9abe..7c660024f 100644 --- a/docker/devel22.04/Dockerfile +++ b/docker/devel22.04/Dockerfile @@ -17,6 +17,7 @@ ARG BASE_IMAGE=? ARG TAG_IMAGE=? +ARG VER_NUMPY=? 
FROM $BASE_IMAGE:$TAG_IMAGE @@ -59,14 +60,17 @@ RUN apt-get update \ && rm -rf /var/lib/apt/lists/* # needed by tests (python3 is python3.10 in ubuntu22.04) -RUN python3 -m pip install torch==1.13.0 torchvision cupy-cuda11x \ +RUN python3 -m pip install torch==2.4.0 torchvision cupy-cuda11x \ + numpy==$VER_NUMPY \ && rm -rf /root/.cache/pip -RUN python3.8 -m pip install torch==1.13.0 torchvision cupy-cuda11x \ - numpy sphinx-rtd-theme sphinx breathe exhale recommonmark graphviz \ +RUN python3.8 -m pip install torch==2.4.0 torchvision cupy-cuda11x \ + numpy sphinx-rtd-theme sphinx breathe exhale recommonmark \ + graphviz && rm -rf /root/.cache/pip +RUN python3.9 -m pip install pytest torch==2.4.0 torchvision cupy-cuda11x \ + numpy==$VER_NUMPY \ && rm -rf /root/.cache/pip -RUN python3.9 -m pip install pytest torch==1.13.0 torchvision cupy-cuda11x \ - && rm -rf /root/.cache/pip -RUN python3.11 -m pip install --upgrade pytest torch==1.13.0 cupy-cuda11x \ +RUN python3.11 -m pip install --upgrade pytest torch==2.4.0 cupy-cuda11x \ + numpy==$VER_NUMPY \ && rm -rf /root/.cache/pip WORKDIR /cvcuda diff --git a/docker/env_devel_linux.sh b/docker/env_devel_linux.sh index f031be358..106fa1dcd 100755 --- a/docker/env_devel_linux.sh +++ b/docker/env_devel_linux.sh @@ -70,5 +70,5 @@ docker run --gpus=all --net=host --pull always -ti \ -v /var/tmp:/var/tmp \ -v $SDIR/..:$HOME/cvcuda \ $extra_args \ - $IMAGE_URL_BASE/devel-linux:$VER_UBUNTU-$VER_CUDA \ + $IMAGE_URL_BASE/devel-linux:$VER_UBUNTU-$VER_CUDA-$VER_NUMPY \ /usr/bin/bash -c "mkdir -p $HOME && chown $USER:$USER $HOME && su - $USER -c \"$extra_cmds\" && su - $USER" diff --git a/docker/update_build_image.sh b/docker/update_build_image.sh index 2a5cb4494..28e6e9bcd 100755 --- a/docker/update_build_image.sh +++ b/docker/update_build_image.sh @@ -37,11 +37,12 @@ cd "$SDIR" cd build$VER_UBUNTU -image=$IMAGE_URL_BASE/build-linux:$VER_UBUNTU-$VER_CUDA +image=$IMAGE_URL_BASE/build-linux:$VER_UBUNTU-$VER_CUDA-$VER_NUMPY docker build --network=host \ --build-arg "VER_CUDA=$VER_CUDA" \ --build-arg "VER_UBUNTU=$VER_UBUNTU" \ + --build-arg "VER_NUMPY=$VER_NUMPY" \ . -t "$image" if [[ $do_push == 1 ]]; then diff --git a/docker/update_devel_image.sh b/docker/update_devel_image.sh index 1bc93a10b..65bd34fc6 100755 --- a/docker/update_devel_image.sh +++ b/docker/update_devel_image.sh @@ -35,11 +35,12 @@ cd "$SDIR" cd devel$VER_UBUNTU -image=$IMAGE_URL_BASE/devel-linux:$VER_UBUNTU-$VER_CUDA +image=$IMAGE_URL_BASE/devel-linux:$VER_UBUNTU-$VER_CUDA-$VER_NUMPY docker build --network=host \ - --build-arg BASE_IMAGE=$IMAGE_URL_BASE/build-linux \ - --build-arg TAG_IMAGE=$VER_UBUNTU-$VER_CUDA \ + --build-arg "BASE_IMAGE=$IMAGE_URL_BASE/build-linux" \ + --build-arg "TAG_IMAGE=$VER_UBUNTU-$VER_CUDA-$VER_NUMPY" \ + --build-arg "VER_NUMPY=$VER_NUMPY" \ . 
-t $image if [[ $do_push == 1 ]]; then diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt index 5ad4f5973..a43037c50 100644 --- a/docs/CMakeLists.txt +++ b/docs/CMakeLists.txt @@ -52,8 +52,12 @@ set(C_CPP_API_RST ${SPHINX_SOURCE}/_c_cpp_api) set(PY_CVCUDA_API_RST ${SPHINX_SOURCE}/_python_api/_cvcuda_api) # Start from clean directory for rst files, otherwise build could be affected due to old files -file(REMOVE_RECURSE ${C_CPP_API_RST}/*) -file(REMOVE_RECURSE ${PY_CVCUDA_API_RST}/*) +if(EXISTS ${C_CPP_API_RST}) + file(REMOVE_RECURSE ${C_CPP_API_RST}) +endif() +if(EXISTS ${PY_CVCUDA_API_RST}) + file(REMOVE_RECURSE ${PY_CVCUDA_API_RST}) +endif() # Generate rst files for groups from doxygen index.xml add_custom_target(cvcuda_groups ALL python3 ${SPHINX_SOURCE}/generate_groups.py ${C_CPP_API_RST} ${DOXYGEN_OUTPUT_DIR}/xml @@ -74,6 +78,7 @@ add_custom_command(OUTPUT ${SPHINX_INDEX_FILE} DEPENDS ${SPHINX_SOURCE}/index.rst cvcuda_doxygen cvcuda_groups + cvcuda_python_docs MAIN_DEPENDENCY ${SPHINX_SOURCE}/conf.py COMMENT "Generating documentation with Sphinx") diff --git a/docs/sphinx/gen_py_doc_rsts.py b/docs/sphinx/gen_py_doc_rsts.py deleted file mode 100644 index 320f97600..000000000 --- a/docs/sphinx/gen_py_doc_rsts.py +++ /dev/null @@ -1,231 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import re -import sys -from typing import List, Tuple - - -def exports_enum(s: str) -> bool: - return s.lstrip().startswith("py::enum_<") - - -def get_name_of_enum(s: str) -> str: - """Name of enum is first string in line""" - return re.findall('"([^"]*)"', s)[0] - - -def exports_class(s: str) -> bool: - return s.lstrip().startswith("py::class_<") - - -def get_name_of_class_if_documented(s: str) -> Tuple[bool, str]: - """ - If a class has only one strings in line, it has no documentation to be exported. 
- If it has more than one string, it has doc and first string is the title of the class - """ - found_strings = re.findall('"([^"]*)"', s) # get all strings - if len(found_strings) > 1: - return True, found_strings[0] - else: - return False, "" - - -def exports_def(s: str) -> bool: - return s.lstrip().startswith("m.def(") - - -def get_name_of_def(s: str) -> str: - """Name of def is first string in line""" - return re.findall('"([^"]*)"', s)[0] - - -def has_exports(file_path: str, export_calls: List[str]) -> bool: - with open(file_path, "r") as file_str: - file_str_read = file_str.read() - for call in export_calls: - if call in file_str_read: - export_calls.remove(call) - return True - return False - - -def create_rst_text(template_file: str, name: str, module: str, members: str) -> str: - with open(template_file, "r") as f: - rst_text = f.read() - rst_text = rst_text.replace("@OperatorName@", name) - rst_text = rst_text.replace("@=@", "=" * len(name)) - rst_text = rst_text.replace("@Module@", module) - rst_text = rst_text.replace("@MemberFunctions@", members) - return rst_text - - -def create_cvcuda_operator_rst_files( - cvcuda_path: str, outdir: str, python_cvcuda_root: str, export_calls: List[str] -) -> None: - # search for template rst file - template_rst_file_path = os.path.join( - cvcuda_path, "docs", "sphinx", "_python_api", "template.rst" - ) - if not os.path.isfile(template_rst_file_path): - raise FileNotFoundError(f"File {template_rst_file_path} not found") - - # iterate through all files - for i in sorted(os.listdir(python_cvcuda_root)): - op_file_path = os.path.join(python_cvcuda_root, i) - # Only work on .cpp files that export operators - if ( - os.path.isfile(op_file_path) - and i.endswith(".cpp") - and i != "Main.cpp" - and has_exports(op_file_path, export_calls) - ): - - # Get operator name form .cpp file: remove prefix "Op" and file type - operator_name = os.path.splitext(i)[0] - operator_name = operator_name[len("Op") :] # noqa: E203 - - # Look for functions to add to documentation - # search for all lines that start with "m.def(" (stripping leading white spaces) - # then pick first string of that line, this is the name of the python function to be exported - exports = set() - with open(op_file_path, "r") as fp: - for line in fp: - if exports_def(line): - exports.add(get_name_of_def(line)) - if len(exports) == 0: - raise RuntimeError(f"No exports found in file {op_file_path}") - exports_str = ", ".join(exports) - - # Create text to put into rst file - starting from a template - rst_text = create_rst_text( - template_rst_file_path, operator_name, "cvcuda", exports_str - ) - - # Write rst file: outdir/_op_.rst - outfile = os.path.join(outdir, f"_op_{operator_name.lower()}.rst") - with open(outfile, "w") as f: - f.write(rst_text) - return - - -def create_cvcuda_non_operator_rst_files( - cvcuda_path: str, outdir: str, python_cvcuda_root: str, export_calls: List[str] -) -> None: - # search for template rst file - template_rst_file_path = os.path.join( - cvcuda_path, "docs", "sphinx", "_python_api", "template.rst" - ) - if not os.path.isfile(template_rst_file_path): - raise FileNotFoundError(f"File {template_rst_file_path} not found") - - for i in sorted(os.listdir(python_cvcuda_root)): - nonop_file_path = os.path.join(python_cvcuda_root, i) - # Only work on .cpp files that something different than operators - if ( - os.path.isfile(nonop_file_path) - and i.endswith(".cpp") - and i != "Main.cpp" - and has_exports(nonop_file_path, export_calls) - ): - # Look for functions to add 
to documentation - # Search for all lines that start with "py::enum_<" or "py::class_<" - with open(nonop_file_path, "r") as fp: - for line in fp: - if exports_enum(line): - export = get_name_of_enum(line) - elif exports_class(line): - has_doc, name = get_name_of_class_if_documented(line) - if has_doc: - export = name - else: - continue - else: - continue - - # Create text to put into rst file - starting from a template - rst_text = create_rst_text( - template_rst_file_path, export, "cvcuda", export - ) - - # Write rst file: outdir/_aux_.rst - outfile = os.path.join(outdir, f"_aux_{export.lower()}.rst") - with open(outfile, "w") as f: - f.write(rst_text) - return - - -def export_found(s: str) -> bool: - return s.lstrip().startswith("Export") - - -def get_export_fun_name(s: str) -> str: - return s.lstrip().split("(", 1)[0] - - -def exporting_nonops(s: str) -> bool: - """Everything after that command exports auxiliary operator entities - (non-operators)""" - return s.lstrip().startswith("// doctag: Non-Operators") - - -def exporting_ops(s: str) -> bool: - """Everything after that command exports operators""" - return s.lstrip().startswith("// doctag: Operators") - - -def get_exported_cvcuda(path_to_main: str): - export_nonop = [] # list for non operators - export_op = [] # list for operators - exports = None - with open(path_to_main, "r") as fp: - for line in fp: - if export_found(line): - # remove everything after first "(" - name = get_export_fun_name(line) - try: - exports.append(name) - except AttributeError: - print( - "No comment '// doctag: Non-Operators' or '// doctag: Operators' was found in " - f"{path_to_main} prior to 'Export*(m);'-routines." - ) - sys.exit() - elif exporting_nonops(line): - exports = export_nonop - elif exporting_ops(line): - exports = export_op - assert len(export_nonop) > 0 and len(export_op) > 0 - return export_nonop, export_op - - -def generate_py_doc_rsts_cvcuda(cvcuda_path: str, outdir: str): - python_cvcuda_root = os.path.join(cvcuda_path, "python", "mod_cvcuda") - export_nonop, export_op = get_exported_cvcuda( - os.path.join(python_cvcuda_root, "Main.cpp") - ) - create_cvcuda_operator_rst_files(cvcuda_path, outdir, python_cvcuda_root, export_op) - create_cvcuda_non_operator_rst_files( - cvcuda_path, outdir, python_cvcuda_root, export_nonop - ) - return - - -if __name__ == "__main__": - outdir = sys.argv[1] # path/to/cvcuda/docs/sphinx/_python_api/_cvcuda_api - cvcuda_path = sys.argv[2] # path/to/cvcuda - os.makedirs(outdir, exist_ok=True) - generate_py_doc_rsts_cvcuda(cvcuda_path, outdir) diff --git a/docs/sphinx/index.rst b/docs/sphinx/index.rst index f228f771e..66f03f3f8 100644 --- a/docs/sphinx/index.rst +++ b/docs/sphinx/index.rst @@ -123,7 +123,7 @@ Copyright :maxdepth: 1 :hidden: - v0.10.1-beta + v0.11.0-beta v0.10.0-beta v0.9.0-beta v0.8.0-beta diff --git a/docs/sphinx/modules/c_algos.rst b/docs/sphinx/modules/c_algos.rst index 9164df2c5..8588d7c04 100644 --- a/docs/sphinx/modules/c_algos.rst +++ b/docs/sphinx/modules/c_algos.rst @@ -14,10 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -Algorithms -========== +OPERATORS +========= -CV-CUDA Algorithms +CV-CUDA Operators .. toctree:: :glob: diff --git a/docs/sphinx/modules/c_modules.rst b/docs/sphinx/modules/c_modules.rst index 6d0949a65..80cd189a5 100644 --- a/docs/sphinx/modules/c_modules.rst +++ b/docs/sphinx/modules/c_modules.rst @@ -1,5 +1,5 @@ .. - # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. + # SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -21,4 +21,4 @@ C API Core Utilities - Algorithms + Operators diff --git a/docs/sphinx/modules/cpp_algos.rst b/docs/sphinx/modules/cpp_algos.rst index c10ae2051..8588d7c04 100644 --- a/docs/sphinx/modules/cpp_algos.rst +++ b/docs/sphinx/modules/cpp_algos.rst @@ -14,10 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -ALGORITHMS -========== +OPERATORS +========= -CV-CUDA Algorithms +CV-CUDA Operators .. toctree:: :glob: diff --git a/docs/sphinx/modules/cpp_modules.rst b/docs/sphinx/modules/cpp_modules.rst index 413d08a8d..4251c0235 100644 --- a/docs/sphinx/modules/cpp_modules.rst +++ b/docs/sphinx/modules/cpp_modules.rst @@ -21,5 +21,5 @@ C++ API Core Utilities - Algorithms + Operators CUDA Tools diff --git a/docs/sphinx/modules/python_modules.rst b/docs/sphinx/modules/python_modules.rst index b76f8420b..667313cc3 100644 --- a/docs/sphinx/modules/python_modules.rst +++ b/docs/sphinx/modules/python_modules.rst @@ -22,4 +22,5 @@ Python API :maxdepth: 3 Core - Algorithms + Operators + Operator's auxiliary entities diff --git a/docs/sphinx/modules/python_algos.rst b/docs/sphinx/modules/python_ops.rst similarity index 88% rename from docs/sphinx/modules/python_algos.rst rename to docs/sphinx/modules/python_ops.rst index bdeb5d6c3..7769d72dd 100644 --- a/docs/sphinx/modules/python_algos.rst +++ b/docs/sphinx/modules/python_ops.rst @@ -14,13 +14,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -ALGORITHMS -========== +OPERATORS +========= -Algorithms for the NVIDIA® CV-CUDA library. +Operators for the NVIDIA® CV-CUDA library. .. toctree:: :glob: ../_python_api/_cvcuda_api/_op_* - ../_python_api/_cvcuda_api/_aux_* diff --git a/docs/sphinx/modules/python_ops_aux.rst b/docs/sphinx/modules/python_ops_aux.rst new file mode 100644 index 000000000..7f6c3ff8b --- /dev/null +++ b/docs/sphinx/modules/python_ops_aux.rst @@ -0,0 +1,25 @@ +.. + # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-License-Identifier: Apache-2.0 + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + +OPERATOR'S AUXILIARY ENTITIES +============================= + +Operator's auxiliary entities for the NVIDIA® CV-CUDA library. + +.. 
toctree:: + :glob: + + ../_python_api/_cvcuda_api/_aux_* diff --git a/docs/sphinx/relnotes/v0.10.0-beta.rst b/docs/sphinx/relnotes/v0.10.0-beta.rst index 7f446dd77..17488f9dc 100644 --- a/docs/sphinx/relnotes/v0.10.0-beta.rst +++ b/docs/sphinx/relnotes/v0.10.0-beta.rst @@ -26,14 +26,14 @@ CV-CUDA v0.10.0 includes a critical bug fix (cache growth management) alongside * **New Features**: - * Added mechanism to limit and manage cache memory consumption (includes new "Best Practices" documentation). + * Added mechanism to limit and manage cache memory consumption (includes new "Best Practices" documentation) [1]_. * Performance improvements of color conversion operators (e.g., 2x faster RGB2YUV). * Refactored codebase to allow independent build of NVCV library (data structures). * **Bug Fixes**: - * Fixed unbounded cache memory consumption issue. - * Improved management of Python-created object lifetimes, decoupled from cache management. + * Fixed unbounded cache memory consumption issue [1]_. + * Improved management of Python-created object lifetimes, decoupled from cache management [1]_. * Fixed potential crash in Resize operator's linear and nearest neighbor interpolation from non-aligned vectorized writes. * Fixed Python CvtColor operator to correctly handle NV12 and NV21 outputs. * Fixed Resize and RandomResizedCrop linear interpolation weight for border rows and columns. @@ -68,3 +68,5 @@ Acknowledgements ---------------- CV-CUDA is developed jointly by NVIDIA and the ByteDance Machine Learning team. + +.. [1] These fixes and features add micro-second-level overhead to Python operator calls. Based on the performance analysis of our Python samples, we expect the production- and pipeline-level impact to be negligible. CUDA kernel and C++ call performance is not affected. We aim to investigate and reduce this overhead further in a future release.​ diff --git a/docs/sphinx/relnotes/v0.11.0-beta.rst b/docs/sphinx/relnotes/v0.11.0-beta.rst new file mode 100644 index 000000000..9f50e975c --- /dev/null +++ b/docs/sphinx/relnotes/v0.11.0-beta.rst @@ -0,0 +1,84 @@ +.. + # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + # SPDX-License-Identifier: Apache-2.0 + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + +.. _v0.11.0-beta: + +v0.11.0-beta +============ + +Release Highlights +------------------ + +CV-CUDA v0.11.0 includes critical bug fixes alongside the following changes:​ + +* **New Features**:​ + + * Enable NVCV to be built as static library​ + * Improve Python doc generation and structure ​ + +* **Bug Fixes**:​ + + * Update pybind11 2.10.0 to 2.13.1. Fixes rare race conditions with Python garbage collector, adds compatibility with numpy2​ + +Compatibility and Known Limitations +----------------------------------- + +* **Pre-existing limitations**: + + * The CvtColor operator incorrectly computes the data location of the second chromaticity channel for conversions that involve YUV(420) semi-planar formats. 
This issue persists through the current release and we intend to address this bug in CV-CUDA v0.12. We do not recommend using these formats.​ + + * Known affected formats:​ + * NVCV_COLOR_YUV2RGB_I420​ + * NVCV_COLOR_RGB2YUV_I420​ + * NVCV_COLOR_YUV2BGR_I420​ + * NVCV_COLOR_BGR2YUV_I420​ + * NVCV_COLOR_YUV2RGBA_I420​ + * NVCV_COLOR_RGBA2YUV_I420​ + * NVCV_COLOR_YUV2BGRA_I420​ + * NVCV_COLOR_BGRA2YUV_I420​ + * NVCV_COLOR_RGB2YUV_I420​ + * NVCV_COLOR_YUV2RGB_YV12​ + * NVCV_COLOR_RGB2YUV_YV12​ + * NVCV_COLOR_YUV2BGR_YV12​ + * NVCV_COLOR_BGR2YUV_YV12​ + * NVCV_COLOR_YUV2RGBA_YV12​ + * NVCV_COLOR_RGBA2YUV_YV12​ + * NVCV_COLOR_YUV2BGRA_YV12​ + * NVCV_COLOR_BGRA2YUV_YV12​ + * NVCV_COLOR_RGB2YUV_YV12​ + * NVCV_COLOR_YUV2GRAY_420​ + +For the full list, see main README on `CV-CUDA GitHub `_. + +License +------- + +CV-CUDA is licensed under the `Apache 2.0 `_ license. + +Resources +--------- + +1. `CV-CUDA GitHub `_ +2. `CV-CUDA Increasing Throughput and Reducing Costs for AI-Based Computer Vision with CV-CUDA `_ +3. `NVIDIA Announces Microsoft, Tencent, Baidu Adopting CV-CUDA for Computer Vision AI `_ +4. `CV-CUDA helps Tencent Cloud audio and video PaaS platform achieve full-process GPU acceleration for video enhancement AI `_ + +Acknowledgements +---------------- + +CV-CUDA is developed jointly by NVIDIA and the ByteDance Machine Learning team. + +.. [1] These fixes and features add micro-second-level overhead to Python operator calls. Based on the performance analysis of our Python samples, we expect the production- and pipeline-level impact to be negligible. CUDA kernel and C++ call performance is not affected. We aim to investigate and reduce this overhead further in a future release.​ diff --git a/python/mod_cvcuda/CvtColorUtil.cpp b/python/mod_cvcuda/CvtColorUtil.cpp index 5d4df214e..5f80c08bf 100644 --- a/python/mod_cvcuda/CvtColorUtil.cpp +++ b/python/mod_cvcuda/CvtColorUtil.cpp @@ -296,7 +296,7 @@ int64_t GetOutputHeight(int64_t height, NVCVColorConversionCode code) } } -nvcv::TensorShape GetOutputTensorShape(nvcv::TensorShape inputShape, nvcv::ImageFormat outputFormat, +nvcv::TensorShape GetOutputTensorShape(const nvcv::TensorShape &inputShape, nvcv::ImageFormat outputFormat, NVCVColorConversionCode code) { if (inputShape.rank() < 3 || inputShape.rank() > 4) diff --git a/python/mod_cvcuda/CvtColorUtil.hpp b/python/mod_cvcuda/CvtColorUtil.hpp index caf277375..17203d1ed 100644 --- a/python/mod_cvcuda/CvtColorUtil.hpp +++ b/python/mod_cvcuda/CvtColorUtil.hpp @@ -26,7 +26,7 @@ nvcv::ImageFormat GetOutputFormat(nvcv::DataType in, NVCVColorConversionCode cod int64_t GetOutputHeight(int64_t inputHeight, NVCVColorConversionCode code); -nvcv::TensorShape GetOutputTensorShape(nvcv::TensorShape inputShape, nvcv::ImageFormat outputFormat, +nvcv::TensorShape GetOutputTensorShape(const nvcv::TensorShape &inputShape, nvcv::ImageFormat outputFormat, NVCVColorConversionCode code); #endif // NVCV_COLOR_CONVERSION_UTIL_HPP diff --git a/python/mod_nvcv/Cache.cpp b/python/mod_nvcv/Cache.cpp index ecc854c8f..fa62b5e70 100644 --- a/python/mod_nvcv/Cache.cpp +++ b/python/mod_nvcv/Cache.cpp @@ -85,6 +85,7 @@ struct Cache::Impl std::mutex mtx; Items items; int64_t cache_limit_inbytes; + int64_t current_size_inbytes = 0; }; Cache::Cache() @@ -102,12 +103,15 @@ void Cache::add(CacheItem &item) return; } - if (item.GetSizeInBytes() + doCurrentSizeInBytes() > doGetCacheLimit()) + if (item.GetSizeInBytes() + doGetCurrentSizeInBytes() > doGetCacheLimit()) { - savedItems = std::move(pimpl->items); + // we clear the 
cache: all pimpl->items will be dtor'ed at the end of scope of savedItems and cache size will be reset to 0 + savedItems = std::move(pimpl->items); + pimpl->current_size_inbytes = 0; } pimpl->items.emplace(&item.key(), item.shared_from_this()); + pimpl->current_size_inbytes += item.GetSizeInBytes(); } } @@ -138,6 +142,7 @@ void Cache::removeAllNotInUseMatching(const IKey &key) if (!it->second->isInUse()) { holdItemsUntilMtxUnlocked.push_back(it->second); + pimpl->current_size_inbytes -= it->second->GetSizeInBytes(); pimpl->items.erase(it++); } else @@ -202,11 +207,12 @@ std::shared_ptr Cache::fetchOne(const IKey &key) const void Cache::clear() { - Items savedItems; - std::unique_lock lk(pimpl->mtx); - savedItems = std::move(pimpl->items); - lk.unlock(); - savedItems.clear(); + Items savedItems; + { + std::unique_lock lk(pimpl->mtx); + savedItems = std::move(pimpl->items); + pimpl->current_size_inbytes = 0; + } } size_t Cache::size() const @@ -236,9 +242,11 @@ void Cache::setCacheLimit(int64_t new_cache_limit_inbytes) Items savedItems; { std::unique_lock lk(pimpl->mtx); - if (doCurrentSizeInBytes() > new_cache_limit_inbytes) + if (doGetCurrentSizeInBytes() > new_cache_limit_inbytes) { - savedItems = std::move(pimpl->items); + // we clear the cache: all pimpl->items will be dtor'ed at the end of scope of savedItems and cache size will be reset to 0 + savedItems = std::move(pimpl->items); + pimpl->current_size_inbytes = 0; } pimpl->cache_limit_inbytes = new_cache_limit_inbytes; } @@ -258,19 +266,12 @@ int64_t Cache::doGetCacheLimit() const int64_t Cache::getCurrentSizeInBytes() { std::unique_lock lk(pimpl->mtx); - return doCurrentSizeInBytes(); + return doGetCurrentSizeInBytes(); } -int64_t Cache::doCurrentSizeInBytes() const +int64_t Cache::doGetCurrentSizeInBytes() const { - int64_t current_size_inbytes = 0; - - for (auto it = pimpl->items.begin(); it != pimpl->items.end(); ++it) - { - current_size_inbytes += it->second->GetSizeInBytes(); - } - - return current_size_inbytes; + return pimpl->current_size_inbytes; } void Cache::doIterateThroughItems(const std::function &fn) const diff --git a/python/mod_nvcv/Cache.hpp b/python/mod_nvcv/Cache.hpp index db29a2b39..55cd60a5f 100644 --- a/python/mod_nvcv/Cache.hpp +++ b/python/mod_nvcv/Cache.hpp @@ -135,7 +135,7 @@ class PYBIND11_EXPORT Cache Cache(); void doIterateThroughItems(const std::function &fn) const; - int64_t doCurrentSizeInBytes() const; + int64_t doGetCurrentSizeInBytes() const; int64_t doGetCacheLimit() const; }; diff --git a/python/setup.py.in b/python/setup.py.in index b88d9bc74..c22e9d0ff 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -66,7 +66,7 @@ setup( "": ["*.so", "cvcuda.libs/*.*"] }, # Includes the binding .so + core .so files include_package_data=True, - install_requires=["numpy>=1.23.5,<2.0.0"], + install_requires=["numpy>=1.23.5"], python_requires="==${PYTHON_VERSION}.*", zip_safe=False, cmdclass={ diff --git a/samples/scripts/run_samples.sh b/samples/scripts/run_samples.sh index f7f17b5b2..e207fd303 100755 --- a/samples/scripts/run_samples.sh +++ b/samples/scripts/run_samples.sh @@ -136,3 +136,4 @@ python3 $SAMPLES_DIR/label/python/main.py -o "$LABEL_RUN_DIR" # Run it with batch size 1 on a single image LABEL_RUN_DIR=$(create_output_dir "$LABEL_OUT_DIR") python3 $SAMPLES_DIR/label/python/main.py -i $SAMPLES_DIR/assets/images/peoplenet.jpg -b 1 -o "$LABEL_RUN_DIR" + diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c4b4001d9..a4a5696f5 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ 
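
The Cache.cpp changes above replace an O(n) recomputation of the cache size with a running byte counter that is updated under the same mutex guarding the item map, and evicted items are destroyed only after the lock is released. A minimal sketch of that pattern, with illustrative names rather than the actual NVCV classes:

    #include <cstdint>
    #include <memory>
    #include <mutex>
    #include <unordered_map>
    #include <vector>

    // Illustrative cache that tracks its payload size incrementally instead of
    // summing over all items on every query (names are not the actual NVCV types).
    class ByteBudgetCache
    {
    public:
        using Item = std::shared_ptr<std::vector<uint8_t>>;

        void add(const void *key, Item item, int64_t limitBytes)
        {
            std::unordered_map<const void *, Item> evicted; // destroyed after the lock is released
            {
                std::lock_guard<std::mutex> lk(m_mtx);
                const int64_t itemBytes = static_cast<int64_t>(item->size());
                if (m_sizeBytes + itemBytes > limitBytes)
                {
                    evicted     = std::move(m_items); // evict everything, as Cache::add does above
                    m_sizeBytes = 0;
                }
                m_items.emplace(key, std::move(item));
                m_sizeBytes += itemBytes;
            }
            // `evicted` runs the item destructors here, without holding m_mtx.
        }

        int64_t sizeBytes() const
        {
            std::lock_guard<std::mutex> lk(m_mtx);
            return m_sizeBytes; // O(1), matching the role of the new doGetCurrentSizeInBytes()
        }

    private:
        mutable std::mutex m_mtx;
        std::unordered_map<const void *, Item> m_items;
        int64_t m_sizeBytes = 0;
    };
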
-21,5 +21,8 @@ if(LTO_ENABLED) set(CMAKE_INTERPROCEDURAL_OPTIMIZATION on) endif() +# Setting NVCV lib to be a shared lib +set(NVCV_BUILD_SHARED_LIBS ON) + add_subdirectory(nvcv) add_subdirectory(cvcuda) diff --git a/src/cvcuda/priv/legacy/CvCudaLegacyHelpers.cpp b/src/cvcuda/priv/legacy/CvCudaLegacyHelpers.cpp index 6bac8da17..edcf1f0c6 100644 --- a/src/cvcuda/priv/legacy/CvCudaLegacyHelpers.cpp +++ b/src/cvcuda/priv/legacy/CvCudaLegacyHelpers.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -284,6 +284,8 @@ NVCVStatus TranslateError(legacy::cuda_op::ErrorCode err) switch (err) { + case ErrorCode::SUCCESS: + return NVCV_SUCCESS; case ErrorCode::INVALID_PARAMETER: case ErrorCode::INVALID_DATA_FORMAT: case ErrorCode::INVALID_DATA_SHAPE: diff --git a/src/nvcv/CMakeLists.txt b/src/nvcv/CMakeLists.txt index 7dd2a0674..f5f544ff0 100644 --- a/src/nvcv/CMakeLists.txt +++ b/src/nvcv/CMakeLists.txt @@ -17,14 +17,24 @@ cmake_minimum_required(VERSION 3.20.1) project(nvcv LANGUAGES C CXX - VERSION 0.10.1 + VERSION 0.11.0 DESCRIPTION "NVCV is NVIDIA Computer Vision library" ) # Used when creating special builds set(PROJECT_VERSION_SUFFIX "-beta") -option(NVCV_ENABLE_INSTALL "Enables creation of NVCV installers using cpack" ON) +include(CMakeDependentOption) + +option(NVCV_BUILD_SHARED_LIBS "Builds NVCV as shared library" ON) + +if(NVCV_ENABLE_INSTALL AND NOT NVCV_BUILD_SHARED_LIBS) + message(FATAL_ERROR "NVCV installers can only be enabled when building NVCV as a shared library") +endif() + +# Enables install targets by default if building shared libs. User can disable it. +# If building static libs, forcibly disables install targets. 
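
The CvCudaLegacyHelpers.cpp change above adds an explicit SUCCESS case so a successful legacy error code is no longer routed through the failure path. A standalone sketch of the mapping that the new CheckLegacyTranslateError tests further below exercise (the enum values mirror the diff; the surrounding scaffolding is simplified):

    // Simplified stand-ins for the legacy error enum and the public NVCV status codes.
    enum class ErrorCode
    {
        SUCCESS,
        INVALID_PARAMETER,
        INVALID_DATA_FORMAT,
        INVALID_DATA_SHAPE,
        INVALID_DATA_TYPE
    };

    enum NVCVStatus
    {
        NVCV_SUCCESS,
        NVCV_ERROR_INVALID_ARGUMENT,
        NVCV_ERROR_INTERNAL
    };

    NVCVStatus TranslateError(ErrorCode err)
    {
        switch (err)
        {
        case ErrorCode::SUCCESS:
            return NVCV_SUCCESS; // the new case: success must not be reported as an error
        case ErrorCode::INVALID_PARAMETER:
        case ErrorCode::INVALID_DATA_FORMAT:
        case ErrorCode::INVALID_DATA_SHAPE:
        case ErrorCode::INVALID_DATA_TYPE:
            return NVCV_ERROR_INVALID_ARGUMENT;
        default:
            return NVCV_ERROR_INTERNAL;
        }
    }
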
+cmake_dependent_option(NVCV_ENABLE_INSTALL "Enables creation of NVCV installers using cpack" ON NVCV_BUILD_SHARED_LIBS OFF) # Configure build tree ====================== @@ -33,8 +43,5 @@ list(PREPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") include(ConfigVersion) include(ConfigBuildTree) -# NVCV currently supports only shared build -set(CMAKE_POSITION_INDEPENDENT_CODE on) - add_subdirectory(util) add_subdirectory(src) diff --git a/src/nvcv/cmake/ConfigBuildTree.cmake b/src/nvcv/cmake/ConfigBuildTree.cmake index ab8082ba2..9b23c6565 100644 --- a/src/nvcv/cmake/ConfigBuildTree.cmake +++ b/src/nvcv/cmake/ConfigBuildTree.cmake @@ -22,6 +22,14 @@ endif() set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) +# PIC must be enabled if building a shared lib +if(NVCV_BUILD_SHARED_LIBS) + set(BUILD_SHARED_LIBS ON) + set(CMAKE_POSITION_INDEPENDENT_CODE on) +else() + set(BUILD_SHARED_LIBS OFF) +endif() + include(GNUInstallDirs) set(CMAKE_INSTALL_LIBDIR "lib/${CMAKE_LIBRARY_ARCHITECTURE}") diff --git a/src/nvcv/src/CMakeLists.txt b/src/nvcv/src/CMakeLists.txt index 5253b21d3..fef54d274 100644 --- a/src/nvcv/src/CMakeLists.txt +++ b/src/nvcv/src/CMakeLists.txt @@ -16,7 +16,7 @@ # nvcv_types private implementation add_subdirectory(priv) -add_library(nvcv_types SHARED +add_library(nvcv_types Config.cpp Version.cpp Status.cpp @@ -49,8 +49,14 @@ target_include_directories(nvcv_types ) configure_version(nvcv_types NVCV nvcv ${PROJECT_VERSION}) + configure_symbol_versioning(nvcv_types NVCV nvcv_types) -setup_dso(nvcv_types ${PROJECT_VERSION}) + +if(BUILD_SHARED_LIBS) + setup_dso(nvcv_types ${PROJECT_VERSION}) +else() + target_compile_definitions(nvcv_types PRIVATE -DNVCV_STATIC=1) +endif() target_compile_definitions(nvcv_types PRIVATE -DNVCV_EXPORTING=1) diff --git a/src/nvcv/src/include/nvcv/Export.h b/src/nvcv/src/include/nvcv/Export.h index 611bcf4f7..53581b0fd 100644 --- a/src/nvcv/src/include/nvcv/Export.h +++ b/src/nvcv/src/include/nvcv/Export.h @@ -34,7 +34,11 @@ # endif #else # if __GNUC__ >= 4 -# define NVCV_PUBLIC __attribute__((visibility("default"))) +# ifdef NVCV_STATIC +# define NVCV_PUBLIC __attribute__((visibility("hidden"))) +# else +# define NVCV_PUBLIC __attribute__((visibility("default"))) +# endif # else # define NVCV_PUBLIC # endif diff --git a/src/nvcv/util/CMakeLists.txt b/src/nvcv/util/CMakeLists.txt index 748cc9843..1287b2ff4 100644 --- a/src/nvcv/util/CMakeLists.txt +++ b/src/nvcv/util/CMakeLists.txt @@ -16,7 +16,7 @@ find_package(CUDAToolkit REQUIRED) # nvcv_util_sanitizer --------------------------------- -add_library(nvcv_util_sanitizer +add_library(nvcv_util_sanitizer STATIC SanitizerOptions.c ) @@ -176,7 +176,7 @@ target_include_directories(nvcv_util_symver ) # nvcv_util --------------------------------- -add_library(nvcv_util +add_library(nvcv_util STATIC Assert.cpp CheckError.cpp String.cpp diff --git a/tests/cvcuda/python/test_opmatch.py b/tests/cvcuda/python/test_opmatch.py index 2d29fd812..81133fe93 100644 --- a/tests/cvcuda/python/test_opmatch.py +++ b/tests/cvcuda/python/test_opmatch.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -39,9 +39,21 @@ def distance(p1, p2, norm_type): if norm_type == cvcuda.Norm.HAMMING: return sum([bin(c1 ^ c2).count("1") for c1, c2 in zip(p1, p2)]) elif norm_type == cvcuda.Norm.L1: - return sum([abs(ref.absdiff(c1, c2)) for c1, c2 in zip(p1, p2)]) + return sum( + [ + abs(ref.absdiff(c1.astype(np.float64), c2.astype(np.float64))) + for c1, c2 in zip(p1, p2) + ] + ) elif norm_type == cvcuda.Norm.L2: - return np.sqrt(sum([ref.absdiff(c1, c2) ** 2 for c1, c2 in zip(p1, p2)])) + return np.sqrt( + sum( + [ + ref.absdiff(c1.astype(np.float64), c2.astype(np.float64)) ** 2 + for c1, c2 in zip(p1, p2) + ] + ) + ) def brute_force_matcher(batch_set1, batch_set2, cross_check, norm_type): batch_matches = [] diff --git a/tests/cvcuda/system/TestOpHQResize.cpp b/tests/cvcuda/system/TestOpHQResize.cpp index c5953b528..39e634015 100644 --- a/tests/cvcuda/system/TestOpHQResize.cpp +++ b/tests/cvcuda/system/TestOpHQResize.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -45,12 +46,29 @@ using uniform_distribution namespace baseline { +template +struct Roi +{ + using ShapeT = cuda::MakeType; + ShapeT origin; + ShapeT shape; +}; + +template +Roi FullRoi(typename Roi::ShapeT shape) +{ + Roi roi; + roi.origin = decltype(roi.origin){0}; + roi.shape = shape; + return roi; +} + template -void ForAll(int2 shape, Cb &&cb) +void ForAllInRoi(Roi<2> roi, Cb &&cb) { - for (int y = 0; y < shape.y; y++) + for (int y = roi.origin.y; y < roi.origin.y + roi.shape.y; y++) { - for (int x = 0; x < shape.x; x++) + for (int x = roi.origin.x; x < roi.origin.x + roi.shape.x; x++) { cb(int2{x, y}); } @@ -58,16 +76,18 @@ void ForAll(int2 shape, Cb &&cb) } template -void ForAll(int3 shape, Cb &&cb) +void ForAllInRoi(Roi<3> roi, Cb &&cb) { - for (int z = 0; z < shape.z; z++) - for (int y = 0; y < shape.y; y++) + for (int z = roi.origin.z; z < roi.origin.z + roi.shape.z; z++) + { + for (int y = roi.origin.y; y < roi.origin.y + roi.shape.y; y++) { - for (int x = 0; x < shape.x; x++) + for (int x = roi.origin.x; x < roi.origin.x + roi.shape.x; x++) { cb(int3{x, y, z}); } } + } } template @@ -88,7 +108,7 @@ struct CpuSample BT &get(int sampleIdx, const ShapeT idx, int channel) { - return *(reinterpret_cast(m_data.data() + offset(sampleIdx, idx)) + channel); + return *(reinterpret_cast(data() + offset(sampleIdx, idx)) + channel); } uint8_t *data() @@ -268,7 +288,8 @@ struct Filter }; template -void RunNN(int axis, CpuSample &outTensorCpu, CpuSample &inTensorCpu) +void RunNN(int axis, CpuSample &outTensorCpu, CpuSample &inTensorCpu, + Roi roi) { const int numSamples = inTensorCpu.numSamples(); const int numChannels = inTensorCpu.numChannels(); @@ -280,24 +301,25 @@ void RunNN(int axis, CpuSample &outTensorCpu, CpuSample outIdx) - { - auto inIdx = outIdx; - int inAxis = std::floor(cuda::GetElement(outIdx, axis) * axisScale + axisOrigin); - inAxis = inAxis < 0 ? 0 : (inAxis > inSize - 1 ? inSize - 1 : inAxis); - cuda::GetElement(inIdx, axis) = inAxis; - for (int c = 0; c < numChannels; c++) - { - outTensorCpu.get(sampleIdx, outIdx, c) - = cuda::SaturateCast(inTensorCpu.get(sampleIdx, inIdx, c)); - } - }); + ForAllInRoi(roi, + [&](const cuda::MakeType outIdx) + { + auto inIdx = outIdx; + int inAxis = std::floor(cuda::GetElement(outIdx, axis) * axisScale + axisOrigin); + inAxis = inAxis < 0 ? 0 : (inAxis > inSize - 1 ? 
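
The test_opmatch.py change above casts descriptor elements to float64 before differencing because the descriptors are unsigned 8-bit NumPy arrays, where subtraction would otherwise wrap around. The same pitfall and fix expressed as a small C++ sketch (illustrative, not CV-CUDA code):

    #include <cmath>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // L1 and L2 distances between two uint8 descriptors. Widening to double before
    // subtracting avoids the unsigned wrap-around that motivated the float64 cast
    // in the Python test above.
    double DistanceL1(const std::vector<uint8_t> &a, const std::vector<uint8_t> &b)
    {
        double d = 0;
        for (std::size_t i = 0; i < a.size(); ++i)
        {
            d += std::abs(static_cast<double>(a[i]) - static_cast<double>(b[i]));
        }
        return d;
    }

    double DistanceL2(const std::vector<uint8_t> &a, const std::vector<uint8_t> &b)
    {
        double d = 0;
        for (std::size_t i = 0; i < a.size(); ++i)
        {
            const double diff = static_cast<double>(a[i]) - static_cast<double>(b[i]);
            d += diff * diff;
        }
        return std::sqrt(d);
    }
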
inSize - 1 : inAxis); + cuda::GetElement(inIdx, axis) = inAxis; + for (int c = 0; c < numChannels; c++) + { + outTensorCpu.get(sampleIdx, outIdx, c) + = cuda::SaturateCast(inTensorCpu.get(sampleIdx, inIdx, c)); + } + }); } } template -void RunLinear(int axis, CpuSample &outTensorCpu, CpuSample &inTensorCpu) +void RunLinear(int axis, CpuSample &outTensorCpu, CpuSample &inTensorCpu, + Roi roi) { const int numSamples = inTensorCpu.numSamples(); const int numChannels = inTensorCpu.numChannels(); @@ -309,33 +331,33 @@ void RunLinear(int axis, CpuSample &outTensorCpu, CpuSample const float axisOrigin = 0.5f * axisScale - 0.5f; for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) { - ForAll(outShape, - [&](const cuda::MakeType outIdx) - { - const float inAxis0f = cuda::GetElement(outIdx, axis) * axisScale + axisOrigin; - int inAxis0 = std::floor(inAxis0f); - int inAxis1 = inAxis0 + 1; - const float q = inAxis0f - inAxis0; - inAxis0 = inAxis0 < 0 ? 0 : (inAxis0 > inSize - 1 ? inSize - 1 : inAxis0); - inAxis1 = inAxis1 < 0 ? 0 : (inAxis1 > inSize - 1 ? inSize - 1 : inAxis1); - auto inIdx0 = outIdx; - auto inIdx1 = outIdx; - cuda::GetElement(inIdx0, axis) = inAxis0; - cuda::GetElement(inIdx1, axis) = inAxis1; - for (int c = 0; c < numChannels; c++) - { - const float a = inTensorCpu.get(sampleIdx, inIdx0, c); - const float b = inTensorCpu.get(sampleIdx, inIdx1, c); - const float tmp = b - a; - outTensorCpu.get(sampleIdx, outIdx, c) = cuda::SaturateCast(std::fmaf(tmp, q, a)); - } - }); + ForAllInRoi(roi, + [&](const cuda::MakeType outIdx) + { + const float inAxis0f = cuda::GetElement(outIdx, axis) * axisScale + axisOrigin; + int inAxis0 = std::floor(inAxis0f); + int inAxis1 = inAxis0 + 1; + const float q = inAxis0f - inAxis0; + inAxis0 = inAxis0 < 0 ? 0 : (inAxis0 > inSize - 1 ? inSize - 1 : inAxis0); + inAxis1 = inAxis1 < 0 ? 0 : (inAxis1 > inSize - 1 ? inSize - 1 : inAxis1); + auto inIdx0 = outIdx; + auto inIdx1 = outIdx; + cuda::GetElement(inIdx0, axis) = inAxis0; + cuda::GetElement(inIdx1, axis) = inAxis1; + for (int c = 0; c < numChannels; c++) + { + const float a = inTensorCpu.get(sampleIdx, inIdx0, c); + const float b = inTensorCpu.get(sampleIdx, inIdx1, c); + const float tmp = b - a; + outTensorCpu.get(sampleIdx, outIdx, c) = cuda::SaturateCast(std::fmaf(tmp, q, a)); + } + }); } } template void RunFilter(int axis, CpuSample &outTensorCpu, CpuSample &inTensorCpu, - const FilterT &filter) + const FilterT &filter, Roi roi) { const int numSamples = inTensorCpu.numSamples(); const int numChannels = inTensorCpu.numChannels(); @@ -350,36 +372,36 @@ void RunFilter(int axis, CpuSample &outTensorCpu, CpuSample for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) { - ForAll(outShape, - [&](const cuda::MakeType outIdx) - { - const float inAxis0f = cuda::GetElement(outIdx, axis) * axisScale + axisOrigin; - int inAxis0 = std::ceil(inAxis0f); - const float fStart = (inAxis0 - inAxis0f) * filterStep; - for (int c = 0; c < numChannels; c++) - { - float tmp = 0; - float norm = 0; - for (int k = 0; k < filterSupport; k++) - { - int inAxis = inAxis0 + k; - inAxis = inAxis < 0 ? 0 : (inAxis > inSize - 1 ? 
inSize - 1 : inAxis); - auto inIdx = outIdx; - cuda::GetElement(inIdx, axis) = inAxis; - const InBT inVal = inTensorCpu.get(sampleIdx, inIdx, c); - float coeff = filter(fStart + k * filterStep); - tmp = std::fmaf(inVal, coeff, tmp); - norm += coeff; - } - outTensorCpu.get(sampleIdx, outIdx, c) = cuda::SaturateCast(tmp / norm); - } - }); + ForAllInRoi(roi, + [&](const cuda::MakeType outIdx) + { + const float inAxis0f = cuda::GetElement(outIdx, axis) * axisScale + axisOrigin; + int inAxis0 = std::ceil(inAxis0f); + const float fStart = (inAxis0 - inAxis0f) * filterStep; + for (int c = 0; c < numChannels; c++) + { + float tmp = 0; + float norm = 0; + for (int k = 0; k < filterSupport; k++) + { + int inAxis = inAxis0 + k; + inAxis = inAxis < 0 ? 0 : (inAxis > inSize - 1 ? inSize - 1 : inAxis); + auto inIdx = outIdx; + cuda::GetElement(inIdx, axis) = inAxis; + const InBT inVal = inTensorCpu.get(sampleIdx, inIdx, c); + float coeff = filter(fStart + k * filterStep); + tmp = std::fmaf(inVal, coeff, tmp); + norm += coeff; + } + outTensorCpu.get(sampleIdx, outIdx, c) = cuda::SaturateCast(tmp / norm); + } + }); } } template void RunFilter(int axis, CpuSample &outTensorCpu, CpuSample &inTensorCpu, - const NVCVInterpolationType interpolation, bool antialias) + const NVCVInterpolationType interpolation, bool antialias, Roi roi) { const auto inShape = inTensorCpu.shape(); const auto outShape = outTensorCpu.shape(); @@ -391,28 +413,28 @@ void RunFilter(int axis, CpuSample &outTensorCpu, CpuSample { float radius = antialias ? inSize / outSize : 1; float support = std::max(1.0f, 2 * radius); - RunFilter(axis, outTensorCpu, inTensorCpu, Filter{support}); + RunFilter(axis, outTensorCpu, inTensorCpu, Filter{support}, roi); } break; case NVCV_INTERP_CUBIC: { float radius = antialias ? (2 * inSize / outSize) : 2; float support = std::max(4.0f, 2 * radius); - RunFilter(axis, outTensorCpu, inTensorCpu, Filter{support}); + RunFilter(axis, outTensorCpu, inTensorCpu, Filter{support}, roi); } break; case NVCV_INTERP_GAUSSIAN: { float radius = antialias ? inSize / outSize : 1; float support = std::max(1.0f, 2 * radius); - RunFilter(axis, outTensorCpu, inTensorCpu, Filter{support}); + RunFilter(axis, outTensorCpu, inTensorCpu, Filter{support}, roi); } break; case NVCV_INTERP_LANCZOS: { float radius = antialias ? 
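
The RunFilter baseline above computes each output sample as a normalized weighted sum over a support window, clamping indices at the borders and widening the kernel radius by the downscaling factor when antialiasing is enabled. A self-contained 1D sketch of the same scheme with a triangle (linear) filter, under the assumption that clamp-to-edge addressing matches the test's border handling:

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // Resample one row with a normalized triangle filter and clamp-to-edge addressing.
    // `antialias` widens the kernel by the downscaling factor so no input sample is skipped.
    std::vector<float> ResampleRow(const std::vector<float> &in, int outSize, bool antialias)
    {
        const int   inSize  = static_cast<int>(in.size());
        const float scale   = static_cast<float>(inSize) / outSize;
        const float radius  = antialias ? std::max(1.0f, scale) : 1.0f;
        const int   support = static_cast<int>(std::ceil(2 * radius));

        auto triangle = [&](float x) { return std::max(0.0f, 1.0f - std::abs(x) / radius); };

        std::vector<float> out(outSize);
        for (int o = 0; o < outSize; ++o)
        {
            const float center = (o + 0.5f) * scale - 0.5f; // input coordinate of this output sample
            const int   start  = static_cast<int>(std::ceil(center - radius));
            float       acc = 0, norm = 0;
            for (int k = 0; k <= support; ++k)
            {
                const int   idx   = std::clamp(start + k, 0, inSize - 1); // clamp-to-edge, as in the test
                const float coeff = triangle(start + k - center);
                acc  += coeff * in[idx];
                norm += coeff;
            }
            out[o] = acc / norm; // normalized so the weights always sum to 1
        }
        return out;
    }
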
(3 * inSize / outSize) : 3; float support = std::max(6.0f, 2 * radius); - RunFilter(axis, outTensorCpu, inTensorCpu, Filter{support}); + RunFilter(axis, outTensorCpu, inTensorCpu, Filter{support}, roi); } break; default: @@ -422,7 +444,8 @@ void RunFilter(int axis, CpuSample &outTensorCpu, CpuSample template void RunPass(int axis, CpuSample &outTensorCpu, CpuSample &inTensorCpu, - const NVCVInterpolationType minInterpolation, const NVCVInterpolationType magInterpolation, bool antialias) + const NVCVInterpolationType minInterpolation, const NVCVInterpolationType magInterpolation, bool antialias, + Roi roi) { const auto inShape = inTensorCpu.shape(); @@ -435,63 +458,80 @@ void RunPass(int axis, CpuSample &outTensorCpu, CpuSample void Resize(CpuSample &refTensorCpu, CpuSample &inTensorCpu, - const NVCVInterpolationType minInterpolation, const NVCVInterpolationType magInterpolation, bool antialias) + const NVCVInterpolationType minInterpolation, const NVCVInterpolationType magInterpolation, bool antialias, + std::optional> inRoiArg = std::nullopt, std::optional> outRoiArg = std::nullopt) { - int numSamples = inTensorCpu.numSamples(); - int numChannels = inTensorCpu.numChannels(); - const int2 inShape = inTensorCpu.shape(); - const int2 outShape = refTensorCpu.shape(); - const int2 interShape = {outShape.x, inShape.y}; - auto intermediateTensor = GetIntermediate(numSamples, interShape, numChannels); - RunPass(0, intermediateTensor, inTensorCpu, minInterpolation, magInterpolation, antialias); - RunPass(1, refTensorCpu, intermediateTensor, minInterpolation, magInterpolation, antialias); + int numSamples = inTensorCpu.numSamples(); + int numChannels = inTensorCpu.numChannels(); + const int2 inShape = inTensorCpu.shape(); + const int2 outShape = refTensorCpu.shape(); + const int2 interShape = {outShape.x, inShape.y}; + Roi<2> inRoi = inRoiArg.value_or(FullRoi<2>(inShape)); + Roi<2> outRoi = outRoiArg.value_or(FullRoi<2>(outShape)); + auto interRoi = Roi<2>(int2(outRoi.origin.x, inRoi.origin.y), int2(outRoi.shape.x, inRoi.shape.y)); + + auto intermediateTensor = GetIntermediate(numSamples, interShape, numChannels); + RunPass(0, intermediateTensor, inTensorCpu, minInterpolation, magInterpolation, antialias, interRoi); + RunPass(1, refTensorCpu, intermediateTensor, minInterpolation, magInterpolation, antialias, outRoi); } template void Resize(CpuSample &refTensorCpu, CpuSample &inTensorCpu, - const NVCVInterpolationType minInterpolation, const NVCVInterpolationType magInterpolation, bool antialias) + const NVCVInterpolationType minInterpolation, const NVCVInterpolationType magInterpolation, bool antialias, + std::optional> inRoiArg = std::nullopt, std::optional> outRoiArg = std::nullopt) { - int numSamples = inTensorCpu.numSamples(); - int numChannels = inTensorCpu.numChannels(); - const int3 inShape = inTensorCpu.shape(); - const int3 outShape = refTensorCpu.shape(); - const int3 interShape0 = {outShape.x, inShape.y, inShape.z}; - const int3 interShape1 = {outShape.x, outShape.y, inShape.z}; - auto intermediateTensor0 = GetIntermediate(numSamples, interShape0, numChannels); - RunPass(0, intermediateTensor0, inTensorCpu, minInterpolation, magInterpolation, antialias); + int numSamples = inTensorCpu.numSamples(); + int numChannels = inTensorCpu.numChannels(); + const int3 inShape = inTensorCpu.shape(); + const int3 outShape = refTensorCpu.shape(); + const int3 interShape0 = {outShape.x, inShape.y, inShape.z}; + const int3 interShape1 = {outShape.x, outShape.y, inShape.z}; + Roi<3> inRoi = 
inRoiArg.value_or(FullRoi<3>(inShape)); + Roi<3> outRoi = outRoiArg.value_or(FullRoi<3>(outShape)); + auto interRoi0 = Roi<3>{ + int3{outRoi.origin.x, inRoi.origin.y, inRoi.origin.z}, + int3{ outRoi.shape.x, inRoi.shape.y, inRoi.shape.z} + }; + auto interRoi1 = Roi<3>(int3{outRoi.origin.x, outRoi.origin.y, inRoi.origin.z}, + int3{outRoi.shape.x, outRoi.shape.y, inRoi.shape.z}); + + auto intermediateTensor0 = GetIntermediate(numSamples, interShape0, numChannels); + RunPass(0, intermediateTensor0, inTensorCpu, minInterpolation, magInterpolation, antialias, interRoi0); auto intermediateTensor1 = GetIntermediate(numSamples, interShape1, numChannels); - RunPass(1, intermediateTensor1, intermediateTensor0, minInterpolation, magInterpolation, antialias); - RunPass(2, refTensorCpu, intermediateTensor1, minInterpolation, magInterpolation, antialias); + RunPass(1, intermediateTensor1, intermediateTensor0, minInterpolation, magInterpolation, antialias, interRoi1); + RunPass(2, refTensorCpu, intermediateTensor1, minInterpolation, magInterpolation, antialias, outRoi); } template -void Compare(CpuSample &tensor, CpuSample &refTensor, bool antialias) +void Compare(CpuSample &tensor, CpuSample &refTensor, bool antialias, + std::optional> roi_arg = std::nullopt) { int numSamples = tensor.numSamples(); int numChannels = tensor.numChannels(); const auto shape = tensor.shape(); + const auto roi = roi_arg.value_or(FullRoi(shape)); ASSERT_EQ(numSamples, refTensor.numSamples()); ASSERT_EQ(numChannels, refTensor.numChannels()); ASSERT_EQ(shape, refTensor.shape()); @@ -499,33 +539,33 @@ void Compare(CpuSample &tensor, CpuSample &r int64_t vol = 0; for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) { - ForAll(shape, - [&](const cuda::MakeType idx) - { - for (int c = 0; c < numChannels; c++) - { - const BT val = tensor.get(sampleIdx, idx, c); - const BT refVal = refTensor.get(sampleIdx, idx, c); - err += abs(val - refVal); - vol += 1; - - if (std::is_integral_v) // uchar -> uchar, short -> short, ushort -> ushort - { - ASSERT_NEAR(val, refVal, (std::is_same_v ? 1 : 10)); // uchar : short, ushort - } - else // output type is float - { - if (!std::is_integral_v) // float -> float - { - ASSERT_NEAR(val, refVal, 1e-4); - } - else // [uchar, short, ushort] -> float - { - ASSERT_NEAR(val, refVal, (std::is_same_v ? 0.1 : 6)); - } - } - } - }); + ForAllInRoi(roi, + [&](const cuda::MakeType idx) + { + for (int c = 0; c < numChannels; c++) + { + const BT val = tensor.get(sampleIdx, idx, c); + const BT refVal = refTensor.get(sampleIdx, idx, c); + err += abs(val - refVal); + vol += 1; + + if (std::is_integral_v) // uchar -> uchar, short -> short, ushort -> ushort + { + ASSERT_NEAR(val, refVal, (std::is_same_v ? 1 : 10)); // uchar : short, ushort + } + else // output type is float + { + if (!std::is_integral_v) // float -> float + { + ASSERT_NEAR(val, refVal, 1e-4); + } + else // [uchar, short, ushort] -> float + { + ASSERT_NEAR(val, refVal, (std::is_same_v ? 0.1 : 6)); + } + } + } + }); } double mean_err = err / vol; ASSERT_LE(mean_err, antialias ? 
0.1 : 0.4); @@ -662,6 +702,18 @@ void TestTensor(bool antialias) nvcv::Tensor inTensor = CreateTensorHelper(inDtype, "NHWC", numSamples, inShape.y, inShape.x, numChannels); nvcv::Tensor outTensor = CreateTensorHelper(outDtype, "NHWC", numSamples, outShape.y, outShape.x, numChannels); + baseline::Roi<2> inRoi = baseline::FullRoi<2>(inShape); + baseline::Roi<2> outRoi = baseline::FullRoi<2>(outShape); + if (inShape.x * inShape.y > 1 << 23) + { + inRoi.shape = cuda::min(inShape, int2{1 << 12, 1 << 11}); + inRoi.origin = inShape - inRoi.shape; + + double2 scale = cuda::StaticCast(outShape) / cuda::StaticCast(inShape); + outRoi.shape = cuda::StaticCast(scale * cuda::StaticCast(inRoi.shape)); + outRoi.origin = cuda::StaticCast(scale * cuda::StaticCast(inRoi.origin)); + } + auto inData = inTensor.exportData(); auto outData = outTensor.exportData(); ASSERT_TRUE(inData && outData); @@ -690,16 +742,14 @@ void TestTensor(bool antialias) for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) { - for (int y = 0; y < inShape.y; y++) - { - for (int x = 0; x < inShape.x; x++) - { - for (int c = 0; c < numChannels; c++) - { - inTensorCpu.get(sampleIdx, int2{x, y}, c) = rand(rng); - } - } - } + baseline::ForAllInRoi(inRoi, + [&](int2 idx) + { + for (int c = 0; c < numChannels; c++) + { + inTensorCpu.get(sampleIdx, idx, c) = rand(rng); + } + }); } cvcuda::HQResize op; @@ -725,9 +775,9 @@ void TestTensor(bool antialias) ASSERT_NO_THROW(op(stream, ws.get(), inTensor, outTensor, interpolation, interpolation, antialias)); ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(outTensorCpu.data(), outData->basePtr(), outStrides.z * numSamples, cudaMemcpyDeviceToHost, stream)); - baseline::Resize(refTensorCpu, inTensorCpu, interpolation, interpolation, antialias); + baseline::Resize(refTensorCpu, inTensorCpu, interpolation, interpolation, antialias, {inRoi}, {outRoi}); ASSERT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); - baseline::Compare(outTensorCpu, refTensorCpu, antialias); + baseline::Compare(outTensorCpu, refTensorCpu, antialias, {outRoi}); ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); } @@ -752,7 +802,7 @@ NVCV_TYPED_TEST_SUITE( NVCV_TEST_ROW(3, NVCV_SHAPE3D(100, 100, 100), NVCV_SHAPE3D(100, 100, 50), 3, float, float, NVCV_INTERP_CUBIC), NVCV_TEST_ROW(4, NVCV_SHAPE3D(40, 40, 40), NVCV_SHAPE3D(100, 40, 40), 5, uchar, float, NVCV_INTERP_LANCZOS), NVCV_TEST_ROW(7, NVCV_SHAPE3D(40, 40, 40), NVCV_SHAPE3D(50, 150, 100), 3, uchar, uchar, NVCV_INTERP_CUBIC), - NVCV_TEST_ROW(3, NVCV_SHAPE3D(1 << 10, 1 << 9, 1 << 9), NVCV_SHAPE3D(50, 150, 100), 3, uchar, uchar, + NVCV_TEST_ROW(3, NVCV_SHAPE3D(1 << 10, 1 << 9, 1 << 9), NVCV_SHAPE3D(100, 150, 100), 3, uchar, uchar, NVCV_INTERP_CUBIC)>); TYPED_TEST(OpHQResizeTensor3D, correct_output_with_antialias) @@ -773,6 +823,18 @@ TYPED_TEST(OpHQResizeTensor3D, correct_output_with_antialias) nvcv::Tensor outTensor = CreateTensorHelper(outDtype, "NDHWC", numSamples, outShape.z, outShape.y, outShape.x, numChannels); + baseline::Roi<3> inRoi = baseline::FullRoi<3>(inShape); + baseline::Roi<3> outRoi = baseline::FullRoi<3>(outShape); + if (inShape.x * inShape.y * inShape.z > 1 << 22) + { + inRoi.shape = cuda::min(inShape, int3{1 << 8, 1 << 7, 1 << 7}); + inRoi.origin = inShape - inRoi.shape; + + double3 scale = cuda::StaticCast(outShape) / cuda::StaticCast(inShape); + outRoi.shape = cuda::StaticCast(scale * cuda::StaticCast(inRoi.shape)); + outRoi.origin = cuda::StaticCast(scale * cuda::StaticCast(inRoi.origin)); + } + auto inData = inTensor.exportData(); auto outData = 
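
For the largest test shapes, the updated tests above restrict the expensive CPU baseline to a region of interest and map that region into output space using the resize scale factors. A small worked sketch of that mapping (illustrative struct, not the test's Roi type):

    // Map an input-space ROI to the corresponding output-space ROI via the resize
    // scale factors, mirroring the large-tensor paths in the tests above.
    struct Roi2D
    {
        int originX, originY;
        int width, height;
    };

    Roi2D MapRoiToOutput(const Roi2D &inRoi, int inW, int inH, int outW, int outH)
    {
        const double sx = static_cast<double>(outW) / inW;
        const double sy = static_cast<double>(outH) / inH;
        return Roi2D{static_cast<int>(inRoi.originX * sx), static_cast<int>(inRoi.originY * sy),
                     static_cast<int>(inRoi.width * sx), static_cast<int>(inRoi.height * sy)};
    }

    // Example: an 8192x4096 input resized to 1024x512 (scale 0.125 on both axes) with an
    // input ROI of 4096x2048 at origin (4096, 2048) yields an output ROI of 512x256 at (512, 256).
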
outTensor.exportData(); ASSERT_TRUE(inData && outData); @@ -802,19 +864,14 @@ TYPED_TEST(OpHQResizeTensor3D, correct_output_with_antialias) for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) { - for (int z = 0; z < inShape.z; z++) - { - for (int y = 0; y < inShape.y; y++) - { - for (int x = 0; x < inShape.x; x++) - { - for (int c = 0; c < numChannels; c++) - { - inTensorCpu.get(sampleIdx, int3{x, y, z}, c) = rand(rng); - } - } - } - } + baseline::ForAllInRoi(inRoi, + [&](int3 idx) + { + for (int c = 0; c < numChannels; c++) + { + inTensorCpu.get(sampleIdx, idx, c) = rand(rng); + } + }); } cvcuda::HQResize op; @@ -840,9 +897,9 @@ TYPED_TEST(OpHQResizeTensor3D, correct_output_with_antialias) ASSERT_NO_THROW(op(stream, ws.get(), inTensor, outTensor, interpolation, interpolation, antialias)); ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(outTensorCpu.data(), outData->basePtr(), outStrides.w * numSamples, cudaMemcpyDeviceToHost, stream)); - baseline::Resize(refTensorCpu, inTensorCpu, interpolation, interpolation, antialias); + baseline::Resize(refTensorCpu, inTensorCpu, interpolation, interpolation, antialias, {inRoi}, {outRoi}); ASSERT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); - baseline::Compare(outTensorCpu, refTensorCpu, antialias); + baseline::Compare(outTensorCpu, refTensorCpu, antialias, {outRoi}); ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); } @@ -906,6 +963,28 @@ TYPED_TEST(OpHQResizeBatch, tensor_batch_2d_correct_output) {{512, 512}, 2, inShapes[4].numChannels} }; + std::vector> inRois(numSamples); + std::vector> outRois(numSamples); + + for (int s = 0; s < numSamples; s++) + { + int2 inShape{inShapes[s].extent[1], inShapes[s].extent[0]}; + int2 outShape{outShapes[s].extent[1], outShapes[s].extent[0]}; + + inRois[s] = baseline::FullRoi<2>(inShape); + outRois[s] = baseline::FullRoi<2>(outShape); + + if (inShape.x * inShape.y > 1 << 23) + { + inRois[s].shape = cuda::min(inShape, int2{1 << 12, 1 << 11}); + inRois[s].origin = inShape - inRois[s].shape; + + double2 scale = cuda::StaticCast(outShape) / cuda::StaticCast(inShape); + outRois[s].shape = cuda::StaticCast(scale * cuda::StaticCast(inRois[s].shape)); + outRois[s].origin = cuda::StaticCast(scale * cuda::StaticCast(inRois[s].origin)); + } + } + ASSERT_EQ(numSamples, inShapes.size()); ASSERT_EQ(numSamples, outShapes.size()); @@ -969,17 +1048,17 @@ TYPED_TEST(OpHQResizeBatch, tensor_batch_2d_correct_output) refBatchCpu.push_back( baseline::CpuSample{outStrides.z, outStrides, 1, outShape, outAccess->numChannels()}); - auto &inTensorCpu = inBatchCpu[sampleIdx]; - for (int y = 0; y < inShape.y; y++) - { - for (int x = 0; x < inShape.x; x++) - { - for (int c = 0; c < inShapes[sampleIdx].numChannels; c++) - { - inTensorCpu.get(0, int2{x, y}, c) = rand(rng); - } - } - } + const auto &inRoi = inRois[sampleIdx]; + auto &inTensorCpu = inBatchCpu[sampleIdx]; + baseline::ForAllInRoi(inRoi, + [&](int2 idx) + { + for (int c = 0; c < inShapes[sampleIdx].numChannels; c++) + { + inTensorCpu.get(0, idx, c) = rand(rng); + } + }); + ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(inData->basePtr(), inTensorCpu.data(), inStrides.z, cudaMemcpyHostToDevice, stream)); } @@ -1008,8 +1087,9 @@ TYPED_TEST(OpHQResizeBatch, tensor_batch_2d_correct_output) for (int sampleIdx = 0; sampleIdx < numSamples; sampleIdx++) { SCOPED_TRACE(sampleIdx); - baseline::Resize(refBatchCpu[sampleIdx], inBatchCpu[sampleIdx], minInterpolation, magInterpolation, antialias); - baseline::Compare(outBatchCpu[sampleIdx], refBatchCpu[sampleIdx], antialias); + 
baseline::Resize(refBatchCpu[sampleIdx], inBatchCpu[sampleIdx], minInterpolation, magInterpolation, antialias, + {inRois[sampleIdx]}, {outRois[sampleIdx]}); + baseline::Compare(outBatchCpu[sampleIdx], refBatchCpu[sampleIdx], antialias, {outRois[sampleIdx]}); } ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); } diff --git a/tests/cvcuda/system/TestOpOSD.cpp b/tests/cvcuda/system/TestOpOSD.cpp index 620a8448b..3b5785562 100644 --- a/tests/cvcuda/system/TestOpOSD.cpp +++ b/tests/cvcuda/system/TestOpOSD.cpp @@ -419,6 +419,7 @@ NVCV_TEST_SUITE_P(OpOSD, test::ValueList>> elementVec; + + std::vector testStrings{ + // valid + "Hello!", "\u00E9", "\u20AC", "\U0001F600", + // invalid + "\xC0\x80", "\xc2\x00", // second bytes + "\xde\x82\xa0", "\xe2\x00\xa0", "\xe2\x82\x00", "\xe0\x9f\xa0", + "\xed\xbf\xa0", // three bytes + "\xf4\x90\x84\x9e", "\xf0\x9d\x04\x9e", "\xf0\x80\x84\x9e", "\xf4\xbf\x84\x9e", "\xf0\x9d\x84\x0e", + "\xf5\xc0\x84\x9e", "\xf3\xc0\x84\x9e" // four bytes + }; + + std::vector> textVec; + for (auto testStr : testStrings) + { + std::shared_ptr element; + NVCVText text = NVCVText(testStr.c_str(), 5 * randl(1, 10), DEFAULT_OSD_FONT, + NVCVPointI({randl(0, inW - 1), randl(0, inH - 1)}), + NVCVColorRGBA({(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)}), + NVCVColorRGBA({(unsigned char)randl(0, 255), (unsigned char)randl(0, 255), + (unsigned char)randl(0, 255), (unsigned char)randl(0, 255)})); + element = std::make_shared(type, &text); + textVec.push_back(element); + } + + elementVec.push_back(textVec); + + std::shared_ptr ctx = std::make_shared(elementVec); + + nvcv::Tensor imgIn = nvcv::util::CreateTensor(inN, inW, inH, format); + nvcv::Tensor imgOut = nvcv::util::CreateTensor(inN, inW, inH, format); + + auto input = imgIn.exportData(); + auto output = imgOut.exportData(); + + ASSERT_NE(input, nullptr); + ASSERT_NE(output, nullptr); + + auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*input); + ASSERT_TRUE(inAccess); + + auto outAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*output); + ASSERT_TRUE(outAccess); + + long inSampleStride = inAccess->numRows() * inAccess->rowStride(); + long outSampleStride = outAccess->numRows() * outAccess->rowStride(); + + int inBufSize = inSampleStride * inAccess->numSamples(); + int outBufSize = outSampleStride * outAccess->numSamples(); + + EXPECT_EQ(cudaSuccess, cudaMemset(input->basePtr(), 0xFF, inSampleStride * inAccess->numSamples())); + EXPECT_EQ(cudaSuccess, cudaMemset(output->basePtr(), 0xFF, outSampleStride * outAccess->numSamples())); + + EXPECT_NO_THROW(op(stream, imgIn, imgOut, (NVCVElements)ctx.get())); + + // check cdata + std::vector test(outBufSize); + std::vector testIn(inBufSize); + + EXPECT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + EXPECT_EQ(cudaSuccess, cudaMemcpy(testIn.data(), input->basePtr(), inBufSize, cudaMemcpyDeviceToHost)); + EXPECT_EQ(cudaSuccess, cudaMemcpy(test.data(), output->basePtr(), outBufSize, cudaMemcpyDeviceToHost)); + + std::vector gold(outBufSize); + setGoldBuffer(gold, format, *inAccess, input->basePtr(), ctx, stream); + + EXPECT_EQ(gold, test); + + EXPECT_EQ(cudaSuccess, cudaStreamDestroy(stream)); +} diff --git a/tests/nvcv_types/CMakeLists.txt b/tests/nvcv_types/CMakeLists.txt index 3a0221a21..566f2d088 100644 --- a/tests/nvcv_types/CMakeLists.txt +++ b/tests/nvcv_types/CMakeLists.txt @@ -44,19 +44,27 @@ include(ExternalProject) file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/../../src/nvcv DESTINATION 
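
The new OSD test above feeds the operator a mix of well-formed and malformed UTF-8 strings (stray or bad continuation bytes, overlong encodings, surrogates, code points beyond U+10FFFF). For reference, a minimal standalone checker that classifies those byte sequences the same way (illustrative, not the operator's implementation):

    #include <cstdint>
    #include <string>

    // Minimal well-formedness check for UTF-8 (RFC 3629): rejects stray continuation
    // bytes, overlong encodings, UTF-16 surrogates and code points above U+10FFFF --
    // the classes of malformed input listed in the test strings above.
    bool IsValidUtf8(const std::string &s)
    {
        const auto *p   = reinterpret_cast<const unsigned char *>(s.data());
        const auto *end = p + s.size();
        while (p < end)
        {
            const unsigned char lead = *p;
            if (lead < 0x80) // ASCII
            {
                ++p;
                continue;
            }
            int      len;
            uint32_t cp, minCp;
            if ((lead & 0xE0) == 0xC0)      { len = 2; cp = lead & 0x1F; minCp = 0x80; }
            else if ((lead & 0xF0) == 0xE0) { len = 3; cp = lead & 0x0F; minCp = 0x800; }
            else if ((lead & 0xF8) == 0xF0) { len = 4; cp = lead & 0x07; minCp = 0x10000; }
            else return false;               // stray continuation byte or invalid lead byte
            if (end - p < len) return false; // truncated sequence
            for (int i = 1; i < len; ++i)
            {
                if ((p[i] & 0xC0) != 0x80) return false; // e.g. "\xc2\x00": bad continuation byte
                cp = (cp << 6) | (p[i] & 0x3F);
            }
            if (cp < minCp) return false;                   // e.g. "\xC0\x80": overlong encoding
            if (cp >= 0xD800 && cp <= 0xDFFF) return false; // e.g. "\xed\xbf\xa0": surrogate
            if (cp > 0x10FFFF) return false;                // e.g. "\xf4\x90\x84\x9e": out of range
            p += len;
        }
        return true;
    }
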
${CMAKE_CURRENT_BINARY_DIR}/standalone/) +set(NVCV_STANDALONE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/standalone) + +set(NVCV_STANDALONE_CMAKE_ARGS + -DCMAKE_RUNTIME_OUTPUT_DIRECTORY=${CMAKE_RUNTIME_OUTPUT_DIRECTORY} + -DNVCV_DIR=${CMAKE_CURRENT_BINARY_DIR}/standalone/nvcv + -DNVCV_ENABLE_INSTALL=OFF + -DEXPOSE_CODE=OFF + -DWARNINGS_AS_ERRORS=${WARNINGS_AS_ERRORS} + -DENABLE_TEGRA=${ENABLE_TEGRA} + -DENABLE_COMPAT_OLD_GLIBC=${ENABLE_COMPAT_OLD_GLIBC} +) + ExternalProject_Add( nvcv_standalone - SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/standalone - BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/standalone + SOURCE_DIR ${NVCV_STANDALONE_SOURCE_DIR} + PREFIX ${CMAKE_CURRENT_BINARY_DIR}/standalone/shared INSTALL_COMMAND "" CMAKE_ARGS - -DCMAKE_RUNTIME_OUTPUT_DIRECTORY=${CMAKE_RUNTIME_OUTPUT_DIRECTORY} - -DNVCV_DIR=${CMAKE_CURRENT_BINARY_DIR}/standalone/nvcv - -DNVCV_ENABLE_INSTALL=OFF - -DEXPOSE_CODE=OFF - -DWARNINGS_AS_ERRORS=${WARNINGS_AS_ERRORS} - -DENABLE_TEGRA=${ENABLE_TEGRA} - -DENABLE_COMPAT_OLD_GLIBC=${ENABLE_COMPAT_OLD_GLIBC} + ${NVCV_STANDALONE_CMAKE_ARGS} + -DNVCV_BUILD_SHARED_LIBS=ON + -DNVCV_EXE=nvcv_test_standalone ) add_executable(nvcv_test_standalone IMPORTED) @@ -66,3 +74,24 @@ set_target_properties(nvcv_test_standalone PROPERTIES add_dependencies(nvcv_test_standalone nvcv_standalone) nvcv_add_test(${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/nvcv_test_standalone nvcv) + +# Test NVCV can be build statically via the standalone test + +ExternalProject_Add( + nvcv_standalone_static + SOURCE_DIR ${NVCV_STANDALONE_SOURCE_DIR} + PREFIX ${CMAKE_CURRENT_BINARY_DIR}/standalone/static + INSTALL_COMMAND "" + CMAKE_ARGS + ${NVCV_STANDALONE_CMAKE_ARGS} + -DNVCV_BUILD_SHARED_LIBS=OFF + -DNVCV_EXE=nvcv_test_standalone_static +) + +add_executable(nvcv_test_standalone_static IMPORTED) +set_target_properties(nvcv_test_standalone_static PROPERTIES + IMPORTED_LOCATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/nvcv_test_standalone_static +) +add_dependencies(nvcv_test_standalone_static nvcv_standalone_static) + +nvcv_add_test(${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/nvcv_test_standalone_static nvcv) diff --git a/tests/nvcv_types/cudatools_unit/TestLegacyHelpers.cpp b/tests/nvcv_types/cudatools_unit/TestLegacyHelpers.cpp index 93b350682..87cd1af4f 100644 --- a/tests/nvcv_types/cudatools_unit/TestLegacyHelpers.cpp +++ b/tests/nvcv_types/cudatools_unit/TestLegacyHelpers.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: Apache-2.0 * * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual @@ -64,6 +64,7 @@ NVCV_TEST_SUITE_P(CheckLegacyFormatHelpersInvalid, test::ValueList(); @@ -87,6 +88,7 @@ NVCV_TEST_SUITE_P(CheckLegacyHelpersDataType, test::ValueList(); @@ -95,3 +97,136 @@ TEST_P(CheckLegacyHelpersDataType, check_conversion_to_legacy_data_type) EXPECT_EQ(expect, helpers::GetLegacyDataType(bpp, kind)); } + +// clang-format off +NVCV_TEST_SUITE_P(CheckLegacyHelpersDataTypeInvalid, test::ValueList +{ + {8, nvcv::DataKind::FLOAT}, + {64, nvcv::DataKind::SIGNED}, + {64, nvcv::DataKind::UNSIGNED}, + {32, nvcv::DataKind::COMPLEX}, + {32, nvcv::DataKind::UNSPECIFIED} +}); + +// clang-format on + +TEST_P(CheckLegacyHelpersDataTypeInvalid, check_conversion_to_legacy_data_type_invalid) +{ + int32_t bpp = GetParamValue<0>(); + nvcv::DataKind kind = GetParamValue<1>(); + EXPECT_THROW(helpers::GetLegacyDataType(bpp, kind), nvcv::Exception); +} + +TEST(CheckLegacyHelpersDataFormat, check_image_batch_invalid_different_fmt) +{ + nvcv::ImageBatchVarShape imgBatch(2); + imgBatch.pushBack(nvcv::Image{ + nvcv::Size2D{24, 24}, + nvcv::FMT_NV12 + }); + imgBatch.pushBack(nvcv::Image{ + nvcv::Size2D{24, 24}, + nvcv::FMT_U8 + }); + EXPECT_THROW(helpers::GetLegacyDataFormat(imgBatch), nvcv::Exception); +} + +// clang-format off +NVCV_TEST_SUITE_P(CheckLegacyHelpersDataFormat, test::ValueList +{ + {legOp::DataFormat::kNCHW, 2, nvcv::FMT_RGB8p}, + {legOp::DataFormat::kCHW, 1, nvcv::FMT_RGB8p}, + {legOp::DataFormat::kNHWC, 2, nvcv::FMT_RGB8}, + {legOp::DataFormat::kHWC, 1, nvcv::FMT_RGB8}, +}); + +// clang-format on + +TEST_P(CheckLegacyHelpersDataFormat, check_image_batch_conversion) +{ + legOp::DataFormat expect = GetParamValue<0>(); + int32_t batchSize = GetParamValue<1>(); + nvcv::ImageFormat fmt = GetParamValue<2>(); + + nvcv::ImageBatchVarShape imgBatch(batchSize); + for (auto i = 0; i < batchSize; ++i) + { + imgBatch.pushBack(nvcv::Image{ + nvcv::Size2D{24, 24}, + fmt + }); + } + + EXPECT_EQ(helpers::GetLegacyDataFormat(imgBatch), expect); +} + +TEST_P(CheckLegacyHelpersDataFormat, check_image_batch_conversion_exported) +{ + legOp::DataFormat expect = GetParamValue<0>(); + int32_t batchSize = GetParamValue<1>(); + nvcv::ImageFormat fmt = GetParamValue<2>(); + + cudaStream_t stream; + ASSERT_EQ(cudaSuccess, cudaStreamCreate(&stream)); + + nvcv::ImageBatchVarShape imgBatch(batchSize); + for (auto i = 0; i < batchSize; ++i) + { + imgBatch.pushBack(nvcv::Image{ + nvcv::Size2D{24, 24}, + fmt + }); + } + + auto exportedData = imgBatch.exportData(stream); + + EXPECT_EQ(helpers::GetLegacyDataFormat(exportedData.value()), expect); + + ASSERT_EQ(cudaSuccess, cudaStreamSynchronize(stream)); + ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); +} + +// clang-format off +NVCV_TEST_SUITE_P(CheckLegacyTranslateError, test::ValueList +{ + {NVCV_SUCCESS, legOp::ErrorCode::SUCCESS}, + {NVCV_ERROR_INVALID_ARGUMENT, legOp::ErrorCode::INVALID_PARAMETER}, + {NVCV_ERROR_INVALID_ARGUMENT, legOp::ErrorCode::INVALID_DATA_FORMAT}, + {NVCV_ERROR_INVALID_ARGUMENT, legOp::ErrorCode::INVALID_DATA_SHAPE}, + {NVCV_ERROR_INVALID_ARGUMENT, legOp::ErrorCode::INVALID_DATA_TYPE} +}); + +// clang-format on + +TEST_P(CheckLegacyTranslateError, check_error_conversion) +{ + NVCVStatus expect = GetParamValue<0>(); + legOp::ErrorCode err = GetParamValue<1>(); + EXPECT_EQ(nvcv::util::TranslateError(err), expect); +} + +// clang-format off +NVCV_TEST_SUITE_P(CheckLegacyToString, test::ValueList +{ + 
{legOp::ErrorCode::SUCCESS, "SUCCESS", "Operation executed successfully"}, + {legOp::ErrorCode::INVALID_PARAMETER, "INVALID_PARAMETER", "Some parameter is outside its acceptable range"}, + {legOp::ErrorCode::INVALID_DATA_FORMAT, "INVALID_DATA_FORMAT", "Data format is outside its acceptable range"}, + {legOp::ErrorCode::INVALID_DATA_SHAPE, "INVALID_DATA_SHAPE", "Tensor shape is outside its acceptable range"}, + {legOp::ErrorCode::INVALID_DATA_TYPE, "INVALID_DATA_TYPE", "Data type is outside its acceptable range"} +}); + +// clang-format on + +TEST_P(CheckLegacyToString, check_error_to_string_conversion) +{ + legOp::ErrorCode err = GetParamValue<0>(); + std::string expectedErrorName = GetParamValue<1>(); + std::string expectedDescr = GetParamValue<2>(); + + char bufferDesc[256]; + const char *bufferDescPtr = bufferDesc; + const char *buffer = nvcv::util::ToString(err, &bufferDescPtr); + + EXPECT_STREQ(bufferDescPtr, expectedDescr.c_str()); + EXPECT_STREQ(buffer, expectedErrorName.c_str()); +} diff --git a/tests/nvcv_types/standalone/CMakeLists.txt b/tests/nvcv_types/standalone/CMakeLists.txt index cee910fc0..8fb087cdd 100644 --- a/tests/nvcv_types/standalone/CMakeLists.txt +++ b/tests/nvcv_types/standalone/CMakeLists.txt @@ -29,11 +29,15 @@ endif() add_subdirectory(${NVCV_DIR} nvcv_dir) -add_executable(nvcv_test_standalone +if(NOT NVCV_EXE) + message(FATAL_ERROR "NVCV_EXE is empty! NVCV executable must be given.") +endif() + +add_executable(${NVCV_EXE} TestNVCVStandalone.cpp ) -target_link_libraries(nvcv_test_standalone +target_link_libraries(${NVCV_EXE} PUBLIC nvcv_types GTest::gtest_main
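
The CheckLegacyHelpersDataFormat rows above pair planar image formats with CHW layouts and interleaved formats with HWC layouts, adding the batch (N) dimension when more than one image is present. A compact sketch of that mapping with a simplified signature (the real helper inspects the image batch or exported data directly):

    enum class DataFormat { kNCHW, kNHWC, kCHW, kHWC };

    // Simplified: planar pixel layouts map to CHW, interleaved layouts to HWC;
    // a leading batch dimension is added when the batch holds more than one image.
    DataFormat GetLegacyDataFormatSketch(bool isPlanar, int numImages)
    {
        if (isPlanar)
        {
            return numImages > 1 ? DataFormat::kNCHW : DataFormat::kCHW;
        }
        return numImages > 1 ? DataFormat::kNHWC : DataFormat::kHWC;
    }
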