11 changes: 0 additions & 11 deletions src/deepsparse/debug_analysis.py
@@ -141,12 +141,6 @@ def parse_args():
type=str,
default="",
)
parser.add_argument(
"--disable-batch-override",
help="Ignores the batch_size parameter",
action="store_true",
default=False,
)
parser.add_argument(
"--use-kvcache", help="Enable KVCache", action="store_true", default=False
)
@@ -316,10 +310,6 @@ def main():
print("Analyzing model: {}".format(orig_model_path))

batch_size = args.batch_size
if args.disable_batch_override:
batch_size = None
os.environ["NM_DISABLE_BATCH_OVERRIDE"] = "1"
print("Disable batch override: ON")

if input_shapes:
with override_onnx_input_shapes(model_path, input_shapes) as tmp_path:
@@ -357,7 +347,6 @@ def main():
num_iterations=args.num_iterations,
num_warmup_iterations=args.num_warmup_iterations,
optimization_level=int(args.optimization),
disable_batch_override=args.disable_batch_override,
imposed_ks=imposed_kernel_sparsity,
input_shapes=input_shapes,
kv_cache_params=kv_cache_params,
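Note on migration: with --disable-batch-override removed from the CLI, disabling the override is now driven entirely by the batch_size value, as wired up in engine.py below. A minimal sketch of the migration for API callers, assuming model_debug_analysis is re-exported at the package root like the other engine helpers, and that model.onnx is a valid local model:

import numpy
from deepsparse import model_debug_analysis

# Dummy input; the leading (batch) dimension is no longer checked once the
# batch override is disabled.
inp = [numpy.random.rand(1, 3, 224, 224).astype(numpy.float32)]

# Before this PR: model_debug_analysis(..., disable_batch_override=True)
# After this PR:  pass batch_size=None (or any value < 1) instead.
results = model_debug_analysis("model.onnx", inp, batch_size=None)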
130 changes: 72 additions & 58 deletions src/deepsparse/engine.py
@@ -17,12 +17,12 @@
"""

import logging
import os
import time
from enum import Enum
from typing import Dict, Iterable, List, Optional, Tuple, Union

import numpy
import onnx
from tqdm.auto import tqdm

from deepsparse.analytics import deepsparse_analytics as _analytics
@@ -105,9 +105,11 @@ def from_str(key: str):
raise ValueError(f"unsupported Scheduler: {key}")


def _validate_batch_size(batch_size: int) -> int:
if batch_size < 1:
raise ValueError("batch_size must be greater than 0")
def _validate_batch_size(batch_size: Optional[int]) -> Optional[int]:
if batch_size is None or batch_size < 1:
_LOGGER.warning("batch_size is None or < 1, disabling batch size override")
os.environ["NM_DISABLE_BATCH_OVERRIDE"] = "1"
return None

return batch_size
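
The new contract of _validate_batch_size: a positive batch_size passes through unchanged, while None or anything below 1 sets NM_DISABLE_BATCH_OVERRIDE and normalizes to None. A quick sketch of the expected behavior (calling the private helper directly is for illustration only):

import os
from deepsparse.engine import _validate_batch_size

assert _validate_batch_size(8) == 8     # positive sizes pass through unchanged
assert _validate_batch_size(0) is None  # < 1 (or None) disables the override...
assert os.environ["NM_DISABLE_BATCH_OVERRIDE"] == "1"  # ...via this env var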

@@ -225,12 +227,11 @@ class BaseEngine(object):
def construct(
self,
model: Union[str, "Model", "File"],
batch_size: int = 1,
num_cores: int = None,
num_streams: int = None,
scheduler: Scheduler = None,
input_shapes: List[List[int]] = None,
disable_batch_override: bool = False,
batch_size: Optional[int] = 1,
num_cores: Optional[int] = None,
num_streams: Optional[int] = None,
scheduler: Optional[Scheduler] = None,
input_shapes: Optional[List[List[int]]] = None,
kv_cache_params: Optional[KVCacheParams] = None,
):
_analytics.send_event("python__engine__init")
@@ -240,18 +241,16 @@ def construct(
self._num_streams = _validate_num_streams(num_streams, self._num_cores)
self._scheduler = _validate_scheduler(scheduler)
self._input_shapes = input_shapes
self._disable_batch_override = disable_batch_override
self._kv_cache_params = kv_cache_params
self._cpu_avx_type = AVX_TYPE
self._cpu_vnni = VNNI

def construct_with_context(
self,
model: Union[str, "Model", "File"],
batch_size: int,
batch_size: Optional[int],
context: Context,
input_shapes: List[List[int]] = None,
disable_batch_override: bool = False,
input_shapes: Optional[List[List[int]]] = None,
kv_cache_params: Optional[KVCacheParams] = None,
):
_analytics.send_event("python__engine__init")
@@ -261,7 +260,6 @@ def construct_with_context(
self._num_streams = context.num_streams
self._scheduler = _validate_scheduler(context.scheduler)
self._input_shapes = input_shapes
self._disable_batch_override = disable_batch_override
self._kv_cache_params = kv_cache_params
self._cpu_avx_type = AVX_TYPE
self._cpu_vnni = VNNI
@@ -297,24 +295,28 @@ class Engine(BaseEngine):
def __init__(
self,
model: Union[str, "Model", "File"],
batch_size: int = 1,
batch_size: Optional[int] = 1,
num_cores: int = None,
num_streams: int = None,
scheduler: Scheduler = None,
input_shapes: List[List[int]] = None,
cached_outputs: List[bool] = None,
input_shapes: Optional[List[List[int]]] = None,
cached_outputs: Optional[List[bool]] = None,
):
BaseEngine.construct(
self, model, batch_size, num_cores, num_streams, scheduler, input_shapes
)

# self._batch_size is allowed to be None to disable setting a batch size,
# but the engine needs to be passed an integer. The value is arbitrary and ignored
engine_batch_size = self._batch_size if self._batch_size else 1

if self._input_shapes:
with override_onnx_input_shapes(
self._model_path, self._input_shapes
) as model_path:
self._eng_net = LIB.deepsparse_engine(
model_path,
self._batch_size,
engine_batch_size,
self._num_cores,
self._num_streams,
self._scheduler.value,
@@ -324,14 +326,17 @@ def __init__(
else:
self._eng_net = LIB.deepsparse_engine(
self._model_path,
self._batch_size,
engine_batch_size,
self._num_cores,
self._num_streams,
self._scheduler.value,
None,
cached_outputs,
)

if self._batch_size is None:
os.environ.pop("NM_DISABLE_BATCH_OVERRIDE", None)

def __call__(
self, inp: List[numpy.ndarray], val_inp: bool = True
) -> List[numpy.ndarray]:
@@ -704,14 +709,13 @@ def _validate_inputs(self, inp: List[numpy.ndarray]):
raise ValueError("inp must be a list, given {}".format(type(inp)))

for arr in inp:
if not self._disable_batch_override:
if arr.shape[0] != self._batch_size:
raise ValueError(
(
"array batch size of {} must match the batch size "
"the model was instantiated with {}"
).format(arr.shape[0], self._batch_size)
)
if self._batch_size and arr.shape[0] != self._batch_size:
raise ValueError(
(
"array batch size of {} must match the batch size "
"the model was instantiated with {}"
).format(arr.shape[0], self._batch_size)
)

if not arr.flags["C_CONTIGUOUS"]:
raise ValueError(
@@ -767,14 +771,13 @@ class DebugAnalysisEngine(Engine):
def __init__(
self,
model: Union[str, "Model", "File"],
batch_size: int = 1,
num_cores: int = None,
scheduler: Scheduler = None,
batch_size: Optional[int] = 1,
num_cores: Optional[int] = None,
scheduler: Optional[Scheduler] = None,
input_shapes: List[List[int]] = None,
num_iterations: int = 20,
num_warmup_iterations: int = 5,
optimization_level: int = 1,
disable_batch_override: bool = False,
imposed_as: Optional[float] = None,
imposed_ks: Optional[float] = None,
kv_cache_params: Optional[KVCacheParams] = None,
@@ -787,12 +790,15 @@ def __init__(
None,
scheduler,
input_shapes,
disable_batch_override,
kv_cache_params,
)

# Helper
def make_engine(self, model_path):
# self._batch_size is allowed to be None to disable setting a batch size,
# but the engine needs to be passed an integer. The value is arbitrary and ignored
engine_batch_size = self._batch_size if self._batch_size else 1

if self._kv_cache_params:
self._kv_cache = LIB.kv_cache(
self._kv_cache_params.prev_num_tokens,
@@ -801,7 +807,7 @@ def make_engine(self, model_path):

self._eng_net = LIB.deepsparse_engine(
model_path,
self._batch_size,
engine_batch_size,
self._num_cores,
self._num_streams,
self._scheduler.value,
@@ -819,7 +825,7 @@ def make_engine(self, model_path):

self._eng_net = LIB.deepsparse_engine(
model_path,
self._batch_size,
engine_batch_size,
self._num_cores,
self._num_streams,
self._scheduler.value,
@@ -840,6 +846,9 @@ def make_engine(self, model_path):
else:
make_engine(self, self._model_path)

if self._batch_size is None:
os.environ.pop("NM_DISABLE_BATCH_OVERRIDE", None)

def analyze(
self, inp: List[numpy.ndarray], val_inp: bool = True
) -> List[numpy.ndarray]:
@@ -887,22 +896,26 @@ class MultiModelEngine(Engine):
def __init__(
self,
model: Union[str, "Model", "File"],
batch_size: int,
batch_size: Optional[int],
context: Context,
input_shapes: List[List[int]] = None,
cached_outputs: List[bool] = None,
input_shapes: Optional[List[List[int]]] = None,
cached_outputs: Optional[List[bool]] = None,
):
BaseEngine.construct_with_context(
self, model, batch_size, context, input_shapes
)

# self._batch_size is allowed to be None to disable setting a batch size,
# but the engine needs to be passed an integer. The value is arbitrary and ignored
engine_batch_size = self._batch_size if self._batch_size else 1

if self._input_shapes:
with override_onnx_input_shapes(
self._model_path, self._input_shapes
) as model_path:
self._eng_net = LIB.deepsparse_engine(
model_path,
self._batch_size,
engine_batch_size,
self._num_cores,
self._num_streams,
self._scheduler.value,
@@ -912,22 +925,25 @@ def __init__(
else:
self._eng_net = LIB.deepsparse_engine(
self._model_path,
self._batch_size,
engine_batch_size,
self._num_cores,
self._num_streams,
self._scheduler.value,
context.value,
cached_outputs,
)

if self._batch_size is None:
os.environ.pop("NM_DISABLE_BATCH_OVERRIDE", None)


def compile_model(
model: Union[str, "Model", "File"],
batch_size: int = 1,
num_cores: int = None,
num_streams: int = None,
scheduler: Scheduler = None,
input_shapes: List[List[int]] = None,
batch_size: Optional[int] = 1,
num_cores: Optional[int] = None,
num_streams: Optional[int] = None,
scheduler: Optional[Scheduler] = None,
input_shapes: Optional[List[List[int]]] = None,
) -> Engine:
"""
Convenience function to compile a model in the DeepSparse Engine
@@ -962,16 +978,16 @@ def compile_model(
def benchmark_model(
model: Union[str, "Model", "File"],
inp: List[numpy.ndarray],
batch_size: int = 1,
num_cores: int = None,
num_streams: int = None,
batch_size: Optional[int] = 1,
num_cores: Optional[int] = None,
num_streams: Optional[int] = None,
num_iterations: int = 20,
num_warmup_iterations: int = 5,
include_inputs: bool = False,
include_outputs: bool = False,
show_progress: bool = False,
scheduler: Scheduler = None,
input_shapes: List[List[int]] = None,
scheduler: Optional[Scheduler] = None,
input_shapes: Optional[List[List[int]]] = None,
) -> BenchmarkResults:
"""
Convenience function to benchmark a model in the DeepSparse Engine
@@ -1029,16 +1045,15 @@ def benchmark_model(
def model_debug_analysis(
model: Union[str, "Model", "File"],
inp: List[numpy.ndarray],
batch_size: int = 1,
num_cores: int = None,
batch_size: Optional[int] = 1,
num_cores: Optional[int] = None,
num_iterations: int = 20,
num_warmup_iterations: int = 5,
optimization_level: int = 1,
disable_batch_override: bool = False,
imposed_as: Optional[float] = None,
imposed_ks: Optional[float] = None,
scheduler: Scheduler = None,
input_shapes: List[List[int]] = None,
scheduler: Optional[Scheduler] = None,
input_shapes: Optional[List[List[int]]] = None,
kv_cache_params: Optional[KVCacheParams] = None,
) -> dict:
"""
@@ -1054,7 +1069,8 @@ def model_debug_analysis(
object that defines the neural network graph definition to analyze
:param inp: The list of inputs to pass to the engine for analyzing inference.
The expected order is the inputs order as defined in the ONNX graph.
:param batch_size: The batch size of the inputs to be used with the model
:param batch_size: The batch size of the inputs to be used with the model;
pass None or a value < 1 to disable the batch size override
:param num_cores: The number of physical cores to run the model on.
Pass None or 0 to run on the max number of cores
for the current machine; default None
@@ -1064,7 +1080,6 @@ def model_debug_analysis(
before analyzing, default is 5
:param optimization_level: The amount of graph optimizations to perform.
The current choices are either 0 (minimal) or 1 (all), default is 1
:param disable_batch_override: Indicates whether disable_batch_override was used or not
:param imposed_as: Imposed activation sparsity, defaults to None.
Will force the activation sparsity from all ReLu layers in the graph
to match this desired sparsity level (percentage of 0's in the tensor).
@@ -1087,7 +1102,6 @@ def model_debug_analysis(
num_iterations=num_iterations,
num_warmup_iterations=num_warmup_iterations,
optimization_level=optimization_level,
disable_batch_override=disable_batch_override,
imposed_as=imposed_as,
imposed_ks=imposed_ks,
kv_cache_params=kv_cache_params,
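
Taken together, batch_size=None becomes the single switch for disabling the batch override across Engine, MultiModelEngine, DebugAnalysisEngine, and the module-level helpers, and _validate_inputs then skips the leading-dimension check. A hedged end-to-end sketch, assuming Engine is importable from the package root and model.onnx exists locally:

import numpy
from deepsparse import Engine

# batch_size=None disables the override; per the comments above, the engine
# itself is still compiled with a placeholder batch size of 1 internally
# (arbitrary and ignored).
engine = Engine("model.onnx", batch_size=None)

# Any leading dimension is accepted, since _validate_inputs only checks
# arr.shape[0] when self._batch_size is truthy.
inp = [numpy.random.rand(7, 3, 224, 224).astype(numpy.float32)]
out = engine(inp)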