From 00a1fbe9332f93ea68281726bd3f391ff84bb01f Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Thu, 24 Aug 2023 15:34:18 +0200
Subject: [PATCH 1/8] fix ort training test

---
 docker/ort_training.dockerfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docker/ort_training.dockerfile b/docker/ort_training.dockerfile
index 1efd2c75e..6e07c3a7c 100644
--- a/docker/ort_training.dockerfile
+++ b/docker/ort_training.dockerfile
@@ -65,5 +65,5 @@ RUN $PYTHON_EXE -m pip install --upgrade protobuf==3.20.2
 RUN $PYTHON_EXE -m torch_ort.configure

 # Install optimum-benchmark dependencies
-COPY gpu_requirements.txt /tmp/gpu_requirements.txt
-RUN pip install -r /tmp/gpu_requirements.txt
\ No newline at end of file
+COPY requirements.txt /tmp/requirements.txt
+RUN pip install -r /tmp/requirements.txt
\ No newline at end of file

From ee29f008bddbc542fa8a2491c5ddc85858bb7ed7 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Thu, 24 Aug 2023 15:34:50 +0200
Subject: [PATCH 2/8] renaming

---
 optimum_benchmark/{main.py => experiment.py} | 33 +++++++++-----------
 setup.py                                     |  2 +-
 2 files changed, 16 insertions(+), 19 deletions(-)
 rename optimum_benchmark/{main.py => experiment.py} (78%)

diff --git a/optimum_benchmark/main.py b/optimum_benchmark/experiment.py
similarity index 78%
rename from optimum_benchmark/main.py
rename to optimum_benchmark/experiment.py
index f961160b1..a33f1026b 100644
--- a/optimum_benchmark/main.py
+++ b/optimum_benchmark/experiment.py
@@ -1,6 +1,6 @@
 import os
 import platform
-from typing import Type, Dict
+from typing import Any, Type, Dict
 from logging import getLogger
 from dataclasses import dataclass, MISSING, field

@@ -20,19 +20,18 @@
     is_openvino_available,
     is_neural_compressor_available,
 )
-from optimum_benchmark.backends.base import Backend
-from optimum_benchmark.benchmarks.base import Benchmark
-from optimum_benchmark.backends.base import Backend, BackendConfig
-from optimum_benchmark.benchmarks.training import TrainingConfig
-from optimum_benchmark.benchmarks.inference import InferenceConfig
-from optimum_benchmark.benchmarks.base import Benchmark, BenchmarkConfig
-from .utils import remap_to_correct_metadata, get_cpu, get_cpu_ram_mb
+from .backends.base import Backend
+from .benchmarks.base import Benchmark
+from .utils import get_cpu, get_cpu_ram_mb
+from .benchmarks.training import TrainingConfig
+from .benchmarks.inference import InferenceConfig

-LOGGER = getLogger("main")
+
+LOGGER = getLogger("experiment")

 OmegaConf.register_new_resolver(
     "infer_task",
-    # TODO: find a better way for this; it doesn't
+    # TODO: find a better way for this; it doesn't
     # always work because it relies on hub metadata
     lambda model, revision: TasksManager.infer_task_from_model(
         model=model,
@@ -44,10 +43,10 @@
 @dataclass
 class ExperimentConfig:
     # BACKEND CONFIGURATION
-    backend: BackendConfig = MISSING
+    backend: Any  # https://github.com/facebookresearch/hydra/issues/1722#issuecomment-883568386

     # BENCHMARK CONFIGURATION
-    benchmark: BenchmarkConfig = MISSING
+    benchmark: Any  # https://github.com/facebookresearch/hydra/issues/1722#issuecomment-883568386

     # EXPERIMENT CONFIGURATION
     experiment_name: str = MISSING
@@ -114,13 +113,11 @@ class ExperimentConfig:

 @hydra.main(version_base=None)
 def run_experiment(experiment: DictConfig) -> None:
-    # By default, Hydra populates the metadata object_type with the ones from ExperimentConfig but the object_type should really be
-    # one of the subclass (e.g. PyTorchBackendConfig instead of BackendConfig). This is required to call `to_object`.
-    experiment = remap_to_correct_metadata(experiment)
+    from omegaconf import SCMode

-    # This is required to trigger __post_init__. Reference: https://github.com/omry/omegaconf/issues/377
-    experiment = OmegaConf.to_object(experiment)
-    experiment = OmegaConf.create(experiment)
+    experiment = OmegaConf.to_container(
+        experiment, structured_config_mode=SCMode.INSTANTIATE
+    )

     # Save the config
     OmegaConf.save(experiment, "hydra_config.yaml", resolve=True)

diff --git a/setup.py b/setup.py
index d473a51a5..ac2095688 100644
--- a/setup.py
+++ b/setup.py
@@ -11,7 +11,7 @@
     },
     entry_points={
         "console_scripts": [
-            "optimum-benchmark=optimum_benchmark.main:run_experiment",
+            "optimum-benchmark=optimum_benchmark.experiment:run_experiment",
             "optimum-report=optimum_benchmark.report:generate_report",
         ]
     },

From 248d205e7417bf45cd7f588ae712dbcbe702e6e3 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Thu, 24 Aug 2023 15:38:59 +0200
Subject: [PATCH 3/8] major refactoring

---
 optimum_benchmark/backends/base.py            | 208 ++-----
 .../backends/neural_compressor.py             | 251 ++++----
 optimum_benchmark/backends/onnxruntime.py     | 464 ++++++++-------
 optimum_benchmark/backends/openvino.py        | 163 +++---
 optimum_benchmark/backends/pytorch.py         | 548 +++++++++---------
 optimum_benchmark/backends/utils/__init__.py  |   0
 .../backends/utils/base_utils.py              |  92 +++
 .../backends/utils/neural_compressor_utils.py |  39 ++
 .../backends/utils/onnxruntime_utils.py       |  94 +++
 .../backends/utils/openvino_utils.py          |  14 +
 .../{utils.py => utils/optimum_utils.py}      | 288 +--------
 .../backends/utils/pytorch_utils.py           |  78 +++
 optimum_benchmark/benchmarks/base.py          |  19 +-
 optimum_benchmark/benchmarks/inference.py     | 185 +++---
 .../benchmarks/inference_utils.py             |  37 ++
 optimum_benchmark/benchmarks/training.py      | 230 ++------
 .../benchmarks/training_utils.py              | 103 ++++
 .../generators/dataset_generator.py           |  11 +-
 .../generators/input_generator.py             |   8 +-
 optimum_benchmark/import_utils.py             |  10 +-
 optimum_benchmark/preprocessors/glue.py       |   1 -
 optimum_benchmark/report.py                   |  10 +-
 optimum_benchmark/trackers/latency.py         |   5 +-
 optimum_benchmark/trackers/memory.py          |  10 +-
 optimum_benchmark/utils.py                    |  76 +--
 tests/configs/base_config.yaml                |   9 -
 ...ibuted_cuda_pytorch_training_bert_ddp.yaml |   9 +-
 tests/test_cli.py                             |   2 -
 28 files changed, 1497 insertions(+), 1467 deletions(-)
 create mode 100644 optimum_benchmark/backends/utils/__init__.py
 create mode 100644 optimum_benchmark/backends/utils/base_utils.py
 create mode 100644 optimum_benchmark/backends/utils/neural_compressor_utils.py
 create mode 100644 optimum_benchmark/backends/utils/onnxruntime_utils.py
 create mode 100644 optimum_benchmark/backends/utils/openvino_utils.py
 rename optimum_benchmark/backends/{utils.py => utils/optimum_utils.py} (57%)
 create mode 100644 optimum_benchmark/backends/utils/pytorch_utils.py
 create mode 100644 optimum_benchmark/benchmarks/inference_utils.py
 create mode 100644 optimum_benchmark/benchmarks/training_utils.py

diff --git a/optimum_benchmark/backends/base.py b/optimum_benchmark/backends/base.py
index 92c004acc..747e9ac37 100644
--- a/optimum_benchmark/backends/base.py
+++ b/optimum_benchmark/backends/base.py
@@ -1,33 +1,38 @@
-from typing import Any, Callable, Dict, List, Optional, Union
-from dataclasses import dataclass, MISSING
+from typing import Any, ClassVar, Dict, List, Optional, Union, TYPE_CHECKING
 from multiprocessing import Process
 from abc import abstractmethod, ABC
+from dataclasses import dataclass
 from logging import getLogger
-import shutil import os import gc -import torch -from torch import Tensor -from datasets import Dataset +import shutil from psutil import cpu_count -from omegaconf import DictConfig +from diffusers import DiffusionPipeline from optimum.exporters import TasksManager from transformers import ( AutoConfig, AutoProcessor, + ProcessorMixin, PreTrainedModel, - PreTrainedTokenizer, PretrainedConfig, + PreTrainedTokenizer, ImageProcessingMixin, FeatureExtractionMixin, - ProcessorMixin, - Pipeline, ) -from optimum_benchmark.utils import ( +if TYPE_CHECKING: + from transformers.utils import ModelOutput + from transformers import TrainerState + + +from .utils.base_utils import ( + extract_shapes_from_diffusion_pipeline, + extract_shapes_from_model_artifacts, +) +from ..utils import ( DIFFUSION_TASKS, TEXT_GENERATION_TASKS, check_no_process_is_running_on_cuda_device, @@ -35,6 +40,8 @@ ) +LOGGER = getLogger("backend") + PreTrainedProcessor = Union[ PreTrainedTokenizer, ImageProcessingMixin, @@ -42,14 +49,12 @@ ProcessorMixin, ] -LOGGER = getLogger("backend") - @dataclass class BackendConfig(ABC): - name: str = MISSING - version: str = MISSING - _target_: str = MISSING + name: str + version: str + _target_: str # backend options inter_op_num_threads: Optional[int] = None @@ -62,18 +67,28 @@ class BackendConfig(ABC): # clean up options delete_cache: bool = False + def __post_init__(self): + if self.inter_op_num_threads is not None: + if self.inter_op_num_threads == -1: + self.inter_op_num_threads = cpu_count() + + if self.intra_op_num_threads is not None: + if self.intra_op_num_threads == -1: + self.intra_op_num_threads = cpu_count() + class Backend(ABC): - # model and pipeline benchmarks - pretrained_model: Union[PreTrainedModel, Pipeline] - # only for model benchmarks - pretrained_config: Optional[PretrainedConfig] + name: str + config: ClassVar[BackendConfig] + + pretrained_model: Union[PreTrainedModel, DiffusionPipeline] pretrained_processor: Optional[PreTrainedProcessor] + pretrained_config: Optional[PretrainedConfig] - def __init__(self, model: str, task: str, device: str, hub_kwargs: DictConfig): + def __init__(self, model: str, task: str, device: str, hub_kwargs: Dict[str, Any]): self.model = model self.task = task - self.device = torch.device(device) + self.device = device self.hub_kwargs = hub_kwargs if self.is_diffusion_pipeline(): @@ -97,7 +112,7 @@ def __init__(self, model: str, task: str, device: str, hub_kwargs: DictConfig): **self.hub_kwargs, ) except ValueError: - LOGGER.warning(f"Could not find the model's preprocessor") + LOGGER.warning("Could not find the model's preprocessor") self.pretrained_processor = None # we're using this one as the default model_class which is used @@ -119,7 +134,9 @@ def check_initial_isolation(self) -> None: cuda_devices = os.environ.get("CUDA_VISIBLE_DEVICES") if cuda_devices is None: LOGGER.warning( - "Asked to check the initial device isolation, but the variable CUDA_VISIBLE_DEVICES was not set. Defaulting to checking on the first device." + "Asked to check the initial device isolation, " + "but the variable CUDA_VISIBLE_DEVICES was not set. " + "Defaulting to checking on the first device." ) device_ids = {self.device.index if self.device.index is not None else 0} else: @@ -133,7 +150,9 @@ def check_continuous_isolation(self) -> None: cuda_devices = os.environ.get("CUDA_VISIBLE_DEVICES") if cuda_devices is None: LOGGER.warning( - "Asked to check the continuous device isolation, but the variable CUDA_VISIBLE_DEVICES was not set. 
Defaulting to checking on the first device." + "Asked to check the continuous device isolation, " + "but the variable CUDA_VISIBLE_DEVICES was not set. " + "Defaulting to checking on the first device." ) device_ids = {self.device.index if self.device.index is not None else 0} else: @@ -150,75 +169,48 @@ def check_continuous_isolation(self) -> None: @abstractmethod def configure(self, config: BackendConfig) -> None: - self.config = config - LOGGER.info(f"Configuring {config.name} backend") - self.config = config - if config.inter_op_num_threads is not None: - if config.inter_op_num_threads == -1: - config.inter_op_num_threads = cpu_count() - LOGGER.info( - f"\t+ Setting backend.inter_op_num_threads to cpu_count({config.inter_op_num_threads})" - ) - - if config.intra_op_num_threads is not None: - if config.intra_op_num_threads == -1: - config.intra_op_num_threads = cpu_count() - LOGGER.info( - f"\t+ Setting backend.intra_op_num_threads to cpu_count({config.intra_op_num_threads})" - ) - - # clean up options - if config.delete_cache: - LOGGER.info("\t+ Will delete model cache after benchmarking") - self.delete_cache = config.delete_cache # isolation options - if config.initial_isolation_check: + if self.config.initial_isolation_check: LOGGER.info("\t+ Checking initial device isolation") self.check_initial_isolation() - if config.continous_isolation_check: + if self.config.continous_isolation_check: LOGGER.info("\t+ Checking contineous device isolation") self.check_continuous_isolation() + # clean up options + if self.config.delete_cache: + LOGGER.info("\t+ Model cache will be deleted after benchmark") + # compiling in openvino requires input shapes - def prepare_for_inference(self, input_shapes: Dict[str, int]) -> None: + def prepare_for_inference(self, input_shapes: Dict[str, int]) -> Dict[str, Any]: pass # symbolic tracing in transformers requires input names - def prepare_for_profiling(self, input_names: List[str]) -> None: - pass - - # depending on the backend, we might need to prepare the model for training - # in different ways although I prefer to pass these in the train method - def prepare_for_training( - self, - training_dataset: Dataset, - training_data_collator: Callable, - training_arguments: Dict[str, Any], - ) -> None: + def prepare_for_profiling(self, input_names: List[str]) -> Dict[str, Any]: pass - def forward(self, input: Dict[str, Tensor], **kwargs): + def forward(self, input: Dict[str, Any], **kwargs) -> "ModelOutput": raise NotImplementedError("Backend must implement forward method") - def generate(self, input: Dict[str, Tensor], **kwargs): + def generate(self, input: Dict[str, Any], **kwargs) -> "ModelOutput": raise NotImplementedError("Backend must implement generate method") - def train(self): + def train(self) -> "TrainerState": raise NotImplementedError("Backend must implement train method") def delete_pretrained_model(self) -> None: - if hasattr(self, "pretrained_model"): + try: del self.pretrained_model - gc.collect() + except AttributeError: + # benchmark might fail before the model is loaded + pass - if self.device.type == "cuda": - torch.cuda.empty_cache() - gc.collect() + gc.collect() - def delete_model_hub_cache(self) -> None: + def delete_model_cache(self) -> None: model_cache_path = "models--" + self.model.replace("/", "--") model_cache_path = os.path.join( os.path.expanduser("~/.cache/huggingface/hub"), model_cache_path @@ -226,11 +218,11 @@ def delete_model_hub_cache(self) -> None: shutil.rmtree(model_cache_path, ignore_errors=True) def clean(self) -> 
None: - LOGGER.info(f"Cleaning backend") + LOGGER.info(f"Cleaning {self.config.name} backend") self.delete_pretrained_model() - if self.delete_cache: - self.delete_model_hub_cache() + if self.config.delete_cache: + self.delete_model_cache() @property def model_shapes(self) -> Dict[str, int]: @@ -245,75 +237,3 @@ def model_shapes(self) -> Dict[str, int]: ) return model_shapes - - -def extract_shapes_from_diffusion_pipeline( - pipeline: Pipeline, -) -> Dict[str, Any]: - # this is the only way I found to extract a - # diffusion pipeline's "output" shapes - shapes = {} - try: - shapes["num_channels"] = pipeline.vae_encoder.config.out_channels - shapes["height"] = pipeline.vae_encoder.config.sample_size - shapes["width"] = pipeline.vae_encoder.config.sample_size - except AttributeError: - LOGGER.warning("Could not find the diffusion pipeline's output shapes") - shapes["num_channels"] = -1 - shapes["height"] = -1 - shapes["width"] = -1 - - return shapes - - -def extract_shapes_from_model_artifacts( - config: PretrainedConfig, - processor: Optional[PreTrainedProcessor] = None, -) -> Dict[str, Any]: - shapes = {} - artifacts_dict = {} - - config_dict = {k: v for k, v in config.to_dict().items() if v is not None} - artifacts_dict.update(config_dict) - - if processor is not None and hasattr(processor, "to_dict"): - processor_dict = {k: v for k, v in processor.to_dict().items() if v is not None} - artifacts_dict.update(processor_dict) - - # text input - shapes["vocab_size"] = artifacts_dict.get("vocab_size", 2) - shapes["type_vocab_size"] = artifacts_dict.get("type_vocab_size", 2) - - # image input - shapes["num_channels"] = artifacts_dict.get("num_channels", None) - - image_size = artifacts_dict.get("image_size", None) - if image_size is None: - # processors have different names for the image size - image_size = artifacts_dict.get("size", None) - - if isinstance(image_size, (int, float)): - shapes["height"] = image_size - shapes["width"] = image_size - elif isinstance(image_size, (list, tuple)): - shapes["height"] = image_size[0] - shapes["width"] = image_size[0] - elif type(image_size) == dict and len(image_size) == 2: - shapes["height"] = list(image_size.values())[0] - shapes["width"] = list(image_size.values())[1] - elif type(image_size) == dict and len(image_size) == 1: - shapes["height"] = list(image_size.values())[0] - shapes["width"] = list(image_size.values())[0] - else: - shapes["height"] = None - shapes["width"] = None - - # classification labels (default to 2) - shapes["num_labels"] = len( - artifacts_dict.get("id2label", {"0": "LABEL_0", "1": "LABEL_1"}) - ) - - # object detection labels (default to 2) - shapes["num_queries"] = artifacts_dict.get("num_queries", 2) - - return shapes diff --git a/optimum_benchmark/backends/neural_compressor.py b/optimum_benchmark/backends/neural_compressor.py index a22aa5d1e..a1ac95f73 100644 --- a/optimum_benchmark/backends/neural_compressor.py +++ b/optimum_benchmark/backends/neural_compressor.py @@ -1,27 +1,35 @@ -from typing import Dict -from torch import Tensor +from typing import Dict, Optional, Any, TYPE_CHECKING +from tempfile import TemporaryDirectory +from dataclasses import dataclass from logging import getLogger + +import torch +from torch import Tensor from hydra.utils import get_class -from dataclasses import dataclass, field -from tempfile import TemporaryDirectory from omegaconf import DictConfig, OmegaConf +from optimum.intel.neural_compressor.quantization import INCQuantizer +from optimum.intel.neural_compressor.utils import 
_HEAD_TO_AUTOMODELS +from neural_compressor import __version__ as neural_compressor_version +from neural_compressor.config import ( + AccuracyCriterion, + TuningCriterion, + PostTrainingQuantConfig, +) -try: - from neural_compressor import __version__ as neural_compressor_version -except ImportError: - neural_compressor_version = "Not installed" - -from optimum_benchmark.backends.base import Backend, BackendConfig - +if TYPE_CHECKING: + from transformers.utils import ModelOutput -OmegaConf.register_new_resolver( - "ptq_is_static", - lambda approach: approach == "static", +from .base import Backend, BackendConfig +from .utils.neural_compressor_utils import ( + DEFAULT_QUANTIZATION_CONFIG, + DEFAULT_CALIBRATION_CONFIG, ) LOGGER = getLogger("neural_compressor") +OmegaConf.register_new_resolver("ptq_is_static", lambda approach: approach == "static") + @dataclass class INCConfig(BackendConfig): @@ -34,67 +42,52 @@ class INCConfig(BackendConfig): # quantization options quantization: bool = False - quantization_config: Dict = field(default_factory=lambda: { - "device": "cpu", - "backend": "default", - "domain": "auto", - "recipes": {}, - "quant_format": "default", - "inputs": [], - "outputs": [], - "approach": "static", - "calibration_sampling_size": [100], - "op_type_dict": None, - "op_name_dict": None, - "reduce_range": None, - "example_inputs": None, - "excluded_precisions": [], - "quant_level": "auto", - "accuracy_criterion": DictConfig( - { - "higher_is_better": True, - "criterion": "relative", - "tolerable_loss": 0.01, - } - ), - "tuning_criterion": DictConfig( - { - "strategy": "basic", - "strategy_kwargs": None, - "timeout": 0, - "max_trials": 100, - "objective": "performance", - } - ), - "diagnosis": False, - } - ) + quantization_config: Optional[Dict[str, Any]] = None # calibration options - calibration: bool = "${ptq_is_static:${backend.quantization_config.approach}}" # type: ignore - calibration_config: Dict = field(default_factory=lambda: { - "dataset_name": "glue", - "num_samples": 300, - "dataset_config_name": "sst2", - "dataset_split": "train", - "preprocess_batch": True, - "preprocess_class": "optimum_benchmark.preprocessors.glue.GluePreprocessor", - } - ) + calibration: bool = False + calibration_config: Optional[Dict[str, Any]] = None + + def __post_init__(self): + if self.no_weights: + # TODO: implement no_weights for neural_compressor backend if possible + raise NotImplementedError( + "no_weights is not supported for neural_compressor backend" + ) + + if self.quantization: + self.quantization_config = OmegaConf.merge( + self.quantization_config if self.quantization_config else {}, + DEFAULT_QUANTIZATION_CONFIG, + ) + if self.calibration_config["approach"] == "static": + self.calibration = True + + if self.calibration: + self.calibration_config = OmegaConf.merge( + self.calibration_config if self.calibration_config else {}, + DEFAULT_CALIBRATION_CONFIG, + ) class INCBackend(Backend): + name: str = "neural_compressor" + config: INCConfig + def __init__( self, model: str, task: str, device: str, hub_kwargs: DictConfig ) -> None: super().__init__(model, task, device, hub_kwargs) + self.device = torch.device(device) - from optimum.intel.neural_compressor.utils import _HEAD_TO_AUTOMODELS + assert self.task in _HEAD_TO_AUTOMODELS, ( + f"INCBackend does not support task {self.task} yet. 
" + f"Supported tasks are: {list(_HEAD_TO_AUTOMODELS.keys())}" + ) self.incmodel_class = get_class( f"optimum.intel.neural_compressor.{_HEAD_TO_AUTOMODELS[self.task]}" ) - LOGGER.info( f"\t+ Infered INCModel class {self.incmodel_class.__name__} " f"for task {self.task} and model_type {self.model_type}" @@ -103,84 +96,100 @@ def __init__( def configure(self, config: INCConfig) -> None: super().configure(config) - with TemporaryDirectory() as tmpdirname: - if config.no_weights: - raise NotImplementedError( - "no_weights is not supported for neural_compressor backend" - ) - else: - self.load_model_from_pretrained(config) - - if config.quantization: - self.quantize_model(config, tmpdirname) - - def load_model_from_pretrained(self, config: INCConfig) -> None: - self.pretrained_model = self.incmodel_class.from_pretrained( - # something is wrong here, modeling is not consistent - model_name_or_path=self.model, - # for some reason only causalLM expects model_id instead of model_name_or_path - **({"model_id": self.model} if self.task == "text-generation" else {}), - device_map=self.device, - **self.hub_kwargs, - ) - - def quantize_model(self, config: INCConfig, tmpdirname: str) -> None: - from optimum.intel.neural_compressor.quantization import INCQuantizer - from neural_compressor.config import ( - AccuracyCriterion, - TuningCriterion, - PostTrainingQuantConfig, - ) + if self.config.quantization: + self.config.quantization_config["accuracy_criterion"] = AccuracyCriterion( + **self.config.quantization_config["accuracy_criterion"] + ) + self.config.quantization_config["tuning_criterion"] = TuningCriterion( + **self.config.quantization_config["tuning_criterion"] + ) + self.quantization_config = PostTrainingQuantConfig( + **self.config.quantization_config + ) - LOGGER.info("\t+ Attempting quantization") + if self.config.calibration: + self.config.calibration_config["preprocess_class"] = get_class( + self.config.calibration_config["preprocess_class"] + ) + self.config.calibration_config[ + "preprocess_function" + ] = self.config.calibration_config["preprocess_class"]( + model_name_or_path=self.model + ) + self.config.calibration_config.pop("preprocess_class") - quantization_config = OmegaConf.to_container(config.quantization_config) - quantization_config["accuracy_criterion"] = AccuracyCriterion( - **config.quantization_config.accuracy_criterion - ) - quantization_config["tuning_criterion"] = TuningCriterion( - **config.quantization_config.tuning_criterion - ) - quantization_config = PostTrainingQuantConfig(**quantization_config) + with TemporaryDirectory() as tmpdirname: + if self.config.quantization: + self.load_and_quantize_automodel(tmpdirname) + else: + self.load_incmodel() + def load_and_quantize_automodel(self, tmpdirname: str) -> None: + LOGGER.info("\t+ Loading pretrained AutoModel") model = self.automodel_class.from_pretrained(self.model, **self.hub_kwargs) - quantizer = INCQuantizer.from_pretrained(model, task=self.task) - - if config.calibration: - preprocess_class = get_class(config.calibration_config.preprocess_class) - preprocess_function = preprocess_class(model_name_or_path=self.model) + LOGGER.info("\t+ Creating quantizer") + quantizer = INCQuantizer.from_pretrained( + model, + eval_fn=None, + calibration_fn=None, + task=self.task, + ) + if self.config.calibration: + LOGGER.info("\t+ Loading calibration dataset") calibration_dataset = quantizer.get_calibration_dataset( - dataset_name=config.calibration_config.dataset_name, - num_samples=config.calibration_config.num_samples, - 
dataset_config_name=config.calibration_config.dataset_config_name, - dataset_split=config.calibration_config.dataset_split, - preprocess_function=preprocess_function, + **self.config.calibration_config ) + else: + calibration_dataset = None + LOGGER.info("\t+ Attempting quantization") quantizer.quantize( - save_onnx_model=False, - quantization_config=quantization_config, - calibration_dataset=calibration_dataset, + quantization_config=self.config.quantization_config, save_directory=f"{tmpdirname}/quantized", + calibration_dataset=calibration_dataset, + # default values + batch_size=8, + data_collator=None, + remove_unused_columns=True, + file_name=None, ) - self.delete_pretrained_model() - - LOGGER.info("\t+ Loading quantized model") + LOGGER.info("\t+ Loading quantized INCModel") self.pretrained_model = self.incmodel_class.from_pretrained( model_name_or_path=f"{tmpdirname}/quantized", ) - def forward(self, input: Dict[str, Tensor], **kwargs) -> Tensor: - output = self.pretrained_model(**input, **kwargs)[0] + def load_incmodel(self) -> None: + if self.is_diffusion_pipeline(): + self.pretrained_model = self.incmodel_class.from_pretrained( + model_name_or_path=self.model, + **self.hub_kwargs, + ) + self.pretrained_model.to(self.device) + elif self.is_text_generation_model(): + self.pretrained_model = self.incmodel_class.from_pretrained( + # for some reason only causalLM expects + # model_id instead of model_name_or_path + model_id=self.model, + device_map=self.device, + **self.hub_kwargs, + ) + else: + self.pretrained_model = self.incmodel_class.from_pretrained( + # for some reason only causalLM expects + # model_id instead of model_name_or_path + model_name_or_path=self.model, + device_map=self.device, + **self.hub_kwargs, + ) + + def forward(self, input: Dict[str, Tensor], **kwargs) -> "ModelOutput": + output = self.pretrained_model(**input, **kwargs) return output - def generate(self, input: Dict[str, Tensor], **kwargs) -> Tensor: - output = self.pretrained_model.generate(**input, **kwargs)[0] + def generate(self, input: Dict[str, Tensor], **kwargs) -> "ModelOutput": + output = self.pretrained_model.generate(**input, **kwargs) return output - - def train(self) -> None: - pass diff --git a/optimum_benchmark/backends/onnxruntime.py b/optimum_benchmark/backends/onnxruntime.py index 886fa6e6b..57e811706 100644 --- a/optimum_benchmark/backends/onnxruntime.py +++ b/optimum_benchmark/backends/onnxruntime.py @@ -1,22 +1,19 @@ +from typing import Any, Callable, Dict, List, Optional, TYPE_CHECKING +from tempfile import TemporaryDirectory +from dataclasses import dataclass +from logging import getLogger +from datasets import Dataset import os + + import torch from torch import Tensor -from datasets import Dataset -from logging import getLogger from omegaconf import OmegaConf -from dataclasses import dataclass, field from hydra.utils import get_class -from tempfile import TemporaryDirectory -from omegaconf.dictconfig import DictConfig -from typing import Any, Callable, Dict, List, Optional - - -try: - from onnxruntime import __version__ as onnxruntime_version -except ImportError: - onnxruntime_version = "Not installed" - -from optimum.onnxruntime import ORTOptimizer, ORTQuantizer +from onnxruntime import SessionOptions +from accelerate import init_empty_weights +from optimum.pipelines import ORT_SUPPORTED_TASKS +from onnxruntime import __version__ as onnxruntime_version from optimum.onnxruntime.configuration import ( OptimizationConfig, QuantizationConfig, @@ -24,16 +21,34 @@ AutoOptimizationConfig, 
AutoQuantizationConfig, ) +from optimum.onnxruntime import ( + ORTOptimizer, + ORTQuantizer, + ORTTrainer, + ORTTrainingArguments, +) +if TYPE_CHECKING: + from transformers import TrainerCallback, TrainerState + from transformers.modeling_outputs import ModelOutput + + +from .base import Backend, BackendConfig +from .utils.optimum_utils import main_export +from .utils.pytorch_utils import randomize_weights +from ..profilers.ort_profiler import ORTProfilingWrapper +from .utils.onnxruntime_utils import ( + format_ort_quantization_dict, + infer_device_id, + DEFAULT_OPTIMIZATION_CONFIG, + DEFAULT_QUANTIZATION_CONFIG, + DEFAULT_CALIBRATION_CONFIG, +) -from optimum_benchmark.backends.base import Backend, BackendConfig -from optimum_benchmark.backends.utils import main_export, randomize_weights -from optimum_benchmark.profilers.ort_profiler import ORTProfilingWrapper -from optimum_benchmark.utils import infer_device_id OmegaConf.register_new_resolver( "is_gpu", - lambda device: torch.device(device).type == "cuda", + lambda device: "cuda" in device.lower() or "tensorrt" in device.lower(), ) OmegaConf.register_new_resolver( "is_profiling", @@ -47,10 +62,6 @@ "infer_device_id", lambda device: infer_device_id(device), ) -OmegaConf.register_new_resolver( - "requires_calibration", - lambda *static_quants: any(static_quants), -) LOGGER = getLogger("onnxruntime") @@ -70,103 +81,109 @@ class ORTConfig(BackendConfig): # provider options provider: str = "${infer_provider:${device}}" + provider_options: Optional[Dict] = None + # TODO: deprecate device_id in favor of provider_options device_id: Optional[int] = "${infer_device_id:${device}}" # inference options use_io_binding: bool = "${is_gpu:${device}}" + session_options: Optional[Dict] = None + # TODO: deprecate enable_profiling in favor of session_options enable_profiling: bool = "${is_profiling:${benchmark.name}}" # optimization options optimization: bool = False - optimization_config: Dict = field( - default_factory=lambda: { - "optimization_level": 1, # 0, 1, 2, 99 - "optimize_for_gpu": "${is_gpu:${device}}", - "fp16": False, - "enable_transformers_specific_optimizations": True, - "enable_gelu_approximation": False, - "disable_gelu_fusion": False, - "disable_layer_norm_fusion": False, - "disable_attention_fusion": False, - "disable_skip_layer_norm_fusion": True, - "disable_bias_skip_layer_norm_fusion": False, - "disable_bias_gelu_fusion": False, - "use_mask_index": False, - "no_attention_mask": False, - "disable_embed_layer_norm_fusion": True, - "disable_shape_inference": False, - "use_multi_head_attention": False, - "enable_gemm_fast_gelu_fusion": False, - "use_raw_attention_mask": False, - "disable_group_norm_fusion": True, - "disable_packed_kv": True, - } - ) + optimization_config: Optional[Dict] = None # O1, O2, O3, O4 auto_optimization: Optional[str] = None - auto_optimization_config: Dict = field( - default_factory=lambda: { - "for_gpu": "${is_gpu:${device}}", - # add auto optimization specific options in config file or cli - # using +backend.auto_optimization_config.option_name: value - } - ) + auto_optimization_config: Optional[Dict] = None # quantization options quantization: bool = False - quantization_config: Dict = field( - default_factory=lambda: { - "is_static": False, - "format": "QOperator", # QOperator, QDQ - "mode": "IntegerOps", # QLinearOps, IntegerOps - "activations_dtype": "QUInt8", # QInt8, QUInt8 - "activations_symmetric": False, - "weights_dtype": "QInt8", # QInt8, QUInt8 - "weights_symmetric": True, - "per_channel": False, - 
"reduce_range": False, - "operators_to_quantize": [ - "MatMul", - "Add", - ], - } - ) + quantization_config: Optional[Dict] = None # arm64,avx2,avx512,avx512_vnni,tensorrt auto_quantization: Optional[str] = None - auto_quantization_config: Dict = field( - default_factory=lambda: { - "is_static": False - # add auto quantization specific options in config file or cli - # using +backend.auto_quantization_config.option_name: value - } - ) + auto_quantization_config: Optional[Dict] = None # calibration options - calibration: bool = "${requires_calibration:${backend.auto_quantization_config.is_static}, ${backend.quantization_config.is_static}}" - calibration_config: Dict = field( - default_factory=lambda: { - "dataset_name": "glue", - "num_samples": 300, - "dataset_config_name": "sst2", - "dataset_split": "train", - "preprocess_batch": True, - "preprocess_class": "optimum_benchmark.preprocessors.glue.GluePreprocessor", - } - ) - - # this will skip exporting the model and will use automodel instead + calibration: bool = False + calibration_config: Optional[Dict] = None + + # this will skip exporting the model and will use automodel with trainer use_ortmodel: bool = "${is_inference:${benchmark.name}}" + def __post_init__(self): + if self.optimization: + self.optimization_config = OmegaConf.merge( + self.optimization_config or {}, + DEFAULT_OPTIMIZATION_CONFIG, + ) + + if self.auto_optimization is not None: + self.auto_optimization_config = OmegaConf.merge( + self.auto_optimization_config or {}, + DEFAULT_OPTIMIZATION_CONFIG, + ) + self.auto_optimization_config.pop("optimization_level", None) + self.auto_optimization_config[ + "for_gpu" + ] = self.auto_optimization_config.pop("optimize_for_gpu") + + if self.quantization: + self.quantization_config = OmegaConf.merge( + self.quantization_config or {}, + DEFAULT_QUANTIZATION_CONFIG, + ) + + # auto quantization is needs specific config for each type + # if self.auto_quantization is not None: + # self.auto_quantization_config = OmegaConf.merge( + # self.auto_quantization_config or {}, + # DEFAULT_QUANTIZATION_CONFIG, + # ) + + if self.quantization_config is not None: + self.calibration = self.quantization_config["is_static"] + + if self.auto_quantization_config is not None: + self.calibration = self.auto_quantization_config["is_static"] + + if self.calibration: + self.calibration_config = OmegaConf.merge( + self.calibration_config or {}, + DEFAULT_CALIBRATION_CONFIG, + ) + + if self.device_id is not None: + LOGGER.warning( + "device_id is deprecated, please use provider_options instead" + ) + self.provider_options = OmegaConf.merge( + self.provider_options or {}, + {"device_id": self.device_id}, + ) + + if self.enable_profiling is not None: + LOGGER.warning( + "enable_profiling is deprecated, please use session_options instead" + ) + self.session_options = OmegaConf.merge( + self.session_options or {}, + {"enable_profiling": self.enable_profiling}, + ) + class ORTBackend(Backend): + name: str = "onnxruntime" + config: ORTConfig + def __init__( - self, model: str, task: str, device: str, hub_kwargs: DictConfig + self, model: str, task: str, device: str, hub_kwargs: Dict[str, Any] ) -> None: super().__init__(model, task, device, hub_kwargs) - - from optimum.pipelines import ORT_SUPPORTED_TASKS + self.device = torch.device(device) if self.task == "stable-diffusion": self.ortmodel_class = get_class( @@ -189,60 +206,51 @@ def __init__( def configure(self, config: ORTConfig) -> None: super().configure(config) - import onnxruntime - # session options - 
self.session_options = onnxruntime.SessionOptions() - if config.intra_op_num_threads is not None: + session_options = SessionOptions() + if self.config.intra_op_num_threads is not None: LOGGER.info( - f"\t+ Setting onnxruntime session intra_op_num_threads({config.intra_op_num_threads})" + f"\t+ Setting intra_op_num_threads({config.intra_op_num_threads})" ) - self.session_options.intra_op_num_threads = config.intra_op_num_threads - if config.inter_op_num_threads is not None: - LOGGER.info( - f"\t+ Setting onnxruntime session inter_op_num_threads({config.inter_op_num_threads})" + self.config.session_options.intra_op_num_threads = ( + self.config.intra_op_num_threads ) - self.session_options.inter_op_num_threads = config.inter_op_num_threads - if config.enable_profiling: - LOGGER.info("\t+ Enabling onnxruntime profiling") - self.session_options.enable_profiling = True - - # provider options - self.provider_options = {} - if config.device_id is not None: + if self.config.inter_op_num_threads is not None: LOGGER.info( - f"\t+ Setting onnxruntime provider device_id({config.device_id})" + f"\t+ Setting inter_op_num_threads({config.inter_op_num_threads})" ) - self.provider_options["device_id"] = config.device_id + self.config.session_options.inter_op_num_threads = ( + self.config.inter_op_num_threads + ) + for key, value in self.config.session_options.items(): + setattr(session_options, key, value) + self.config.session_options = session_options # Set torch dtype - self.torch_dtype = ( - getattr(torch, config.torch_dtype) # in case of torch.dtype - if config.torch_dtype is not None and hasattr(torch, config.torch_dtype) - else config.torch_dtype - ) - LOGGER.info( - f"\t+ Using torch dtype({self.torch_dtype}) for weights loading and export" + self.config.torch_dtype = ( + getattr(torch, self.config.torch_dtype) # in case of torch.dtype + if self.config.torch_dtype is not None + and hasattr(torch, self.config.torch_dtype) + else self.config.torch_dtype ) with TemporaryDirectory() as tmpdirname: - if config.use_ortmodel: - if config.no_weights: - self.load_ortmodel_from_config(config, tmpdirname) + if self.config.use_ortmodel: + if self.config.no_weights: + self.load_ortmodel_from_config(tmpdirname) else: - self.load_ortmodel_from_pretrained(config, tmpdirname) + self.load_ortmodel_from_pretrained(tmpdirname) else: - if config.no_weights: - self.load_automodel_from_config(config) + if self.config.no_weights: + self.load_automodel_from_config() else: - self.load_automodel_from_pretrained(config) + self.load_automodel_from_pretrained() - def load_ortmodel_from_config(self, config: ORTConfig, tmpdirname: str) -> None: - LOGGER.info( - f"\t+ Loading model from config in {config.torch_dtype} on {self.device}" - ) + def load_ortmodel_from_config(self, tmpdirname: str) -> None: + LOGGER.info("\t+ Creating random weights model") + self.load_automodel_from_config() - self.load_automodel_from_config(config) + LOGGER.info("\t+ Exporting model to onnx") main_export( model_name_or_path=self.model, output=f"{tmpdirname}/exported_model", @@ -250,10 +258,9 @@ def load_ortmodel_from_config(self, config: ORTConfig, tmpdirname: str) -> None: # we're using but will add "-with-past" when possible task="auto", device=self.device.type, - fp16=self.torch_dtype == torch.float16, - optimize=config.auto_optimization, - no_post_process=not config.use_merged, - for_ort=True, + fp16=self.config.torch_dtype == torch.float16, + optimize=self.config.auto_optimization, + no_post_process=not self.config.use_merged, 
do_validation=False, **self.hub_kwargs, # we hijack the model instantiation and use our random weights model @@ -261,17 +268,17 @@ def load_ortmodel_from_config(self, config: ORTConfig, tmpdirname: str) -> None: ) self.delete_pretrained_model() - LOGGER.info("\t+ Loading exported model in onnxruntime") + LOGGER.info("\t+ Loading exported model with ORTModel") self.pretrained_model = self.ortmodel_class.from_pretrained( model_id=f"{tmpdirname}/exported_model", - session_options=self.session_options, - use_io_binding=config.use_io_binding, - provider=config.provider, - provider_options=self.provider_options, + session_options=self.config.session_options, + use_io_binding=self.config.use_io_binding, + provider=self.config.provider, + provider_options=self.config.provider_options, **( { - "use_merged": config.use_merged, - "use_cache": config.use_cache, + "use_merged": self.config.use_merged, + "use_cache": self.config.use_cache, } if self.is_text_generation_model() else {} @@ -280,31 +287,36 @@ def load_ortmodel_from_config(self, config: ORTConfig, tmpdirname: str) -> None: **self.hub_kwargs, ) - if config.optimization: + if self.config.optimization: raise NotImplementedError( - "Only AutoOptimization is supported when loading a model with random weights" + "Only AutoOptimization is supported when " + "loading a model with random weights" ) - if config.quantization or config.auto_quantization is not None: - self.quantize(config, tmpdirname) + if self.config.quantization or self.config.auto_quantization is not None: + self.quantize(tmpdirname) - def load_ortmodel_from_pretrained(self, config: ORTConfig, tmpdirname: str) -> None: - if self.torch_dtype is not None and self.torch_dtype != torch.float32: + def load_ortmodel_from_pretrained(self, tmpdirname: str) -> None: + if ( + self.config.torch_dtype is not None + and self.config.torch_dtype != torch.float32 + ): raise NotImplementedError( - "Loading from pretrained is only supported with torch_dtype float32 for now" + "Loading with ORTModel is only supported " + "with torch_dtype float32 for now" ) self.pretrained_model = self.ortmodel_class.from_pretrained( model_id=self.model, - session_options=self.session_options, - use_io_binding=config.use_io_binding, - provider=config.provider, - provider_options=self.provider_options, - export=config.export, + session_options=self.config.session_options, + use_io_binding=self.config.use_io_binding, + provider=self.config.provider, + provider_options=self.config.provider_options, + export=self.config.export, **( { - "use_merged": config.use_merged, - "use_cache": config.use_cache, + "use_merged": self.config.use_merged, + "use_cache": self.config.use_cache, } if self.is_text_generation_model() else {} @@ -312,28 +324,28 @@ def load_ortmodel_from_pretrained(self, config: ORTConfig, tmpdirname: str) -> N **self.hub_kwargs, ) - if config.optimization or config.auto_optimization is not None: - self.optimize(config, tmpdirname) + if self.config.optimization or self.config.auto_optimization is not None: + self.optimize(tmpdirname) - if config.quantization or config.auto_quantization is not None: - self.quantize(config, tmpdirname) + if self.config.quantization or self.config.auto_quantization is not None: + self.quantize(tmpdirname) - def optimize(self, config: ORTConfig, tmpdirname: str) -> None: - if config.auto_optimization is not None: - LOGGER.info(f"\t+ Using auto optimization {config.auto_optimization}") + def optimize(self, tmpdirname: str) -> None: + if self.config.auto_optimization is not None: + 
LOGGER.info(f"\t+ Using auto optimization {self.config.auto_optimization}") optimization_dict = OmegaConf.to_container( - config.auto_optimization_config, resolve=True + self.config.auto_optimization_config, resolve=True ) LOGGER.info("\t+ Setting auto optimization parameters:") for key, value in optimization_dict.items(): # type: ignore LOGGER.info(f"\t\t+ {key}: {value}") optimization_config = AutoOptimizationConfig.with_optimization_level( - optimization_level=config.auto_optimization, **optimization_dict + optimization_level=self.config.auto_optimization, **optimization_dict ) else: optimization_dict = OmegaConf.to_container( - config.optimization_config, resolve=True + self.config.optimization_config, resolve=True ) LOGGER.info("\t+ Setting optimization parameters:") for key, value in optimization_dict.items(): # type: ignore @@ -351,32 +363,28 @@ def optimize(self, config: ORTConfig, tmpdirname: str) -> None: LOGGER.info("\t+ Loading optimized model") self.pretrained_model = self.ortmodel_class.from_pretrained( model_id=f"{tmpdirname}/optimized", - session_options=self.session_options, - use_io_binding=config.use_io_binding, - provider=config.provider, - provider_options=self.provider_options, + session_options=self.config.session_options, + use_io_binding=self.config.use_io_binding, + provider=self.config.provider, + provider_options=self.config.provider_options, ) - def quantize(self, config: ORTConfig, tmpdirname: str) -> None: - if config.auto_quantization is not None: - LOGGER.info( - f"\t+ Using auto quantization {config.auto_quantization} and its config" - ) + def quantize(self, tmpdirname: str) -> None: + if self.config.auto_quantization is not None: + LOGGER.info(f"\t+ Using auto quantization {self.config.auto_quantization}") auto_quantization_config_class = getattr( - AutoQuantizationConfig, config.auto_quantization + AutoQuantizationConfig, self.config.auto_quantization ) quantization_dict = OmegaConf.to_container( - config.auto_quantization_config, resolve=True + self.config.auto_quantization_config, resolve=True ) quantization_dict = format_ort_quantization_dict(quantization_dict) quantization_config = auto_quantization_config_class(**quantization_dict) else: - LOGGER.info("\t+ Using manual quantization and its config") - from optimum_benchmark.backends.utils import format_ort_quantization_dict - + LOGGER.info("\t+ Using manual quantization") quantization_dict = OmegaConf.to_container( - config.quantization_config, resolve=True + self.config.quantization_config, resolve=True ) quantization_dict = format_ort_quantization_dict(quantization_dict) quantization_config = QuantizationConfig(**quantization_dict) @@ -388,22 +396,26 @@ def quantize(self, config: ORTConfig, tmpdirname: str) -> None: LOGGER.info(f"\t+ Quantizing {component}") quantizer = ORTQuantizer.from_pretrained(model_dir, file_name=component) - if config.calibration: - preprocess_class = get_class(config.calibration_config.preprocess_class) + if self.config.calibration: + preprocess_class = get_class( + self.config.calibration_config.preprocess_class + ) preprocess_function = preprocess_class(model_name_or_path=self.model) calibration_dataset = quantizer.get_calibration_dataset( - dataset_name=config.calibration_config.dataset_name, - num_samples=config.calibration_config.num_samples, - dataset_config_name=config.calibration_config.dataset_config_name, - dataset_split=config.calibration_config.dataset_split, + dataset_name=self.config.calibration_config.dataset_name, + 
num_samples=self.config.calibration_config.num_samples, + dataset_config_name=self.config.calibration_config.dataset_config_name, + dataset_split=self.config.calibration_config.dataset_split, preprocess_function=preprocess_function, ) - # Create the calibration configuration containing the parameters related to calibration. + # Create the calibration configuration + # containing the parameters related to calibration. calibration_config = AutoCalibrationConfig.minmax(calibration_dataset) - # Perform the calibration step: computes the activations quantization ranges + # Perform the calibration step: + # computes the activations quantization ranges calibration_tensors_range = quantizer.fit( dataset=calibration_dataset, calibration_config=calibration_config, @@ -420,29 +432,27 @@ def quantize(self, config: ORTConfig, tmpdirname: str) -> None: LOGGER.info("\t+ Loading quantized model") self.pretrained_model = self.ortmodel_class.from_pretrained( model_id=f"{tmpdirname}/quantized", - session_options=self.session_options, - use_io_binding=config.use_io_binding, - provider=config.provider, - provider_options=self.provider_options, + session_options=self.config.session_options, + use_io_binding=self.config.use_io_binding, + provider=self.config.provider, + provider_options=self.config.provider_options, ) - def load_automodel_from_config(self, config: ORTConfig) -> None: - from accelerate import init_empty_weights - + def load_automodel_from_config(self) -> None: with init_empty_weights(): self.pretrained_model = self.automodel_class.from_config( config=self.pretrained_config, - torch_dtype=self.torch_dtype, + torch_dtype=self.config.torch_dtype, trust_remote_code=self.hub_kwargs.get("trust_remote_code", False), ) self.pretrained_model.to_empty(device=self.device) randomize_weights(self.pretrained_model) - def load_automodel_from_pretrained(self, config: ORTConfig) -> None: + def load_automodel_from_pretrained(self) -> None: with self.device: self.pretrained_model = self.automodel_class.from_pretrained( pretrained_model_name_or_path=self.model, - torch_dtype=self.torch_dtype, + torch_dtype=self.config.torch_dtype, **self.hub_kwargs, ) @@ -451,37 +461,45 @@ def prepare_for_profiling(self, input_names: List[str]) -> None: LOGGER.info("\t+ Wrapping model inside profiler") self.pretrained_model = ORTProfilingWrapper(self.pretrained_model) - def prepare_for_training( + def forward(self, input: Dict[str, Tensor], **kwargs) -> "ModelOutput": + output = self.pretrained_model(**input, **kwargs) + + return output + + def generate(self, input: Dict[str, Tensor], **kwargs) -> "ModelOutput": + output = self.pretrained_model.generate(**input, **kwargs) + return output + + def train( self, - training_dataset: Dataset, - training_data_collator: Callable, + training_dataset: "Dataset", training_arguments: Dict[str, Any], - ) -> None: - LOGGER.info("Preparing model for training") - LOGGER.info("\t+ Wrapping model inside trainer") - - from optimum.onnxruntime import ORTTrainer, ORTTrainingArguments + training_callbacks: List["TrainerCallback"], + training_data_collator: Callable, + ) -> "TrainerState": + LOGGER.info("\t+ Setting dataset format to `torch`.") + training_dataset.set_format( + type="torch", columns=list(training_dataset.features.keys()) + ) + LOGGER.info( + "\t+ Wrapping training arguments with " + "optimum.onnxruntime.ORTTrainingArguments" + ) training_arguments = ORTTrainingArguments(**training_arguments) - self.trainer = ORTTrainer( + + LOGGER.info("\t+ Wrapping model with 
optimum.onnxruntime.ORTTrainer") + trainer = ORTTrainer( model=self.pretrained_model, args=training_arguments, + callbacks=training_callbacks, train_dataset=training_dataset, data_collator=training_data_collator, - feature=self.task, ) - def forward(self, input: Dict[str, Tensor], **kwargs) -> Tensor: - output = self.pretrained_model(**input, **kwargs)[0] - - return output - - def generate(self, input: Dict[str, Tensor], **kwargs) -> Tensor: - output = self.pretrained_model.generate(**input, **kwargs)[0] - return output - - def train(self) -> None: - LOGGER.info("Training model") - results = self.trainer.train() + LOGGER.info("\t+ Starting training") + trainer.train() + LOGGER.info("\t+ Training finished successfully") + trainer_state = trainer.state - return results + return trainer_state diff --git a/optimum_benchmark/backends/openvino.py b/optimum_benchmark/backends/openvino.py index b60ef7f3a..6e83ed756 100644 --- a/optimum_benchmark/backends/openvino.py +++ b/optimum_benchmark/backends/openvino.py @@ -1,19 +1,28 @@ +from typing import Dict, Optional, Any, TYPE_CHECKING +from tempfile import TemporaryDirectory +from dataclasses import dataclass +from logging import getLogger + + import torch import inspect from torch import Tensor -from logging import getLogger -from omegaconf import DictConfig -from dataclasses import dataclass, field +from omegaconf import OmegaConf from hydra.utils import get_class -from typing import Dict, Optional -from tempfile import TemporaryDirectory +from optimum.intel.openvino.utils import _HEAD_TO_AUTOMODELS +from openvino.runtime import __version__ as openvino_version +from optimum.intel import OVConfig as OVQuantizationConfig, OVQuantizer + +if TYPE_CHECKING: + from transformers.modeling_outputs import ModelOutput -try: - from openvino.runtime import __version__ as openvino_version -except ImportError: - openvino_version = "Not installed" -from optimum_benchmark.backends.base import Backend, BackendConfig +from .base import Backend, BackendConfig +from .utils.openvino_utils import ( + DEFAULT_QUANTIZATION_CONFIG, + DEFAULT_CALIBRATION_CONFIG, +) + LOGGER = getLogger("openvino") @@ -31,40 +40,45 @@ class OVConfig(BackendConfig): torch_dtype: Optional[str] = None # compiling options - dynamic_shapes: bool = True reshape: bool = False half: bool = False # quantization options quantization: bool = False - quantization_config: Dict = field( - default_factory=lambda: { - "compression": None, - "input_info": None, - "save_onnx_model": False, - } - ) + quantization_config: Optional[Dict[str, Any]] = None # calibration options - calibration_config: Dict = field( - default_factory=lambda: { - "dataset_name": "glue", - "num_samples": 300, - "dataset_config_name": "sst2", - "dataset_split": "train", - "preprocess_batch": True, - "preprocess_class": "optimum_benchmark.preprocessors.glue.GluePreprocessor", - } - ) + calibration: bool = True + calibration_config: Optional[Dict[str, Any]] = None + + def __post_init__(self): + assert self.torch_dtype is None or self.torch_dtype == "float32", ( + "Only float32 is supported for torch_dtype in openvino backend. 
" + f"Got {self.torch_dtype}" + ) + + if self.quantization: + self.quantization_config = OmegaConf.merge( + self.quantization_config or {}, + DEFAULT_QUANTIZATION_CONFIG, + ) + + if self.calibration: + self.calibration_config = OmegaConf.merge( + self.calibration_config or {}, + DEFAULT_CALIBRATION_CONFIG, + ) class OVBackend(Backend): + name: str = "openvino" + config: OVConfig + def __init__( - self, model: str, task: str, device: str, hub_kwargs: DictConfig + self, model: str, task: str, device: str, hub_kwargs: Dict[str, Any] ) -> None: super().__init__(model, task, device, hub_kwargs) - - from optimum.intel.openvino.utils import _HEAD_TO_AUTOMODELS + self.device = torch.device(device) self.ovmodel_class = get_class( f"optimum.intel.openvino.{_HEAD_TO_AUTOMODELS[self.task]}" @@ -79,82 +93,73 @@ def configure(self, config: OVConfig) -> None: super().configure(config) # Set torch dtype - self.torch_dtype = ( - getattr(torch, config.torch_dtype) # in case of torch.dtype - if config.torch_dtype is not None and hasattr(torch, config.torch_dtype) - else None # in case of string or None - ) - LOGGER.info( - f"\t+ Using torch dtype({self.torch_dtype}) for weights loading and export" + self.config.torch_dtype = ( + getattr(torch, self.config.torch_dtype) + if self.config.torch_dtype is not None + else None ) + if self.config.quantization: + self.config.quantization_config = OVQuantizationConfig( + **self.config.quantization_config, + ) + with TemporaryDirectory() as tmpdirname: - if config.no_weights: + if self.config.no_weights: raise NotImplementedError( "no_weights is not supported for openvino backend" ) else: - self.load_model_from_pretrained(config) + self.load_model_from_pretrained() - if config.quantization: - self.quantize(config, tmpdirname) + if self.config.quantization: + self.quantize(tmpdirname) - self.reshape = config.reshape - if self.reshape: - LOGGER.info("\t+ Model input will be reshaped and compiled") - - self.half = config.half - if self.half: - LOGGER.info("\t+ Model will be converted to half precision and compiled") - - def load_model_from_pretrained(self, config: OVConfig) -> None: - if self.torch_dtype is not None and self.torch_dtype != torch.float32: - raise NotImplementedError( - "Loading from pretrained is only supported with torch_dtype float32 for now" - ) + def load_model_from_pretrained(self) -> None: self.pretrained_model = self.ovmodel_class.from_pretrained( model_id=self.model, - use_merged=config.use_merged, - export=config.export, + use_merged=self.config.use_merged, + export=self.config.export, **self.hub_kwargs, ) - def quantize(self, config: OVConfig, tmpdirname: str) -> None: + def quantize(self, tmpdirname: str) -> None: LOGGER.info("\t+ Attempting quantization") - from optimum.intel import OVConfig as OVQuantizationConfig, OVQuantizer - model = self.automodel_class.from_pretrained(self.model, **self.hub_kwargs) quantizer = OVQuantizer.from_pretrained(model) - quantization_config = OVQuantizationConfig( - **config.quantization_config, - ) - preprocess_class = get_class(config.calibration_config.preprocess_class) + preprocess_class = get_class(self.config.calibration_config.preprocess_class) preprocess_function = preprocess_class(model_name_or_path=self.model) calibration_dataset = quantizer.get_calibration_dataset( - dataset_name=config.calibration_config.dataset_name, - num_samples=config.calibration_config.num_samples, - dataset_config_name=config.calibration_config.dataset_config_name, - dataset_split=config.calibration_config.dataset_split, + 
dataset_name=self.config.calibration_config.dataset_name, + num_samples=self.config.calibration_config.num_samples, + dataset_config_name=self.config.calibration_config.dataset_config_name, + dataset_split=self.config.calibration_config.dataset_split, preprocess_function=preprocess_function, ) quantizer.quantize( - save_directory=f"{tmpdirname}/quantized", - quantization_config=quantization_config, calibration_dataset=calibration_dataset, + save_directory=f"{tmpdirname}/quantized", + quantization_config=self.config.quantization_config, + # defaults + batch_size=1, + data_collator=None, + remove_unused_columns=True, + weights_only=False, ) self.delete_pretrained_model() LOGGER.info("\t+ Loading quantized model") self.pretrained_model = self.ovmodel_class.from_pretrained( model_id=f"{tmpdirname}/quantized", + use_merged=self.config.use_merged, ) def prepare_for_inference(self, input_shapes: Dict[str, int]) -> None: - if self.reshape: + if self.config.reshape: static_shapes = { key: value for key, value in input_shapes.items() @@ -163,21 +168,21 @@ def prepare_for_inference(self, input_shapes: Dict[str, int]) -> None: LOGGER.info(f"\t+ Reshaping model with static shapes: {static_shapes}") self.pretrained_model.reshape(**static_shapes) - if self.half: - LOGGER.info(f"\t+ Converting model to half precision") + if self.config.half: + LOGGER.info("\t+ Converting model to half precision") self.pretrained_model.half() - if self.reshape or self.half: - LOGGER.info(f"\t+ Compiling model") + if self.config.reshape or self.config.half: + LOGGER.info("\t+ Compiling model") self.pretrained_model.compile() - def forward(self, input: Dict[str, Tensor], **kwargs) -> Tensor: - output = self.pretrained_model(**input, **kwargs)[0] + def forward(self, input: Dict[str, Tensor], **kwargs) -> "ModelOutput": + output = self.pretrained_model(**input, **kwargs) return output - def generate(self, input: Dict[str, Tensor], **kwargs) -> Tensor: - output = self.pretrained_model.generate(**input, **kwargs)[0] + def generate(self, input: Dict[str, Tensor], **kwargs) -> "ModelOutput": + output = self.pretrained_model.generate(**input, **kwargs) return output diff --git a/optimum_benchmark/backends/pytorch.py b/optimum_benchmark/backends/pytorch.py index 72877a5f9..b6c84f181 100644 --- a/optimum_benchmark/backends/pytorch.py +++ b/optimum_benchmark/backends/pytorch.py @@ -1,30 +1,38 @@ from typing import Any, Callable, Dict, List, Optional, TYPE_CHECKING -from omegaconf import DictConfig, OmegaConf -from dataclasses import dataclass, field +from dataclasses import dataclass from logging import getLogger -from datasets import Dataset -from torch import Tensor -import torch import os -import time +import gc -from torch.distributed.launcher.api import elastic_launch, LaunchConfig -from torch.distributed.elastic.multiprocessing import Std -import logging.config -from transformers.utils import ModelOutput -from transformers import Trainer, TrainingArguments, TrainerCallback +import torch +from torch import Tensor +from accelerate import init_empty_weights +from omegaconf import DictConfig, OmegaConf +from torch import __version__ as torch_version from transformers.utils.fx import symbolic_trace -from transformers.trainer_utils import TrainOutput +from transformers import Trainer, TrainingArguments from optimum.bettertransformer import BetterTransformer +from transformers import BitsAndBytesConfig, GPTQConfig +from torch.distributed.elastic.multiprocessing.errors import record +from torch.distributed.launcher.api import 
elastic_launch, LaunchConfig -from optimum_benchmark.backends.base import Backend, BackendConfig -from optimum_benchmark.profilers.fx_profiler import FXProfilingWrapper if TYPE_CHECKING: - from transformers import TrainerState, TrainerControl + from datasets import Dataset + from transformers.utils import ModelOutput + from transformers import TrainerState, TrainerCallback + + +from .base import Backend, BackendConfig +from ..profilers.fx_profiler import FXProfilingWrapper +from .utils.pytorch_utils import ( + DEFAULT_COMPILE_CONFIG, + DEFAULT_DDP_CONFIG, + randomize_weights, + get_worker_logger, +) -WARMUP_STEPS = 40 # bachend logger LOGGER = getLogger("pytorch") @@ -38,32 +46,25 @@ @dataclass class PyTorchConfig(BackendConfig): name: str = "pytorch" - version: str = torch.__version__ + version: str = torch_version _target_: str = "optimum_benchmark.backends.pytorch.PyTorchBackend" # load options no_weights: bool = False - torch_dtype: Optional[str] = None device_map: Optional[str] = None + torch_dtype: Optional[str] = None # quantization options - load_in_8bit: bool = False - load_in_4bit: bool = False + quantization_strategy: Optional[str] = None + quantization_config: Optional[Dict[str, Any]] = None # optimization options bettertransformer: bool = False # compilation options torch_compile: bool = False - torch_compile_config: Dict = field(default_factory=lambda: { - "fullgraph": False, - "dynamic": False, - "backend": "inductor", - "mode": None, - "options": None, - "disable": False, - } - ) + torch_compile_kwargs: Optional[Dict] = None + # amp options amp_autocast: bool = False amp_dtype: Optional[str] = None @@ -72,10 +73,88 @@ class PyTorchConfig(BackendConfig): disable_grad: bool = "${is_inference:${benchmark.name}}" # type: ignore eval_mode: bool = "${is_inference:${benchmark.name}}" # type: ignore + # training options + use_ddp: bool = False + ddp_config: Optional[Dict[str, Any]] = None + + def __post_init__(self): + """ + Here we perform checks and transformations on the config. + But we never modify the types of the config values. + """ + + CUDA_VISIBLE_DEVICES = os.environ.get("CUDA_VISIBLE_DEVICES", None) + + if self.torch_compile: + self.torch_compile_kwargs = OmegaConf.merge( + self.torch_compile_kwargs + if self.torch_compile_kwargs is not None + else {}, + DEFAULT_COMPILE_CONFIG, + ) + + if self.device_map is not None: + assert self.device_map in ["auto", "sequential"], ( + "`device_map` must be one of ['auto', 'sequential']. " + "are supported in Optimum-Bnechmark. " + f"Got {type(self.device_map)} instead." + ) + assert ( + CUDA_VISIBLE_DEVICES is not None + ), "`device_map` can only be used when CUDA_VISIBLE_DEVICES is set." + + if self.torch_dtype is not None: + assert self.torch_dtype in ["bfloat16", "float16", "float32", "auto"], ( + "`torch_dtype` must be one of ['bfloat16', 'float16', 'float32', " + f"'auto']. Got {self.torch_dtype} instead." + ) + + if self.amp_dtype is not None: + assert self.amp_dtype in ["bfloat16", "float16", "float32"], ( + "`amp_dtype` must be one of ['bfloat16', 'float16', 'float32']. " + f"Got {self.amp_dtype} instead." + ) + + if self.quantization_strategy is not None: + assert self.quantization_strategy in ["bnb", "gptq"], ( + "`quantization_strategy` must be one of ['bnb', 'gptq']. " + f"Got {self.quantization_strategy} instead." 
+ ) + if self.quantization_strategy == "gptq": + bits = self.quantization_config.get("bits", None) + assert bits is not None, ( + "`quantization_config.bits` must be provided " + "when using 'gptq' quantization strategy." + ) + else: + self.quantization_config = None + + if self.use_ddp: + self.ddp_config = OmegaConf.merge( + self.ddp_config if self.ddp_config is not None else {}, + DEFAULT_DDP_CONFIG, + ) + + # TODO: support multi-node training. + assert self.ddp_config.max_nodes == 1, ( + "Currently, PyTorch DDP training benchmark " + "only supports training on a single node." + ) + + assert ( + CUDA_VISIBLE_DEVICES is not None + ), "Pytorch DDP training benchmark requires CUDA_VISIBLE_DEVICES to be set." + else: + self.ddp_config = None + class PyTorchBackend(Backend): + name: str = "pytorch" + config: PyTorchConfig + def __init__(self, model: str, task: str, device: str, hub_kwargs: DictConfig): super().__init__(model, task, device, hub_kwargs) + self.device = torch.device(device) LOGGER.info( f"\t+ Infered AutoModel class {self.automodel_class.__name__} " @@ -86,94 +165,132 @@ def configure(self, config: PyTorchConfig) -> None: super().configure(config) # environment options - if config.inter_op_num_threads is not None: + if self.config.inter_op_num_threads is not None: LOGGER.info( - f"\t+ Setting pytorch inter_op_num_threads({config.inter_op_num_threads}))" + "\t+ Setting pytorch " + f"inter_op_num_threads({self.config.inter_op_num_threads}))" ) - torch.set_num_threads(config.inter_op_num_threads) - - if config.intra_op_num_threads is not None: + torch.set_num_threads(self.config.inter_op_num_threads) + if self.config.intra_op_num_threads is not None: LOGGER.info( - f"\t+ Setting pytorch intra_op_num_threads({config.intra_op_num_threads}))" + "\t+ Setting pytorch " + f"intra_op_num_threads({self.config.intra_op_num_threads}))" ) - torch.set_num_interop_threads(config.intra_op_num_threads) + torch.set_num_interop_threads(self.config.intra_op_num_threads) + + # Load config + if self.config.torch_dtype is not None: + if hasattr(torch, self.config.torch_dtype): + self.config.torch_dtype = getattr(torch, self.config.torch_dtype) - # Disable gradients - if config.disable_grad: + # Inference config + if self.config.disable_grad: LOGGER.info("\t+ Disabling gradients") # everything that comes after this will have its gradients disabled torch.set_grad_enabled(False) - - # Set torch dtype - self.torch_dtype = ( - getattr(torch, config.torch_dtype) # in case of torch.dtype - if config.torch_dtype is not None and hasattr(torch, config.torch_dtype) - else config.torch_dtype # in case of string or None - ) + if self.config.amp_dtype is not None: + if hasattr(torch, self.config.amp_dtype): + self.config.amp_dtype = getattr(torch, self.config.amp_dtype) + + # Quantization config + if self.config.quantization_strategy is not None: + if self.config.quantization_strategy == "gptq": + self.config.quantization_config = GPTQConfig( + **self.config.quantization_config + ) + elif self.config.quantization_strategy == "bnb": + self.config.quantization_config = BitsAndBytesConfig( + **self.config.quantization_config + ) # Load model - if config.no_weights: - self.load_model_from_config(config) + if self.config.no_weights: + self.load_model_from_config() else: - self.load_model_from_pretrained(config) + self.load_model_from_pretrained() # Turn on eval mode - if config.eval_mode and self.task not in [ - "stable-diffusion", - "stable-diffusion-xl", - ]: + if not self.is_diffusion_pipeline() and 
self.config.eval_mode:
             LOGGER.info("\t+ Turning on eval mode")
             self.pretrained_model.eval()
 
-        # Turn on better transformer inference
-        if config.bettertransformer:
+        # Turn on BetterTransformer optimizations
+        if self.config.bettertransformer:
             LOGGER.info("\t+ Using optimum.bettertransformer")
-            self.pretrained_model = BetterTransformer.transform(  # type: ignore
-                self.pretrained_model, keep_original_model=False
+            self.pretrained_model = BetterTransformer.transform(
+                self.pretrained_model,
+                keep_original_model=False,
             )
 
         # Compile model
-        if config.torch_compile:
-            LOGGER.info("\t+ Using torch.compile on forward pass")
-            self.pretrained_model.forward = torch.compile(
-                self.pretrained_model.forward,
-                **config.torch_compile_config,
-            )
+        if self.config.torch_compile:
+            if self.is_diffusion_pipeline():
+                LOGGER.info("\t+ Using torch.compile on unet forward pass")
+                self.pretrained_model.unet = torch.compile(
+                    self.pretrained_model.unet,
+                    **self.config.torch_compile_kwargs,
+                )
+            else:
+                LOGGER.info("\t+ Using torch.compile on forward pass")
+                self.pretrained_model.forward = torch.compile(
+                    self.pretrained_model.forward,
+                    **self.config.torch_compile_kwargs,
+                )
 
-        # pytorch autocast
-        if config.amp_autocast:
-            LOGGER.info(
-                f"\t+ Enabling Automatic Mixed Precision with dtype: {self.amp_dtype}"
-            )
-        self.amp_autocast = config.amp_autocast
-        self.amp_dtype = (
-            getattr(torch, config.amp_dtype)  # in case of torch.dtype
-            if config.amp_dtype is not None and hasattr(torch, config.amp_dtype)
-            else None
-        )
+        # DDP config
+        if self.config.use_ddp:
+            self.config.ddp_config = LaunchConfig(**self.config.ddp_config)
 
-    def load_model_from_config(self, config: PyTorchConfig) -> None:
-        LOGGER.info(
-            f"\t+ Loading model from config in dtype : "
-            f"{config.torch_dtype if config.torch_dtype is not None else 'default'} "
-            "on meta device"
-        )
+    def load_model_from_pretrained(self) -> None:
+        LOGGER.info(f"\t+ Loading pretrained model weights on device: {self.device}")
+        if self.is_diffusion_pipeline():
+            self.pretrained_model = self.automodel_class.from_pretrained(
+                pretrained_model_name_or_path=self.model,
+                torch_dtype=self.config.torch_dtype,
+                device_map=self.config.device_map,
+                **self.hub_kwargs,
+            )
+            if self.config.device_map is None:
+                # Diffusers does not support device_map being a torch.device,
+                # thus if not provided we move to device here.
+                self.pretrained_model.to(self.device)
+        else:
+            if self.config.device_map is not None:
+                self.pretrained_model = self.automodel_class.from_pretrained(
+                    pretrained_model_name_or_path=self.model,
+                    quantization_config=self.config.quantization_config,
+                    torch_dtype=self.config.torch_dtype,
+                    device_map=self.config.device_map,
+                    **self.hub_kwargs,
+                )
+            else:
+                with self.device:
+                    self.pretrained_model = self.automodel_class.from_pretrained(
+                        pretrained_model_name_or_path=self.model,
+                        quantization_config=self.config.quantization_config,
+                        torch_dtype=self.config.torch_dtype,
+                        **self.hub_kwargs,
+                    )
 
-        from accelerate import init_empty_weights
-        from optimum_benchmark.backends.utils import (
-            randomize_weights,
-            quantize_dummy_model,
-        )
+    def load_model_from_config(self) -> None:
+        # TODO: create no_weights tests
 
         LOGGER.info("\t+ Initializing empty weights model on device: meta")
         with init_empty_weights():
             self.pretrained_model = self.automodel_class.from_config(
                 config=self.pretrained_config,
-                torch_dtype=self.torch_dtype,
+                torch_dtype=self.config.torch_dtype,
                 trust_remote_code=self.hub_kwargs.get("trust_remote_code", False),
             )
 
-        if config.load_in_8bit or config.load_in_4bit:
+        if self.config.quantization_strategy is None:
+            LOGGER.info(f"\t+ Materializing model on device: {self.device}")
+            self.pretrained_model.to_empty(device=self.device)
+
+            LOGGER.info("\t+ Randomizing model weights")
+            randomize_weights(self.pretrained_model)
+            self.pretrained_model.tie_weights()
+        else:
             LOGGER.info("\t+ Materializing model on device: cpu")
             self.pretrained_model.to_empty(device="cpu")
 
@@ -181,84 +298,39 @@ def load_model_from_config(self, config: PyTorchConfig) -> None:
             randomize_weights(self.pretrained_model)
             self.pretrained_model.tie_weights()
 
-            from accelerate.utils import BnbQuantizationConfig
+            if self.config.quantization_strategy == "bnb":
+                # already converted to a BitsAndBytesConfig in configure()
+                quantization_config = self.config.quantization_config
+            elif self.config.quantization_strategy == "gptq":
+                raise NotImplementedError(
+                    "GPTQ requires a pretrained model to be loaded. "
+                    "`no_weights` option is not supported with GPTQ."
+ ) + + from accelerate.utils import BnbQuantizationConfig, load_and_quantize_model + # translating transformers bnb config to accelerate bnb config bnb_quantization_config = BnbQuantizationConfig( - load_in_4bit=config.load_in_4bit, - load_in_8bit=config.load_in_8bit, + load_in_4bit=quantization_config.load_in_4bit, + load_in_8bit=quantization_config.load_in_8bit, + # with dummy_weights, we set this to 0 for reproducibility llm_int8_threshold=0, - torch_dtype=self.torch_dtype, + torch_dtype=self.config.torch_dtype, keep_in_fp32_modules=self.pretrained_model.keep_in_fp32_modules if hasattr(self.pretrained_model, "keep_in_fp32_modules") else None, ) - LOGGER.info("\t+ Quantizing model while on device: cpu") - self.pretrained_model = quantize_dummy_model( + LOGGER.info("\t+ Quantizing model while on cpu and dispatching to device") + self.pretrained_model = load_and_quantize_model( model=self.pretrained_model, bnb_quantization_config=bnb_quantization_config, + device_map=self.config.device_map + if self.config.device_map is not None + else self.device, ) - LOGGER.info(f"\t+ Moving model to device: {self.device}") - self.pretrained_model.to(self.device) - self.pretrained_model.tie_weights() - - else: - LOGGER.info(f"\t+ Materializing model on device: {self.device}") - self.pretrained_model.to_empty(device=self.device) - - LOGGER.info("\t+ Randomizing model weights") - randomize_weights(self.pretrained_model) - self.pretrained_model.tie_weights() - - def load_model_from_pretrained(self, config: PyTorchConfig) -> None: - LOGGER.info( - f"\t+ Loading pretrained model weights in dtype: {config.torch_dtype} on device: {self.device}" - ) - if self.task not in ["stable-diffusion", "stable-diffusion-xl"]: - kwargs = {} - if config.load_in_8bit: - kwargs["load_in_8bit"] = config.load_in_8bit - kwargs["llm_int8_threshold"] = 0 - elif config.load_in_4bit: - kwargs["load_in_4bit"] = config.load_in_4bit - - if config.device_map: - kwargs["device_map"] = config.device_map if config.device_map is not None else self.device - - self.pretrained_model = self.automodel_class.from_pretrained( - pretrained_model_name_or_path=self.model, - torch_dtype=self.torch_dtype, - **kwargs, - **self.hub_kwargs, - ) - else: - # When a device_map is not specified, we do not rely on accelerate to load the load and rather try PyTorch-native context. - with self.device: - self.pretrained_model = self.automodel_class.from_pretrained( - pretrained_model_name_or_path=self.model, - torch_dtype=self.torch_dtype, - **kwargs, - **self.hub_kwargs, - ) - else: - self.pretrained_model = self.automodel_class.from_pretrained( - pretrained_model_name_or_path=self.model, - torch_dtype=self.torch_dtype, - device_map=config.device_map, - **self.hub_kwargs, - ) - if config.device_map is None: - # Diffusers does not support device_map being a torch.device, thus if not provided, move to device here. 
- self.pretrained_model.to(self.device) - - def prepare_for_profiling( - self, - input_names: List[str], - input_shapes: Dict[str, int], - ) -> None: + def prepare_for_profiling(self, input_names: List[str]) -> None: LOGGER.info("Preparing model for profiling") - LOGGER.info("\t+ Symbolicly tracing model") self.pretrained_model = symbolic_trace( model=self.pretrained_model, @@ -268,128 +340,75 @@ def prepare_for_profiling( LOGGER.info("\t+ Wrapping model with FXProfilingWrapper") self.pretrained_model = FXProfilingWrapper(self.pretrained_model) - def forward(self, input: Dict[str, Tensor], **kwargs) -> ModelOutput: + def forward(self, input: Dict[str, Tensor], **kwargs) -> "ModelOutput": with torch.autocast( + enabled=self.config.amp_autocast, device_type=self.device.type, - dtype=self.amp_dtype, - enabled=self.amp_autocast, + dtype=self.config.amp_dtype, ): output = self.pretrained_model(**input, **kwargs) return output - def generate(self, input: Dict[str, Tensor], **kwargs) -> ModelOutput: + def generate(self, input: Dict[str, Tensor], **kwargs) -> "ModelOutput": with torch.autocast( + enabled=self.config.amp_autocast, device_type=self.device.type, - dtype=self.amp_dtype, - enabled=self.amp_autocast, + dtype=self.config.amp_dtype, ): output = self.pretrained_model.generate(**input, **kwargs) return output - def train(self) -> None: - raise Exception("For PyTorch backend training, please call backend.run_pytorch_training.") - - def run_pytorch_training(self, training_config, training_arguments, training_dataset, training_data_collator): - LOGGER.info("Running training benchmark") - - # Converting from DictConfig to Dict is required to avoid a warning with DDP: - # `[W CudaIPCTypes.cpp:15] Producer process has been terminated before all shared CUDA tensors released. See Note [Sharing CUDA tensors]` - training_arguments_dict = OmegaConf.to_container(training_arguments, resolve=True) - - if training_config.use_ddp: - # TODO: support multi-node training. Hydra is probably not the good infra for that though. - if training_config.ddp_config.max_nodes != 1: - raise ValueError("PyTorch DDP training benchmark currently supports only training on a single node.") - - launch_config = LaunchConfig(**training_config.ddp_config) - LOGGER.info(f"PyTorch DDP launch config: {launch_config}") - - # TODO: The backend instance can not be passed here (cannot pickle 'weakref' object) so the nn.Module is passed directly. - # It is not clear who is using weakref though. + @record + def train( + self, + training_dataset: "Dataset", + training_arguments: Dict[str, Any], + training_callbacks: List["TrainerCallback"], + training_data_collator: Callable, + ) -> "TrainerState": + args = ( + self.config.use_ddp, + self.pretrained_model, + training_dataset, + training_arguments, + training_callbacks, + training_data_collator, + ) + + if self.config.use_ddp: + # For DDP, we log only the stats from the first rank as transformers does. + # It could make sense to log for all ranks. results = elastic_launch( - config=launch_config, - entrypoint=ddp_callable, - )((self.pretrained_model, training_dataset, training_arguments_dict, training_data_collator, True)) - - # For DDP, we log only the stats from the first rank as transformers does. It could make sense to log for all ranks. - results = results[0] + config=self.config.ddp_config, + entrypoint=training_worker, + )(args)[0] else: - # For simple Data Parallel, we can still use ddp_callable, simply not wrapped by the elastic_launch class. 
- results = ddp_callable((self.pretrained_model, training_dataset, training_arguments_dict, training_data_collator, False)) - + # For DP, we can still use training_worker, + # simply not wrapped by the elastic_launch class. + results = training_worker(args) + return results + def clean(self) -> None: + super().clean() -def get_logger(name: Optional[str] = None, log_all: bool = False): - """ - PyTorch DDP subprocesses do not inherit from Hydra logger. Thus, we need to reconfigure the logger for the workers. - """ - if os.environ["RANK"] == "0" or log_all: - # TODO: also configure logging for other ranks - hydra_conf = OmegaConf.load('.hydra/hydra.yaml') - logging.config.dictConfig(OmegaConf.to_container(hydra_conf.hydra.job_logging, resolve=True)) - return getLogger(name) - -# Adapted from transformers.trainer_utils.speed_metrics -def speed_metrics(trainer): - """ - Measure and return speed performance metrics. - """ - # Reference: https://github.com/huggingface/transformers/blob/v4.31.0/src/transformers/trainer.py#L1559 - total_train_batch_size = trainer._train_batch_size * trainer.args.gradient_accumulation_steps * trainer.args.world_size - result = {} - - # Warmup metrics. - num_warmup_steps = WARMUP_STEPS - num_warmup_samples = num_warmup_steps * total_train_batch_size - warmup_runtime = trainer.state.warmup_end - trainer.state.warmup_start - - warmup_samples_per_second = num_warmup_samples / warmup_runtime - result["warmup_runtime"] = warmup_runtime - result["warmup_samples_per_second"] = round(warmup_samples_per_second, 3) - warmup_steps_per_second = num_warmup_steps / warmup_runtime - result["warmup_steps_per_second"] = round(warmup_steps_per_second, 3) - - # Training metrics. - num_train_steps = trainer.state.max_steps - WARMUP_STEPS - num_train_samples = num_train_steps * total_train_batch_size - train_runtime = trainer.state.training_end - trainer.state.training_start - - train_samples_per_second = num_train_samples / train_runtime - result["train_runtime"] = train_runtime - result["train_samples_per_second"] = round(train_samples_per_second, 3) - train_steps_per_second = num_train_steps / train_runtime - result["train_steps_per_second"] = round(train_steps_per_second, 3) - - return result - -class MeasurementCallback(TrainerCallback): - def on_step_begin(self, args: TrainingArguments, state: "TrainerState", control: "TrainerControl", **kwargs): - if state.global_step == 0: - # This check is here because max_steps is set only once the training is launched, thus we can not check before calling trainer.train(). - if state.max_steps <= WARMUP_STEPS: - raise ValueError(f"Total training steps {state.max_steps} is smaller than the number of warmup steps {WARMUP_STEPS}. 
Please increase the total number of steps (for example by increasing the dataset size).") - - state.warmup_start = time.time_ns() * 1e-9 - elif state.global_step == WARMUP_STEPS: - state.warmup_end = time.time_ns() * 1e-9 - state.training_start = time.time_ns() * 1e-9 - elif state.global_step == state.max_steps - 1: - state.training_end = time.time_ns() * 1e-9 - elif state.global_step > state.max_steps - 1: - raise ValueError("global_step > state.max_steps - 1") - -def ddp_callable(args): - pretrained_model = args[0] - training_dataset = args[1] - training_arguments = args[2] - training_data_collator = args[3] - use_ddp = args[4] + if self.device.type == "cuda": + torch.cuda.empty_cache() + gc.collect() + + +def training_worker(args) -> "TrainerState": + use_ddp = args[0] + pretrained_model = args[1] + training_dataset = args[2] + training_arguments = args[3] + training_callbacks = args[4] + training_data_collator = args[5] if use_ddp: - LOGGER_WORKER = get_logger("training-ddp-worker", log_all=False) + LOGGER_WORKER = get_worker_logger("pytorch-ddp-worker", log_all=False) env_variables = [ "RANK", @@ -398,24 +417,35 @@ def ddp_callable(args): "MASTER_PORT", "TORCHELASTIC_MAX_RESTARTS", ] + + LOGGER_WORKER.info("Initializing DDP worker") for env_var in env_variables: LOGGER_WORKER.info(f"{env_var}: {os.environ.get(env_var)}") else: LOGGER_WORKER = LOGGER - LOGGER_WORKER.info("\t+ Wrapping model with transformers.Trainer") + LOGGER_WORKER.info("\t+ Setting dataset format to `torch`.") + training_dataset.set_format( + type="torch", columns=list(training_dataset.features.keys()) + ) + + LOGGER_WORKER.info( + "\t+ Wrapping training arguments with transformers.TrainingArguments" + ) training_arguments = TrainingArguments(**training_arguments) + LOGGER_WORKER.info("\t+ Wrapping model with transformers.Trainer") trainer = Trainer( model=pretrained_model, + args=training_arguments, + callbacks=training_callbacks, train_dataset=training_dataset, data_collator=training_data_collator, - args=training_arguments, - callbacks=[MeasurementCallback] ) - - LOGGER_WORKER.info("Training model") + + LOGGER_WORKER.info("\t+ Starting training") trainer.train() - results = speed_metrics(trainer) + LOGGER_WORKER.info("\t+ Training finished successfully") + trainer_state = trainer.state - return results + return trainer_state diff --git a/optimum_benchmark/backends/utils/__init__.py b/optimum_benchmark/backends/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/optimum_benchmark/backends/utils/base_utils.py b/optimum_benchmark/backends/utils/base_utils.py new file mode 100644 index 000000000..7f357be9d --- /dev/null +++ b/optimum_benchmark/backends/utils/base_utils.py @@ -0,0 +1,92 @@ +from typing import Any, Dict, Optional, Union + +from diffusers import DiffusionPipeline +from transformers import ( + ProcessorMixin, + PretrainedConfig, + PreTrainedTokenizer, + ImageProcessingMixin, + FeatureExtractionMixin, +) + + +PreTrainedProcessor = Union[ + PreTrainedTokenizer, + ImageProcessingMixin, + FeatureExtractionMixin, + ProcessorMixin, +] + + +def extract_shapes_from_diffusion_pipeline( + pipeline: DiffusionPipeline, +) -> Dict[str, Any]: + # this is the only way I found to extract a diffusion pipeline's "input" shapes + shapes = {} + if hasattr(pipeline, "vae_encoder") and hasattr(pipeline.vae_encoder, "config"): + shapes["num_channels"] = pipeline.vae_encoder.config["out_channels"] + shapes["height"] = pipeline.vae_encoder.config["sample_size"] + shapes["width"] = 
pipeline.vae_encoder.config["sample_size"] + elif hasattr(pipeline, "vae") and hasattr(pipeline.vae, "config"): + shapes["num_channels"] = pipeline.vae.config.out_channels + shapes["height"] = pipeline.vae.config.sample_size + shapes["width"] = pipeline.vae.config.sample_size + else: + shapes["num_channels"] = -1 + shapes["height"] = -1 + shapes["width"] = -1 + + return shapes + + +def extract_shapes_from_model_artifacts( + config: PretrainedConfig, + processor: Optional[PreTrainedProcessor] = None, +) -> Dict[str, Any]: + shapes = {} + artifacts_dict = {} + + config_dict = {k: v for k, v in config.to_dict().items() if v is not None} + artifacts_dict.update(config_dict) + + if processor is not None and hasattr(processor, "to_dict"): + processor_dict = {k: v for k, v in processor.to_dict().items() if v is not None} + artifacts_dict.update(processor_dict) + + # text input + shapes["vocab_size"] = artifacts_dict.get("vocab_size", 2) + shapes["type_vocab_size"] = artifacts_dict.get("type_vocab_size", 2) + + # image input + shapes["num_channels"] = artifacts_dict.get("num_channels", None) + + image_size = artifacts_dict.get("image_size", None) + if image_size is None: + # processors have different names for the image size + image_size = artifacts_dict.get("size", None) + + if isinstance(image_size, (int, float)): + shapes["height"] = image_size + shapes["width"] = image_size + elif isinstance(image_size, (list, tuple)): + shapes["height"] = image_size[0] + shapes["width"] = image_size[0] + elif isinstance(image_size, dict) and len(image_size) == 2: + shapes["height"] = list(image_size.values())[0] + shapes["width"] = list(image_size.values())[1] + elif isinstance(image_size, dict) and len(image_size) == 1: + shapes["height"] = list(image_size.values())[0] + shapes["width"] = list(image_size.values())[0] + else: + shapes["height"] = None + shapes["width"] = None + + # classification labels (default to 2) + shapes["num_labels"] = len( + artifacts_dict.get("id2label", {"0": "LABEL_0", "1": "LABEL_1"}) + ) + + # object detection labels (default to 2) + shapes["num_queries"] = artifacts_dict.get("num_queries", 2) + + return shapes diff --git a/optimum_benchmark/backends/utils/neural_compressor_utils.py b/optimum_benchmark/backends/utils/neural_compressor_utils.py new file mode 100644 index 000000000..96632df48 --- /dev/null +++ b/optimum_benchmark/backends/utils/neural_compressor_utils.py @@ -0,0 +1,39 @@ +DEFAULT_QUANTIZATION_CONFIG = { + "device": "cpu", + "backend": "default", + "domain": "auto", + "recipes": {}, + "quant_format": "default", + "inputs": [], + "outputs": [], + "approach": "static", + "calibration_sampling_size": [100], + "op_type_dict": None, + "op_name_dict": None, + "reduce_range": None, + "example_inputs": None, + "excluded_precisions": [], + "quant_level": "auto", + "accuracy_criterion": { + "higher_is_better": True, + "criterion": "relative", + "tolerable_loss": 0.01, + }, + "tuning_criterion": { + "strategy": "basic", + "strategy_kwargs": None, + "timeout": 0, + "max_trials": 100, + "objective": "performance", + }, + "diagnosis": False, +} + +DEFAULT_CALIBRATION_CONFIG = { + "dataset_name": "glue", + "num_samples": 300, + "dataset_config_name": "sst2", + "dataset_split": "train", + "preprocess_batch": True, + "preprocess_class": "optimum_benchmark.preprocessors.glue.GluePreprocessor", +} diff --git a/optimum_benchmark/backends/utils/onnxruntime_utils.py b/optimum_benchmark/backends/utils/onnxruntime_utils.py new file mode 100644 index 000000000..65568458a --- /dev/null +++ 
b/optimum_benchmark/backends/utils/onnxruntime_utils.py @@ -0,0 +1,94 @@ +from typing import Any, Dict + + +DEFAULT_OPTIMIZATION_CONFIG = { + "optimization_level": 1, # 0, 1, 2, 99 + "optimize_for_gpu": "${is_gpu:${device}}", + "fp16": False, + "enable_transformers_specific_optimizations": True, + "enable_gelu_approximation": False, + "disable_gelu_fusion": False, + "disable_layer_norm_fusion": False, + "disable_attention_fusion": False, + "disable_skip_layer_norm_fusion": True, + "disable_bias_skip_layer_norm_fusion": False, + "disable_bias_gelu_fusion": False, + "use_mask_index": False, + "no_attention_mask": False, + "disable_embed_layer_norm_fusion": True, + "disable_shape_inference": False, + "use_multi_head_attention": False, + "enable_gemm_fast_gelu_fusion": False, + "use_raw_attention_mask": False, + "disable_group_norm_fusion": True, + "disable_packed_kv": True, +} + +DEFAULT_QUANTIZATION_CONFIG = { + "is_static": False, + "format": "QOperator", # QOperator, QDQ + "mode": "IntegerOps", # QLinearOps, IntegerOps + "activations_dtype": "QUInt8", # QInt8, QUInt8 + "activations_symmetric": False, + "weights_dtype": "QInt8", # QInt8, QUInt8 + "weights_symmetric": True, + "per_channel": False, + "reduce_range": False, + "operators_to_quantize": [ + "MatMul", + "Add", + ], +} + +DEFAULT_CALIBRATION_CONFIG = { + "dataset_name": "glue", + "num_samples": 300, + "dataset_config_name": "sst2", + "dataset_split": "train", + "preprocess_batch": True, + "preprocess_class": "optimum_benchmark.preprocessors.glue.GluePreprocessor", +} + + +def infer_device_id(device: str) -> int: + """ + Infer the device id from the given device string. + """ + + import torch + + if device == "cuda": + return torch.cuda.current_device() + elif torch.device(device).type == "cuda": + return torch.device(device).index + elif torch.device(device).type == "cpu": + return -1 + else: + raise ValueError(f"Unknown device '{device}'") + + +def format_ort_quantization_dict(quantization_dict: Dict[str, Any]) -> None: + """ + Format the quantization dictionary for onnxruntime. 
+ """ + + from onnxruntime.quantization import QuantFormat, QuantizationMode, QuantType + + if quantization_dict.get("format", None) is not None: + quantization_dict["format"] = QuantFormat.from_string( + quantization_dict["format"] + ) + if quantization_dict.get("mode", None) is not None: + quantization_dict["mode"] = QuantizationMode.from_string( + quantization_dict["mode"] + ) + if quantization_dict.get("activations_dtype", None) is not None: + quantization_dict["activations_dtype"] = QuantType.from_string( + quantization_dict["activations_dtype"] + ) + if quantization_dict.get("weights_dtype", None) is not None: + quantization_dict["weights_dtype"] = QuantType.from_string( + quantization_dict["weights_dtype"] + ) + + return quantization_dict diff --git a/optimum_benchmark/backends/utils/openvino_utils.py b/optimum_benchmark/backends/utils/openvino_utils.py new file mode 100644 index 000000000..0f1037b77 --- /dev/null +++ b/optimum_benchmark/backends/utils/openvino_utils.py @@ -0,0 +1,14 @@ +DEFAULT_QUANTIZATION_CONFIG = { + "compression": None, + "input_info": None, + "save_onnx_model": False, +} + +DEFAULT_CALIBRATION_CONFIG = { + "dataset_name": "glue", + "num_samples": 300, + "dataset_config_name": "sst2", + "dataset_split": "train", + "preprocess_batch": True, + "preprocess_class": "optimum_benchmark.preprocessors.glue.GluePreprocessor", +} diff --git a/optimum_benchmark/backends/utils.py b/optimum_benchmark/backends/utils/optimum_utils.py similarity index 57% rename from optimum_benchmark/backends/utils.py rename to optimum_benchmark/backends/utils/optimum_utils.py index 991d35f29..a558f1659 100644 --- a/optimum_benchmark/backends/utils.py +++ b/optimum_benchmark/backends/utils/optimum_utils.py @@ -1,277 +1,35 @@ -from typing import Any, Callable, Dict, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Union from pathlib import Path import os import torch -from optimum.exporters import TasksManager -from optimum.onnxruntime import ORTOptimizer -from optimum.utils import DEFAULT_DUMMY_SHAPES -from transformers.utils import is_torch_available -from optimum.exporters.onnx.base import OnnxConfig -from optimum.utils.save_utils import maybe_save_preprocessors -from optimum.exporters.onnx.constants import UNPICKABLE_ARCHS -from optimum.utils import DEFAULT_DUMMY_SHAPES, ONNX_WEIGHTS_NAME -from optimum.onnxruntime.configuration import AutoOptimizationConfig -from transformers import AutoTokenizer, PreTrainedModel, PretrainedConfig -from requests.exceptions import ConnectionError as RequestsConnectionError -from optimum.exporters.error_utils import AtolError, OutputMatchError, ShapeError -from optimum.exporters.onnx.convert import export_models, validate_models_outputs -from optimum.exporters.onnx.__main__ import logger, _get_submodels_and_onnx_configs -from optimum.exporters.onnx import ( - get_encoder_decoder_models_for_export, - get_decoder_models_for_export, +from optimum.exporters.onnx.__main__ import ( + logger, + TasksManager, OnnxConfigWithPast, + _get_submodels_and_onnx_configs, + maybe_save_preprocessors, + validate_models_outputs, + is_torch_available, export_models, + AutoTokenizer, + DEFAULT_DUMMY_SHAPES, + ONNX_WEIGHTS_NAME, + UNPICKABLE_ARCHS, + RequestsConnectionError, + OutputMatchError, + ShapeError, + AtolError, ) -def randomize_weights(model): - for param in model.parameters(): - if torch.cuda.is_available() and param.device.type == "cpu": - # we take advantage of the fact that a cuda device - # is available to use cuda kernels for 
randomization - # this is slower than asynchronous randomization while - # model is fully on gpu (because of data transfer) but - # faster than randomization while model is on cpu - param.data.cuda().normal_(mean=0.0, std=0.2).cpu() - else: - param.data.normal_(mean=0.0, std=0.2) - - -def format_ort_quantization_dict(quantization_dict: Dict[str, Any]) -> None: - from onnxruntime.quantization import ( - QuantFormat, - QuantizationMode, - QuantType, - ) - - if quantization_dict.get("format", None) is not None: - quantization_dict["format"] = QuantFormat.from_string( - quantization_dict["format"] - ) - if quantization_dict.get("mode", None) is not None: - quantization_dict["mode"] = QuantizationMode.from_string( - quantization_dict["mode"] - ) - if quantization_dict.get("activations_dtype", None) is not None: - quantization_dict["activations_dtype"] = QuantType.from_string( - quantization_dict["activations_dtype"] - ) - if quantization_dict.get("weights_dtype", None) is not None: - quantization_dict["weights_dtype"] = QuantType.from_string( - quantization_dict["weights_dtype"] - ) - - return quantization_dict - - -def quantize_dummy_model( - model, - bnb_quantization_config, -): - from accelerate.utils.bnb import ( - get_keys_to_not_convert, - replace_with_bnb_layers, - logger, - ) - - # We keep some modules such as the lm_head in their original dtype for numerical stability reasons - if bnb_quantization_config.skip_modules is None: - bnb_quantization_config.skip_modules = get_keys_to_not_convert(model) - - # add cpu modules to skip modules only for 4-bit modules - modules_to_not_convert = bnb_quantization_config.skip_modules - - # We add the modules we want to keep in full precision - if bnb_quantization_config.keep_in_fp32_modules is None: - bnb_quantization_config.keep_in_fp32_modules = [] - keep_in_fp32_modules = bnb_quantization_config.keep_in_fp32_modules - modules_to_not_convert.extend(keep_in_fp32_modules) - - # compatibility with peft - model.is_loaded_in_4bit = bnb_quantization_config.load_in_4bit - model.is_loaded_in_8bit = bnb_quantization_config.load_in_8bit - - # quantization of an already loaded model - logger.warning( - "It is not recommended to quantize a loaded model. " - "The model should be instantiated under the `init_empty_weights` context manager." 
- ) - model = replace_with_bnb_layers( - model, bnb_quantization_config, modules_to_not_convert=modules_to_not_convert - ) - # convert param to the right dtype - dtype = bnb_quantization_config.torch_dtype - for name, param in model.state_dict().items(): - if any( - module_to_keep_in_fp32 in name - for module_to_keep_in_fp32 in keep_in_fp32_modules - ): - param.to(torch.float32) - if param.dtype != torch.float32: - name = name.replace(".weight", "").replace(".bias", "") - param = getattr(model, name, None) - if param is not None: - param.to(torch.float32) - elif torch.is_floating_point(param): - param.to(dtype) - - return model - - -def export_dummy_model( - automodel_class, - pretrained_config: PretrainedConfig, - output_dir: str, - device: torch.device, - torch_dtype: Optional[torch.dtype] = None, - auto_optimization: Optional[str] = None, - use_merged: Optional[bool] = None, - **cache_kwargs, -): - ######################################## - from accelerate import init_empty_weights - - with init_empty_weights(): - model = automodel_class.from_config( - config=pretrained_config, - torch_dtype=torch_dtype, - trust_remote_code=cache_kwargs.get("trust_remote_code", False), - ) - model.to_empty(device=device) - randomize_weights(model) - ######################################## - - input_shapes = {} - original_task = "auto" - output_path = Path(output_dir) - - for input_name in DEFAULT_DUMMY_SHAPES.keys(): - input_shapes[input_name] = DEFAULT_DUMMY_SHAPES[input_name] - - try: - task = TasksManager.infer_task_from_model(model) - except KeyError as e: - raise KeyError( - f"The task could not be automatically inferred. Please provide the argument --task with the task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}" - ) - - if task + "-with-past" in TasksManager.get_supported_tasks_for_model_type( - model.config.model_type.replace("_", "-"), "onnx" - ): - if ( - original_task == "auto" - ): # Make -with-past the default if --task was not explicitely specified - task = task + "-with-past" - - onnx_config_constructor = TasksManager.get_exporter_config_constructor( - model=model, exporter="onnx", task=task - ) - onnx_config = onnx_config_constructor(model.config) - - needs_pad_token_id = ( - isinstance(onnx_config, OnnxConfigWithPast) - and getattr(model.config, "pad_token_id", None) is None - and task in ["text-classification"] - ) - if needs_pad_token_id: - try: - tok = AutoTokenizer.from_pretrained(model.name_or_path) - model.config.pad_token_id = tok.pad_token_id - except Exception: - raise ValueError( - "Could not infer the pad token id, which is needed in this case, please provide it with the --pad_token_id argument" - ) - - opset = onnx_config.DEFAULT_ONNX_OPSET - atol = onnx_config.ATOL_FOR_VALIDATION - if isinstance(atol, dict): - atol = atol[task.replace("-with-past", "")] - - # Saving the model config and preprocessor as this is needed sometimes. - model.config.save_pretrained(output_path) - generation_config = getattr(model, "generation_config", None) - if generation_config is not None: - generation_config.save_pretrained(output_path) - - maybe_save_preprocessors(output_path, output_path) - - if model.config.is_encoder_decoder and task.startswith("text-generation"): - raise ValueError( - f"model.config.is_encoder_decoder is True and task is `{task}`, which are incompatible. 
If the task was auto-inferred, please fill a bug report" - f"at https://github.com/huggingface/optimum, if --task was explicitely passed, make sure you selected the right task for the model," - f" referring to `optimum.exporters.tasks.TaskManager`'s `_TASKS_TO_AUTOMODELS`." - ) - - onnx_files_subpaths = None - if model.config.is_encoder_decoder and task.startswith( - ( - "text2text-generation", - "automatic-speech-recognition", - "image-to-text", - "feature-extraction-with-past", - ) - ): - models_and_onnx_configs = get_encoder_decoder_models_for_export( - model, onnx_config - ) - - elif task.startswith("text-generation"): - models_and_onnx_configs = get_decoder_models_for_export(model, onnx_config) - else: - models_and_onnx_configs = {"model": (model, onnx_config)} - - print("Attempting to export the model to ONNX...") - _, __ = export_models( - models_and_onnx_configs=models_and_onnx_configs, # type: ignore - opset=opset, # type: ignore - output_dir=output_path, - output_names=onnx_files_subpaths, - input_shapes=input_shapes, - device=str(device), - dtype="fp16" if torch_dtype == torch.float16 else None, - ) - print("Model successfully exported to ONNX.") - - if auto_optimization: - print("Attempting to optimize the exported ONNX models...") - if onnx_files_subpaths is None: - onnx_files_subpaths = [ - key + ".onnx" for key in models_and_onnx_configs.keys() - ] - optimizer = ORTOptimizer.from_pretrained( - output_path, file_names=onnx_files_subpaths - ) - - optimization_config = AutoOptimizationConfig.with_optimization_level( - optimization_level=auto_optimization - ) - - optimizer.optimize( - save_dir=output_path, - optimization_config=optimization_config, - file_suffix="", - ) - print("ONNX models successfully optimized.") - - # post process is disabled in optimum ort api so you need to export models with cli - # and then load them with ort api to reproduce the same results - if use_merged: - try: - print("Attempting to merge the exported ONNX models...") - ( - models_and_onnx_configs, - onnx_files_subpaths, - ) = onnx_config.post_process_exported_models( - output_path, models_and_onnx_configs, onnx_files_subpaths - ) - print("ONNX models successfully merged.") - except Exception as e: - raise Exception( - f"The post-processing of the ONNX export failed. The export can still be performed by passing the option --no-post-process. 
Detailed error: {e}" - ) +if TYPE_CHECKING: + from transformers import PreTrainedModel + from optimum.exporters.onnx import OnnxConfig +# rewrite of the main_export function from optimum.exporters.onnx.__main__ +# to use the model passed in as an argument instead of loading it from the model_name_or_path def main_export( model_name_or_path: str, output: Union[str, Path], @@ -295,11 +53,11 @@ def main_export( for_ort: bool = False, do_validation: bool = True, model_kwargs: Optional[Dict[str, Any]] = None, - custom_onnx_configs: Optional[Dict[str, OnnxConfig]] = None, + custom_onnx_configs: Optional[Dict[str, "OnnxConfig"]] = None, fn_get_submodels: Optional[Callable] = None, use_subprocess: bool = False, ######################################## - model: Optional[PreTrainedModel] = None, + model: Optional["PreTrainedModel"] = None, ######################################## **kwargs_shapes, ): diff --git a/optimum_benchmark/backends/utils/pytorch_utils.py b/optimum_benchmark/backends/utils/pytorch_utils.py new file mode 100644 index 000000000..04a2dbecb --- /dev/null +++ b/optimum_benchmark/backends/utils/pytorch_utils.py @@ -0,0 +1,78 @@ +from logging import getLogger +from typing import Optional +import logging.config +import os + +import torch +from omegaconf import OmegaConf +from torch.distributed.elastic.multiprocessing import Std + +OmegaConf.register_new_resolver("device_count", lambda: torch.cuda.device_count()) + + +DEFAULT_COMPILE_CONFIG = { + "fullgraph": False, + "dynamic": False, + "backend": "inductor", + "mode": None, + "options": None, + "disable": False, +} + +# from https://github.com/pytorch/pytorch/blob/v2.0.0/torch/distributed/launcher/api.py#L29 +# adjusted to the defaults of torch.distributed.run +# defined in https://github.com/pytorch/pytorch/blob/v2.0.0/torch/distributed/run.py#L770 +# TODO: decide wrther to use torch.distributed.run arguments or the ones from +# torch.distributed.launcher.api +DEFAULT_DDP_CONFIG = { + "min_nodes": 1, + "max_nodes": 1, + "run_id": "none", + "nproc_per_node": "${device_count:}", + "role": "default", + "rdzv_endpoint": "127.0.0.1:29500", + "rdzv_backend": "static", + "rdzv_configs": { + "timeout": 900, + "rank": 0, + }, + "max_restarts": 0, + "monitor_interval": 5, + "start_method": "spawn", + "log_dir": None, + "metrics_cfg": {}, + "local_addr": None, + "redirects": Std.NONE, + "tee": Std.NONE, +} + + +def randomize_weights(model): + for param in model.parameters(): + if torch.cuda.is_available() and param.device.type == "cpu": + # we take advantage of the fact that a cuda device + # is available to use cuda kernels for randomization + # this is slower than asynchronous randomization while + # model is fully on gpu (because of data transfer) but + # faster than randomization while model is on cpu + param.data.cuda().normal_(mean=0.0, std=0.2).cpu() + else: + param.data.normal_(mean=0.0, std=0.2) + + +def get_worker_logger( + name: Optional[str] = None, + log_all: bool = False, +) -> logging.Logger: + """ + PyTorch DDP subprocesses do not inherit from Hydra logger. + Thus, we need to reconfigure the logger for the workers. 
+ """ + if os.environ["RANK"] == "0" or log_all: + # TODO: also configure logging for other ranks + hydra_conf = OmegaConf.load(".hydra/hydra.yaml") + logging.config.dictConfig( + OmegaConf.to_container(hydra_conf.hydra.job_logging, resolve=True) + ) + + return getLogger(name) diff --git a/optimum_benchmark/benchmarks/base.py b/optimum_benchmark/benchmarks/base.py index 9b41ce3b7..da2721e5d 100644 --- a/optimum_benchmark/benchmarks/base.py +++ b/optimum_benchmark/benchmarks/base.py @@ -1,9 +1,8 @@ -from dataclasses import dataclass, MISSING +from dataclasses import dataclass from logging import getLogger from abc import ABC from optimum_benchmark.backends.base import Backend -from optimum_benchmark.utils import set_seed LOGGER = getLogger("benchmark") @@ -11,25 +10,23 @@ @dataclass class BenchmarkConfig(ABC): - name: str = MISSING # type: ignore - _target_: str = MISSING # type: ignore - - # seed for reproducibility - seed: int = 42 + name: str + _target_: str class Benchmark(ABC): + name: str + config: BenchmarkConfig + def __init__(self) -> None: pass def configure(self, config: BenchmarkConfig) -> None: - LOGGER.info(f"Configuring {config.name} benchmark") + LOGGER.info(f"Configuring {self.name} benchmark") self.config = config - LOGGER.info(f"\t+ Setting seed({self.config.seed})") - set_seed(self.config.seed) def run(self, backend: Backend) -> None: raise NotImplementedError("Benchmark must implement run method") - def save(self, path: str = "") -> None: + def save(self) -> None: raise NotImplementedError("Benchmark must implement save method") diff --git a/optimum_benchmark/benchmarks/inference.py b/optimum_benchmark/benchmarks/inference.py index 7f8098470..afded80c4 100644 --- a/optimum_benchmark/benchmarks/inference.py +++ b/optimum_benchmark/benchmarks/inference.py @@ -1,19 +1,38 @@ from dataclasses import dataclass, field -from typing import List, Dict +from typing import List, Dict, Optional from logging import getLogger +from omegaconf import OmegaConf + from pandas import DataFrame import statistics -from optimum_benchmark.backends.base import Backend -from optimum_benchmark.generators.input_generator import InputGenerator -from optimum_benchmark.benchmarks.base import Benchmark, BenchmarkConfig -from optimum_benchmark.trackers.memory import memory_tracker_class_for_backend -from optimum_benchmark.trackers.latency import latency_tracker_class_for_backend + +from ..backends.base import Backend +from .base import Benchmark, BenchmarkConfig +from ..generators.input_generator import InputGenerator +from ..utils import TEXT_GENERATION_TASKS, DIFFUSION_TASKS +from ..trackers.memory import memory_tracker_class_for_backend +from ..trackers.latency import latency_tracker_class_for_backend +from .inference_utils import ( + three_sig_figs, + DEFAULT_INPUT_SHAPES, + DEFAULT_GENERATE_KWARGS, + DEFAULT_DIFUSION_KWARGS, +) LOGGER = getLogger("inference") +OmegaConf.register_new_resolver( + "can_generate", + lambda task: task in TEXT_GENERATION_TASKS, +) +OmegaConf.register_new_resolver( + "can_diffuse", + lambda task: task in DIFFUSION_TASKS, +) + @dataclass class InferenceConfig(BenchmarkConfig): @@ -23,34 +42,69 @@ class InferenceConfig(BenchmarkConfig): # benchmark options memory: bool = False warmup_runs: int = 10 - - benchmark_duration: int = 10 # TODO: deprecate this and use `benchmark.duration` + duration: int = 10 + # TODO: deprecate this and use `benchmark.duration` + benchmark_duration: Optional[int] = None # input options input_shapes: Dict = field( - default_factory=lambda: { - # 
used with all tasks - "batch_size": 2, - # used with text input tasks - "sequence_length": 16, - # used with multiple choice tasks where input - # is of shape (batch_size, num_choices, sequence_length) - "num_choices": 1, - # used with audio input tasks - "feature_size": 80, - "nb_max_frames": 3000, - "audio_sequence_length": 16000, - } + default_factory=lambda: DEFAULT_INPUT_SHAPES, ) + # TODO: deprecate this and use `benchamrk.generate_kwargs` + new_tokens: Optional[int] = None + + # forward options + can_diffuse: bool = "${can_diffuse:${task}}" + forward_kwargs: Optional[Dict] = None + # generation options - new_tokens: int = 100 # TODO: deprecate this and use `benchamrk.generation_options` + can_generate: bool = "${can_generate:${task}}" + generate_kwargs: Optional[Dict] = None - # diffusion options - # TODO: add `benchmark.diffusion_options` for multiple images per prompt + def __post_init__(self): + if self.can_generate: + self.generate_kwargs = OmegaConf.merge( + self.generate_kwargs or {}, + DEFAULT_GENERATE_KWARGS, + ) + + if self.can_diffuse: + self.forward_kwargs = OmegaConf.merge( + self.forward_kwargs or {}, + DEFAULT_DIFUSION_KWARGS, + ) + + if self.new_tokens is not None: + LOGGER.warning( + "The `new_tokens` option is deprecated, please use `generate_kwargs` " + "instead. `max_new_tokens` and `min_new_tokens` will be set to the " + "value of `new_tokens`." + ) + self.generate_kwargs["max_new_tokens"] = self.new_tokens + self.generate_kwargs["min_new_tokens"] = self.new_tokens + + if self.generate_kwargs is not None: + assert ( + self.generate_kwargs["max_new_tokens"] + == self.generate_kwargs["min_new_tokens"] + ), ( + "`max_new_tokens` and `min_new_tokens` " + "must be equal for fixed length output" + ) + + if self.benchmark_duration is not None: + LOGGER.warning( + "The `benchmark_duration` option is deprecated, please use `duration` " + "instead. `duration` will be set to the value of `benchmark_duration`." 
+ ) + self.duration = self.benchmark_duration class InferenceBenchmark(Benchmark): + name: str = "inference" + config: InferenceConfig + def __init__(self): # initialize inference results self.forward_peak_memory: int = 0 @@ -60,34 +114,30 @@ def __init__(self): def configure(self, config: InferenceConfig): super().configure(config) - self.memory = config.memory - - self.warmup_runs = config.warmup_runs - self.benchmark_duration = config.benchmark_duration + if self.config.forward_kwargs is None: + self.config.forward_kwargs = {} - self.input_shapes = config.input_shapes - self.new_tokens = config.new_tokens + if self.config.generate_kwargs is None: + self.config.generate_kwargs = {} def run(self, backend: Backend) -> None: LOGGER.info("Running inference benchmark") - - self.can_generate = backend.is_text_generation_model() - self.input_shapes.update(backend.model_shapes) + self.config.input_shapes.update(backend.model_shapes) self.input_generator = InputGenerator( task=backend.task, - input_shapes=self.input_shapes, + input_shapes=self.config.input_shapes, pretrained_config=backend.pretrained_config, ) - if self.memory: + if self.config.memory: # if requested, run memory tracking self.run_memory_tracking(backend) # run forward pass tracking self.run_forward_tracking(backend) - if self.can_generate: + if self.config.can_generate: # if possible, run generation pass tracking self.run_generate_tracking(backend) @@ -96,18 +146,17 @@ def run_memory_tracking(self, backend: Backend) -> None: mode="forward", ) - # TODO: handle this in backend using prepare_for_inference for key, value in memory_input.items(): if key == "prompt": continue memory_input[key] = value.to(backend.device) # for backends that require compilation with static shapes - backend.prepare_for_inference(input_shapes=self.input_shapes) + backend.prepare_for_inference(input_shapes=self.config.input_shapes) LOGGER.info("\t+ Tracking forward pass peak memory") memory_tracker = memory_tracker_class_for_backend[backend.config.name](backend) - with memory_tracker.track(interval=self.benchmark_duration // 100): + with memory_tracker.track(interval=self.config.duration // 100): _ = backend.forward(memory_input) self.forward_peak_memory = memory_tracker.get_peak_memory() @@ -118,26 +167,25 @@ def run_forward_tracking(self, backend: Backend) -> None: mode="forward", ) - # TODO: handle this in backend using prepare_for_inference for key, value in forward_input.items(): if key == "prompt": continue forward_input[key] = value.to(backend.device) # for backends that require compilation with static shapes - backend.prepare_for_inference(input_shapes=self.input_shapes) + backend.prepare_for_inference(input_shapes=self.config.input_shapes) LOGGER.info("\t+ Warming up the forward pass") - for _ in range(self.warmup_runs): - _ = backend.forward(forward_input) + for _ in range(self.config.warmup_runs): + _ = backend.forward(forward_input, **self.config.forward_kwargs) LOGGER.info("\t+ Tracking forward pass latency and throughput") latency_tracker = latency_tracker_class_for_backend[backend.config.name]( backend ) - while sum(self.forward_latencies) < self.benchmark_duration: + while sum(self.forward_latencies) < self.config.duration: with latency_tracker.track(): - _ = backend.forward(forward_input) + _ = backend.forward(forward_input, **self.config.forward_kwargs) self.forward_latencies = latency_tracker.get_latencies() LOGGER.info(f"\t+ Forward pass latency: {self.forward_latency:.2e} (s)") @@ -150,7 +198,6 @@ def run_generate_tracking(self, 
backend: Backend) -> None: mode="forward", ) - # TODO: handle this in backend using prepare_for_inference for key, value in generate_input.items(): if key == "prompt": continue @@ -159,28 +206,18 @@ def run_generate_tracking(self, backend: Backend) -> None: LOGGER.info("\t+ Warming up the generation pass") _ = backend.generate( input=generate_input, - max_new_tokens=self.new_tokens, - min_new_tokens=self.new_tokens, - do_sample=False, - use_cache=True, - pad_token_id=0, - num_beams=1, + **self.config.generate_kwargs, ) LOGGER.info("\t+ Tracking generation latency and throughput") latency_tracker = latency_tracker_class_for_backend[backend.config.name]( backend ) - while sum(self.generate_latencies) < self.benchmark_duration: + while sum(self.generate_latencies) < self.config.duration: with latency_tracker.track(): _ = backend.generate( generate_input, - max_new_tokens=self.new_tokens, - min_new_tokens=self.new_tokens, - do_sample=False, - use_cache=True, - pad_token_id=0, - num_beams=1, + **self.config.generate_kwargs, ) self.generate_latencies = latency_tracker.get_latencies() @@ -192,33 +229,45 @@ def run_generate_tracking(self, backend: Backend) -> None: # Metrics @property + @three_sig_figs def forward_latency(self) -> float: - return significant_figures(statistics.mean(self.forward_latencies)) + return statistics.mean(self.forward_latencies) @property + @three_sig_figs def forward_throughput(self) -> float: - return significant_figures(self.input_shapes.batch_size / self.forward_latency) + return ( + self.config.input_shapes["batch_size"] + * self.config.forward_kwargs["num_images_per_prompt"] + / self.forward_latency + if self.config.can_diffuse + else self.config.input_shapes["batch_size"] / self.forward_latency + ) @property + @three_sig_figs def generate_latency(self) -> float: - return significant_figures(statistics.mean(self.generate_latencies)) + return statistics.mean(self.generate_latencies) @property + @three_sig_figs def generate_throughput(self) -> float: - return significant_figures( - self.new_tokens * self.input_shapes.batch_size / self.generate_latency + return ( + self.config.generate_kwargs["min_new_tokens"] + * self.config.input_shapes["batch_size"] + / self.generate_latency ) def get_results_df(self) -> DataFrame: results_dict = dict() - if self.memory: + if self.config.memory: results_dict["forward.peak_memory(MB)"] = self.forward_peak_memory results_dict["forward.latency(s)"] = self.forward_latency results_dict["forward.throughput(samples/s)"] = self.forward_throughput - if self.can_generate: + if self.config.can_generate: results_dict["generate.latency(s)"] = self.generate_latency results_dict["generate.throughput(tokens/s)"] = self.generate_throughput @@ -228,7 +277,3 @@ def save(self) -> None: LOGGER.info("Saving inference results") results_df = self.get_results_df() results_df.to_csv("inference_results.csv") - - -def significant_figures(x): - return float(f"{x:.3g}") diff --git a/optimum_benchmark/benchmarks/inference_utils.py b/optimum_benchmark/benchmarks/inference_utils.py new file mode 100644 index 000000000..b2280cdc3 --- /dev/null +++ b/optimum_benchmark/benchmarks/inference_utils.py @@ -0,0 +1,37 @@ +DEFAULT_GENERATE_KWARGS = { + "max_new_tokens": 100, + "min_new_tokens": 100, + "do_sample": False, + "use_cache": True, + "pad_token_id": 0, + "num_beams": 1, +} + +DEFAULT_DIFUSION_KWARGS = { + "num_images_per_prompt": 1, +} + +DEFAULT_INPUT_SHAPES = { + # used with all tasks + "batch_size": 2, + # used with text input tasks + "sequence_length": 16, + # 
used with multiple choice tasks where input + # is of shape (batch_size, num_choices, sequence_length) + "num_choices": 1, + # used with audio input tasks + "feature_size": 80, + "nb_max_frames": 3000, + "audio_sequence_length": 16000, +} + + +def format_float(x: float) -> float: + return float(f"{x:.3g}") + + +def three_sig_figs(func): + def wrapper(*args, **kwargs): + return format_float(func(*args, **kwargs)) + + return wrapper diff --git a/optimum_benchmark/benchmarks/training.py b/optimum_benchmark/benchmarks/training.py index ce0c472fb..6ba1ab20b 100644 --- a/optimum_benchmark/benchmarks/training.py +++ b/optimum_benchmark/benchmarks/training.py @@ -1,24 +1,20 @@ -from typing import Any, Optional, Dict, TYPE_CHECKING +from typing import Any, Dict from dataclasses import dataclass, field from logging import getLogger -from transformers import default_data_collator from omegaconf import OmegaConf from pandas import DataFrame -import torch -from optimum_benchmark.benchmarks.base import Benchmark, BenchmarkConfig -from optimum_benchmark.generators.dataset_generator import DatasetGenerator - -if TYPE_CHECKING: - from optimum_benchmark.backends.base import Backend +from ..backends.base import Backend +from .base import Benchmark, BenchmarkConfig +from ..generators.dataset_generator import DatasetGenerator +from .training_utils import MeasurementCallback, get_data_collator LOGGER = getLogger("training") # resolvers OmegaConf.register_new_resolver("is_cpu", lambda device: device == "cpu") -OmegaConf.register_new_resolver("device_count", lambda: torch.cuda.device_count()) @dataclass @@ -26,6 +22,9 @@ class TrainingConfig(BenchmarkConfig): name: str = "training" _target_: str = "optimum_benchmark.benchmarks.training.TrainingBenchmark" + # training options + warmup_steps: int = 2 + # dataset options dataset_shapes: Dict = field( default_factory=lambda: { @@ -46,146 +45,23 @@ class TrainingConfig(BenchmarkConfig): # training options training_arguments: Dict = field( default_factory=lambda: { - "output_dir": "./trainer_output", + # these are arguments that we set by default + # but can be overwritten by the user "skip_memory_metrics": False, + "output_dir": "./trainer_output", "use_cpu": "${is_cpu:${device}}", + "ddp_find_unused_parameters": False, "do_train": True, "do_eval": False, "do_predict": False, - # add any other training arguments in your config - ###### TrainingArguments ######## - # prediction_loss_only: bool = False, - # per_device_train_batch_size: int = 8, - # per_gpu_train_batch_size: int | None = None, - # gradient_accumulation_steps: int = 1, - # learning_rate: float = 0.00005, - # weight_decay: float = 0, - # adam_beta1: float = 0.9, - # adam_beta2: float = 0.999, - # adam_epsilon: float = 1e-8, - # max_grad_norm: float = 1, - # num_train_epochs: float = 3, - # max_steps: int = -1, - # lr_scheduler_type: SchedulerType | str = "linear", - # warmup_ratio: float = 0, - # warmup_steps: int = 0, - # log_level: str | None = "passive", - # log_level_replica: str | None = "warning", - # log_on_each_node: bool = True, - # logging_dir: str | None = None, - # logging_strategy: IntervalStrategy | str = "steps", - # logging_first_step: bool = False, - # logging_steps: float = 500, - # logging_nan_inf_filter: bool = True, - # save_strategy: IntervalStrategy | str = "steps", - # save_steps: float = 500, - # save_total_limit: int | None = None, - # save_safetensors: bool | None = False, - # save_on_each_node: bool = False, - # use_mps_device: bool = False, - # seed: int = 42, - # data_seed: 
int | None = None, - # jit_mode_eval: bool = False, - # use_ipex: bool = False, - # bf16: bool = False, - # fp16: bool = False, - # fp16_opt_level: str = "O1", - # half_precision_backend: str = "auto", - # bf16_full_eval: bool = False, - # fp16_full_eval: bool = False, - # tf32: bool | None = None, - # local_rank: int = -1, - # ddp_backend: str | None = None, - # tpu_num_cores: int | None = None, - # tpu_metrics_debug: bool = False, - # debug: str | List[DebugOption] = "", - # dataloader_drop_last: bool = False, - # eval_steps: float | None = None, - # dataloader_num_workers: int = 0, - # past_index: int = -1, - # run_name: str | None = None, - # disable_tqdm: bool | None = None, - # remove_unused_columns: bool | None = True, - # label_names: List[str] | None = None, - # load_best_model_at_end: bool | None = False, - # metric_for_best_model: str | None = None, - # greater_is_better: bool | None = None, - # ignore_data_skip: bool = False, - # sharded_ddp: List[ShardedDDPOption] | str | None = "", - # fsdp: List[FSDPOption] | str | None = "", - # fsdp_min_num_params: int = 0, - # fsdp_config: str | None = None, - # fsdp_transformer_layer_cls_to_wrap: str | None = None, - # deepspeed: str | None = None, - # label_smoothing_factor: float = 0, - # optim: OptimizerNames | str = default_optim, - # optim_args: str | None = None, - # adafactor: bool = False, - # group_by_length: bool = False, - # length_column_name: str | None = "length", - # report_to: List[str] | None = None, - # ddp_find_unused_parameters: bool | None = None, - # ddp_bucket_cap_mb: int | None = None, - # ddp_broadcast_buffers: bool | None = None, - # dataloader_pin_memory: bool = True, - # use_legacy_prediction_loop: bool = False, - # push_to_hub: bool = False, - # resume_from_checkpoint: str | None = None, - # hub_model_id: str | None = None, - # hub_strategy: HubStrategy | str = "every_save", - # hub_token: str | None = None, - # hub_private_repo: bool = False, - # gradient_checkpointing: bool = False, - # include_inputs_for_metrics: bool = False, - # fp16_backend: str = "auto", - # push_to_hub_model_id: str | None = None, - # push_to_hub_organization: str | None = None, - # push_to_hub_token: str | None = None, - # mp_parameters: str = "", - # auto_find_batch_size: bool = False, - # full_determinism: bool = False, - # torchdynamo: str | None = None, - # ray_scope: str | None = "last", - # ddp_timeout: int | None = 1800, - # torch_compile: bool = False, - # torch_compile_backend: str | None = None, - # torch_compile_mode: str | None = None, - # dispatch_batches: bool | None = None } ) - # PyTorch-specific configuration. - use_ddp: bool = False - ddp_config: Optional[Dict] = None - - def __post_init__(self): - if self.use_ddp: - # Copied from https://github.com/pytorch/pytorch/blob/v2.0.0/torch/distributed/launcher/api.py#L29, adjusting to the defaults of torch.distributed.run - ddp_config = { - "min_nodes": 1, - "max_nodes": 1, - "nproc_per_node": "${device_count:}", - "run_id": "none", - "role": "default", - "rdzv_endpoint": "127.0.0.1:29500", - "rdzv_backend": "static", - "rdzv_configs": {"timeout": 900, "rank": 0}, - "max_restarts": 0, - "monitor_interval": 5, - # For the arguments below, the CLI torch.distributed.run matches with LaunchConfig defaults. 
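
For reference, the `ddp_config` defaults removed in this hunk map one-to-one onto `torch.distributed.launcher.api.LaunchConfig`, which is what ultimately consumes them when DDP training is launched (elsewhere in this patch the test config moves the `use_ddp` switch under the `backend:` key). The sketch below is illustrative only and not part of the patch; `training_entrypoint` is a hypothetical placeholder for the per-process training function.

# Illustrative sketch only (not part of this patch): how a ddp_config dict like the
# one removed above maps onto torch.distributed's elastic launcher.
from torch.distributed.launcher.api import LaunchConfig, elastic_launch


def training_entrypoint() -> None:
    # hypothetical per-process training function; one copy runs per local rank
    ...


launch_config = LaunchConfig(
    min_nodes=1,
    max_nodes=1,
    nproc_per_node=2,  # typically torch.cuda.device_count()
    run_id="none",
    role="default",
    rdzv_endpoint="127.0.0.1:29500",
    rdzv_backend="static",
    rdzv_configs={"timeout": 900, "rank": 0},
    max_restarts=0,
    monitor_interval=5,
)

# elastic_launch returns a dict mapping each local rank to its entrypoint's return value
results = elastic_launch(config=launch_config, entrypoint=training_entrypoint)()
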
- # start_method: str = "spawn" - # log_dir: Optional[str] = None - # redirects: Std = Std.NONE - # tee: Std = Std.NONE - # metrics_cfg: Dict[str, str] = field(default_factory=dict) - # local_addr: Optional[str] = None - } - if self.ddp_config is not None: - ddp_config.update(self.ddp_config) - self.ddp_config = ddp_config - class TrainingBenchmark(Benchmark): + name: str = "training" + config: TrainingConfig + def __init__(self): # initialize training results self.training_metrics: Dict[str, Any] = {} @@ -193,46 +69,34 @@ def __init__(self): def configure(self, config: TrainingConfig): super().configure(config) - self.dataset_shapes = config.dataset_shapes - self.training_arguments = config.training_arguments - def run(self, backend: "Backend") -> None: LOGGER.info("Running training benchmark") - model_shapes = backend.model_shapes - self.dataset_shapes = {**self.dataset_shapes, **model_shapes} - - self.dataset_generator = DatasetGenerator( - task=backend.task, - dataset_shapes=self.dataset_shapes, + task = backend.task + dataset_shapes = {**self.config.dataset_shapes, **backend.model_shapes} + dataset_generator = DatasetGenerator(task=task, dataset_shapes=dataset_shapes) + + training_dataset = dataset_generator.generate() + training_data_collator = get_data_collator(task=task) + training_callbacks = [MeasurementCallback(self.config.warmup_steps)] + + trainer_state = backend.train( + training_dataset=training_dataset, + training_callbacks=training_callbacks, + training_data_collator=training_data_collator, + training_arguments=self.config.training_arguments, ) - training_dataset = self.dataset_generator.generate() - - training_data_collator = get_data_collator( - task=backend.task, - ) - - if backend.config.name == "pytorch": - self.training_metrics = backend.run_pytorch_training( - training_config=self.config, - training_arguments=self.training_arguments, - training_dataset=training_dataset, - training_data_collator=training_data_collator, - ) - else: - backend.prepare_for_training( - training_dataset=training_dataset, - training_data_collator=training_data_collator, - training_arguments=self.training_arguments, - ) - training_output = backend.train() - - self.training_metrics = { - "training_throughput": training_output.metrics[ - "train_samples_per_second" - ], - "train_runtime": training_output.metrics["train_runtime"], - } + self.training_metrics = { + # warmup metrics + "warmup_runtime": trainer_state.warmup_runtime, + "warmup_throughput()": trainer_state.warmup_samples_per_second, + # training metrics + "train_runtime": trainer_state.train_runtime, + "training_throughput": trainer_state.train_samples_per_second, + # overall training metrics + "overall_train_runtime": trainer_state.overall_train_runtime, + "overall_training_throughput": trainer_state.overall_train_samples_per_second, + } def get_results_df(self) -> DataFrame: return DataFrame(self.training_metrics, index=[0]) @@ -241,19 +105,3 @@ def save(self) -> None: LOGGER.info("Saving training results") results_df = self.get_results_df() results_df.to_csv("training_results.csv") - - -def get_data_collator(task: str) -> callable: - if task == "object-detection": - return object_detection_data_collator - else: - return default_data_collator - - -def object_detection_data_collator(batch) -> Dict[str, torch.Tensor]: - pixel_values = torch.stack([example["pixel_values"] for example in batch]) - labels = [example["labels"] for example in batch] - return { - "pixel_values": pixel_values, - "labels": labels, - } diff --git 
a/optimum_benchmark/benchmarks/training_utils.py b/optimum_benchmark/benchmarks/training_utils.py new file mode 100644 index 000000000..097e06c22 --- /dev/null +++ b/optimum_benchmark/benchmarks/training_utils.py @@ -0,0 +1,103 @@ +from typing import Any, Dict, TYPE_CHECKING +from dataclasses import dataclass +import time + +from transformers import default_data_collator +from transformers import TrainerCallback + +if TYPE_CHECKING: + from transformers import TrainerState, TrainingArguments, TrainerControl + + +@dataclass +class MeasurementCallback(TrainerCallback): + warmup_steps: int + + def on_train_begin( + self, + args: "TrainingArguments", + state: "TrainerState", + control: "TrainerControl", + **kwargs, + ): + if state.max_steps <= self.warmup_steps: + # This check is here because max_steps is set only once the training + # is launched, thus we can not check before calling trainer.train(). + raise ValueError( + f"Total training steps {state.max_steps} is smaller " + "than the number of warmup steps {self.warmup_steps}. " + "Please increase the total number of steps (for example by " + "increasing the dataset size)." + ) + + state.warmup_start = time.time_ns() * 1e-9 + state.overall_train_start = time.time_ns() * 1e-9 + + def on_step_begin( + self, + args: "TrainingArguments", + state: "TrainerState", + control: "TrainerControl", + **kwargs, + ): + if state.global_step == self.warmup_steps: + state.warmup_end = time.time_ns() * 1e-9 + state.training_start = time.time_ns() * 1e-9 + elif state.global_step > state.max_steps - 1: + raise ValueError("global_step > state.max_steps - 1") + + def on_train_end( + self, + args: "TrainingArguments", + state: "TrainerState", + control: "TrainerControl", + **kwargs, + ): + state.training_end = time.time_ns() * 1e-9 + state.overall_train_end = time.time_ns() * 1e-9 + + state.total_train_batch_size = ( + args.train_batch_size * args.gradient_accumulation_steps * args.world_size + ) + + # warmup metrics + state.warmup_runtime = state.warmup_end - state.warmup_start + state.num_warmup_samples = self.warmup_steps * state.total_train_batch_size + state.warmup_samples_per_second = ( + state.num_warmup_samples / state.warmup_runtime + ) + # state.warmup_steps_per_second = self.warmup_steps / state.warmup_runtime + + # training metrics + state.train_runtime = state.training_end - state.training_start + state.num_train_steps = state.max_steps - self.warmup_steps + state.num_train_samples = state.num_train_steps * state.total_train_batch_size + state.train_samples_per_second = state.num_train_samples / state.train_runtime + # state.train_steps_per_second = state.num_train_steps / state.train_runtime + + # overall training metrics + state.overall_train_runtime = state.training_end - state.warmup_start + state.overall_train_samples_per_second = ( + state.num_train_samples / state.overall_train_runtime + ) + # state.overall_train_steps_per_second = ( + # state.num_train_steps / state.overall_train_runtime + # ) + + +def get_data_collator(task: str) -> callable: + if task == "object-detection": + return object_detection_data_collator + else: + return default_data_collator + + +def object_detection_data_collator(batch) -> Dict[str, Any]: + import torch + + pixel_values = torch.stack([example["pixel_values"] for example in batch]) + labels = [example["labels"] for example in batch] + return { + "pixel_values": pixel_values, + "labels": labels, + } diff --git a/optimum_benchmark/generators/dataset_generator.py b/optimum_benchmark/generators/dataset_generator.py 
index 56bcbfc6f..0d5f00e68 100644 --- a/optimum_benchmark/generators/dataset_generator.py +++ b/optimum_benchmark/generators/dataset_generator.py @@ -9,7 +9,7 @@ ) -LOGGER = getLogger("dummy_dataset") +LOGGER = getLogger("dataset_generator") class DatasetGenerator: @@ -23,7 +23,7 @@ def __init__( dataset_shapes["batch_size"] = dataset_shapes.pop("dataset_size") if task in TASKS_TO_GENERATORS: - LOGGER.info(f"Using {TASKS_TO_GENERATORS[task]} generator") + LOGGER.info(f"Using {task} task generator") self.task_generator = TASKS_TO_GENERATORS[task]( shapes=dataset_shapes, with_labels=True, @@ -32,16 +32,15 @@ def __init__( raise NotImplementedError( f"Task {task} is supported. \n" f"Available tasks: {list(TASKS_TO_GENERATORS.keys())}. \n" - "If you want to add support for this task, please submit a PR or a feature request to optimum-benchmark. \n" + "If you want to add support for this task, " + "please submit a PR or a feature request to optimum-benchmark. \n" ) def generate(self) -> Dataset: task_dataset = self.task_generator.generate() - - # TODO: we can move this to backend.prepare_for_training to avoid the torch dependency task_dataset = Dataset.from_dict(task_dataset) task_dataset.set_format( - type="torch", + type="numpy", columns=list(task_dataset.features.keys()), ) diff --git a/optimum_benchmark/generators/input_generator.py b/optimum_benchmark/generators/input_generator.py index e27f3a494..f384abb23 100644 --- a/optimum_benchmark/generators/input_generator.py +++ b/optimum_benchmark/generators/input_generator.py @@ -15,7 +15,7 @@ ) -LOGGER = getLogger("dummy_dataset") +LOGGER = getLogger("input_generator") class InputGenerator: @@ -49,9 +49,11 @@ def __init__( raise NotImplementedError( f"Neither task {task} nor model type {model_type} is supported. \n" f"Available tasks: {list(TASKS_TO_GENERATORS.keys())}. \n" - "If you want to add support for this task, please submit a PR or a feature request to optimum-benchmark. \n" + "If you want to add support for this task, " + "please submit a PR or a feature request to optimum-benchmark. \n" f"Available model types: {SUPPURTED_MODEL_TYPES}. \n" - "If you want to add support for this model type, please submit a PR or a feature request to optimum." + "If you want to add support for this model type, " + "please submit a PR or a feature request to optimum." 
) # TODO: we can drop the torch dependency here by returning a dict of numpy arrays diff --git a/optimum_benchmark/import_utils.py b/optimum_benchmark/import_utils.py index 23176229c..cc97c5397 100644 --- a/optimum_benchmark/import_utils.py +++ b/optimum_benchmark/import_utils.py @@ -3,16 +3,22 @@ _torch_available = importlib.util.find_spec("torch") is not None _onnxruntime_available = importlib.util.find_spec("onnxruntime") is not None _is_openvino_available = importlib.util.find_spec("openvino") is not None -_is_neural_compressor_available = importlib.util.find_spec("neural_compressor") is not None +_is_neural_compressor_available = ( + importlib.util.find_spec("neural_compressor") is not None +) + def is_torch_available(): return _torch_available + def is_onnxruntime_available(): return _onnxruntime_available + def is_openvino_available(): return _is_openvino_available + def is_neural_compressor_available(): - return _is_neural_compressor_available \ No newline at end of file + return _is_neural_compressor_available diff --git a/optimum_benchmark/preprocessors/glue.py b/optimum_benchmark/preprocessors/glue.py index ef18158eb..8e7413593 100644 --- a/optimum_benchmark/preprocessors/glue.py +++ b/optimum_benchmark/preprocessors/glue.py @@ -9,6 +9,5 @@ def __call__(self, examples): return self.tokenizer( examples["sentence"], padding="max_length", - max_length=128, truncation=True, ) diff --git a/optimum_benchmark/report.py b/optimum_benchmark/report.py index de31b0b7e..9e12d299e 100644 --- a/optimum_benchmark/report.py +++ b/optimum_benchmark/report.py @@ -16,7 +16,7 @@ def gather_inference_report(root_folder: Path) -> DataFrame: # key is path to inference file as string, value is dataframe inference_dfs = { f.parent.absolute().as_posix(): pd.read_csv(f) - for f in root_folder.glob(f"**/inference_results.csv") + for f in root_folder.glob("**/inference_results.csv") } # key is path to config file as string, value is flattened dict @@ -26,7 +26,7 @@ def gather_inference_report(root_folder: Path) -> DataFrame: flatten(OmegaConf.load(f), reducer="dot"), orient="index" ) .T - for f in root_folder.glob(f"**/hydra_config.yaml") + for f in root_folder.glob("**/hydra_config.yaml") if f.parent.absolute().as_posix() in inference_dfs.keys() } @@ -53,7 +53,7 @@ def style_element(element, style=""): def format_element(element, style=""): - if type(element) == float: + if isinstance(element, float): if element != element: # nan formated_element = "" elif abs(element) >= 1: @@ -64,7 +64,7 @@ def format_element(element, style=""): formated_element = f"{element}" elif element is None: formated_element = "" - elif type(element) == bool: + elif isinstance(element, bool): if element: formated_element = style_element("✔", style="green") else: @@ -295,7 +295,7 @@ def generate_report(): # create reporting directory and title using the filters if report_name is None: report_name = "Inference Report" - reporting_directory = f"reports/inferece_report" + reporting_directory = "reports/inferece_report" else: reporting_directory = f"reports/{report_name}" diff --git a/optimum_benchmark/trackers/latency.py b/optimum_benchmark/trackers/latency.py index 0c2e9812a..98dc93067 100644 --- a/optimum_benchmark/trackers/latency.py +++ b/optimum_benchmark/trackers/latency.py @@ -47,6 +47,7 @@ def _cpu_latency(self): LOGGER.debug(f"Tracked CPU latency: {latency:.2e}s") self.latencies.append(latency) + class PyTorchLatencyTracker(LatencyTracker): def __init__(self, backend): super().__init__(backend) @@ -58,7 +59,9 @@ def 
__init__(self, backend): self.hf_device_map = None self.end_device = self.device if self.device.type == "cuda": - self.device_indexes = {self.device.index if self.device.index is not None else 0} + self.device_indexes = { + self.device.index if self.device.index is not None else 0 + } def _cuda_latency(self): start_event = torch.cuda.Event(enable_timing=True) diff --git a/optimum_benchmark/trackers/memory.py b/optimum_benchmark/trackers/memory.py index 3cba23ca9..c126321a6 100644 --- a/optimum_benchmark/trackers/memory.py +++ b/optimum_benchmark/trackers/memory.py @@ -40,7 +40,8 @@ def _track_cuda_peak_memory(self): meminfo = nvml.nvmlDeviceGetMemoryInfo(handle) nvml.nvmlShutdown() - # At least for PyTorch, relying on meminfo.used is fine here as PyTorch does not deallocate its cache after running forward. + # At least for PyTorch, relying on meminfo.used is fine + # here as PyTorch does not deallocate its cache after running forward. self.peak_memory = max(self.peak_memory, meminfo.used) LOGGER.debug(f"Peak memory usage: {self.get_peak_memory()} MB") @@ -89,6 +90,7 @@ def run(self): self.connection.send(self.mem_usage) self.connection.close() + class PyTorchMemoryTracker(MemoryTracker): def __init__(self, backend): super().__init__(backend) @@ -97,7 +99,9 @@ def __init__(self, backend): self.hf_device_map = backend.pretrained_model.hf_device_map self.device_indexes = set(self.hf_device_map.values()) else: - self.device_indexes = {self.device.index if self.device.index is not None else 0} + self.device_indexes = { + self.device.index if self.device.index is not None else 0 + } # This variable is used only when CUDA device is used. self.peak_per_device = [0 for _ in range(len(self.device_indexes))] @@ -116,7 +120,7 @@ def _track_cuda_peak_memory(self): meminfo = nvml.nvmlDeviceGetMemoryInfo(handle) self.peak_per_device[i] = max(self.peak_per_device[i], meminfo.used) - + for i, peak_device in enumerate(self.peak_per_device): LOGGER.debug(f"Peak memory {i} usage: {peak_device * 1e-6} MB") diff --git a/optimum_benchmark/utils.py b/optimum_benchmark/utils.py index 16822ef5b..001c2f38f 100644 --- a/optimum_benchmark/utils.py +++ b/optimum_benchmark/utils.py @@ -1,7 +1,6 @@ from typing import Optional, List from logging import getLogger import subprocess -import importlib import platform import random import signal @@ -9,26 +8,17 @@ import re import os -from omegaconf import DictConfig import numpy as np import psutil -import torch LOGGER = getLogger("utils") def set_seed(seed: int) -> None: - # TODO: Should be devided into multiple functions - # each setting seeds for a backend random.seed(seed) np.random.seed(seed) os.environ["PYTHONHASHSEED"] = str(seed) - torch.manual_seed(seed) - torch.cuda.manual_seed_all(seed) - torch.backends.cudnn.deterministic = True - torch.backends.cudnn.benchmark = False - def bytes_to_mega_bytes(bytes: int) -> int: # Reference: https://en.wikipedia.org/wiki/Byte#Multiple-byte_units @@ -86,8 +76,8 @@ def check_no_process_is_running_on_cuda_device(device_ids: List[int]) -> None: if subprocess.check_output( [ "nvidia-smi", - f"--query-compute-apps=pid,used_memory", - f"--format=csv,noheader,nounits", + "--query-compute-apps=pid,used_memory", + "--format=csv,noheader,nounits", f"--id={device_id}", ] ) @@ -96,7 +86,8 @@ def check_no_process_is_running_on_cuda_device(device_ids: List[int]) -> None: ] ) - # TODO: It would be safer to run each run of a sweep in a subprocess. 
Although we can trust PyTorch to clear GPU memory when asked, + # TODO: It would be safer to run each run of a sweep in a subprocess. + # Although we can trust PyTorch to clear GPU memory when asked, # it is not a safe assumption to make for all backends. if len(pids_on_device_id) > 1 or ( len(pids_on_device_id) == 1 and os.getpid() not in pids_on_device_id @@ -138,8 +129,8 @@ def check_only_this_process_is_running_on_cuda_device( if subprocess.check_output( [ "nvidia-smi", - f"--query-compute-apps=pid,used_memory", - f"--format=csv,noheader,nounits", + "--query-compute-apps=pid,used_memory", + "--format=csv,noheader,nounits", f"--id={device_id}", ] ) @@ -148,7 +139,8 @@ def check_only_this_process_is_running_on_cuda_device( ] ) - # check if there is a process running on device_id that is not the current process + # check if there is a process running on + # device_id that is not the current process if len(pids_on_device_id) > 1: os.kill(pid, signal.SIGTERM) raise RuntimeError( @@ -161,58 +153,6 @@ def check_only_this_process_is_running_on_cuda_device( time.sleep(1) -# TODO: move this to onnxruntime backend, the only place using it -def infer_device_id(device: str) -> int: - """ - Infer the device id from the given device string. - """ - - if device == "cuda": - return torch.cuda.current_device() - elif torch.device(device).type == "cuda": - return torch.device(device).index - elif torch.device(device).type == "cpu": - return -1 - else: - raise ValueError(f"Unknown device '{device}'") - - -_NAME_TO_IMPORTPATH = { - "pytorch": "optimum_benchmark.backends.pytorch", - "openvino": "optimum_benchmark.backends.openvino", - "neural_compressor": "optimum_benchmark.backends.neural_compressor", - "onnxruntime": "optimum_benchmark.backends.onnxruntime", - "inference": "optimum_benchmark.benchmarks.inference", - "training": "optimum_benchmark.benchmarks.training", -} - -_NAME_TO_CLASS_NAME = { - "pytorch": "PyTorchConfig", - "openvino": "OVConfig", - "neural_compressor": "INCConfig", - "onnxruntime": "ORTConfig", - "inference": "InferenceConfig", - "training": "TrainingConfig", -} - - -def name_to_dataclass(name: str): - # We use a map name to import path to avoid importing everything here, especially every backend, to avoid to install all backends to run - # optimum-benchmark. 
- module = importlib.import_module(_NAME_TO_IMPORTPATH[name]) - dataclass_class = getattr(module, _NAME_TO_CLASS_NAME[name]) - return dataclass_class - - -def remap_to_correct_metadata(experiment: DictConfig): - for key, value in experiment.items(): - if isinstance(value, DictConfig) and hasattr(value, "name"): - experiment[key]._metadata.object_type = name_to_dataclass( - experiment[key].name - ) - return experiment - - DIFFUSION_TASKS = [ "stable-diffusion", "stable-diffusion-xl", diff --git a/tests/configs/base_config.yaml b/tests/configs/base_config.yaml index f691dc7c7..517f15c8b 100644 --- a/tests/configs/base_config.yaml +++ b/tests/configs/base_config.yaml @@ -6,7 +6,6 @@ defaults: - _self_ # for hydra 1.1 compatibility - override hydra/job_logging: colorlog # colorful logging - override hydra/hydra_logging: colorlog # colorful logging - - override hydra/launcher: joblib # hydra behavior configuration hydra: @@ -18,14 +17,6 @@ hydra: # we change the working directory during the run/sweep directory # this is useful for saving outputs in a separate directory chdir: true - launcher: - # we set the number of jobs to 2 since when using 1, joblib reuses the same process - n_jobs: 2 - prefer: processes - backend: multiprocessing - sweeper: - # now we force the sweeper to run one job at a time, achieving almost perfect isolation - max_batch_size: 1 backend: initial_isolation_check: false diff --git a/tests/configs/distributed_cuda_pytorch_training_bert_ddp.yaml b/tests/configs/distributed_cuda_pytorch_training_bert_ddp.yaml index cab23bb43..2db9b8661 100644 --- a/tests/configs/distributed_cuda_pytorch_training_bert_ddp.yaml +++ b/tests/configs/distributed_cuda_pytorch_training_bert_ddp.yaml @@ -4,15 +4,16 @@ defaults: - override benchmark: training experiment_name: distributed_cuda_pytorch_training_bert_ddp - -model: bert-base-uncased task: text-classification +model: bert-base-uncased device: cuda -benchmark: +backend: use_ddp: true + +benchmark: dataset_shapes: - dataset_size: 1200 + dataset_size: 120 sequence_length: 256 training_arguments: per_device_train_batch_size: 32 diff --git a/tests/test_cli.py b/tests/test_cli.py index 1ece9daf7..f9d4d39f7 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -32,7 +32,6 @@ def test_single_device_runs(config_file): "tests/configs", "--config-name", config_name, - # "--multirun", # usefull for isolation but makes debugging harder ], capture_output=True, ) @@ -55,7 +54,6 @@ def test_distributed_runs(config_file): "tests/configs", "--config-name", config_name, - # "--multirun", # usefull for isolation but makes debugging harder ], capture_output=True, env=my_env, From ea45d9267617456760178cfaefcc60f326d1556c Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 28 Aug 2023 04:41:37 +0200 Subject: [PATCH 4/8] now hydra can set cuda visible devices + better typing --- .gitignore | 1 + optimum_benchmark/backends/base.py | 205 +++---- .../backends/neural_compressor.py | 195 ------- .../{utils => neural_compressor}/__init__.py | 0 .../backends/neural_compressor/backend.py | 107 ++++ .../backends/neural_compressor/config.py | 88 +++ .../backends/neural_compressor/utils.py | 5 + optimum_benchmark/backends/onnxruntime.py | 505 ------------------ .../backends/onnxruntime/__init__.py | 0 .../backends/onnxruntime/backend.py | 332 ++++++++++++ .../backends/onnxruntime/config.py | 185 +++++++ .../backends/onnxruntime/utils.py | 40 ++ optimum_benchmark/backends/openvino.py | 190 ------- .../backends/openvino/__init__.py | 0 
.../backends/openvino/backend.py | 119 +++++ optimum_benchmark/backends/openvino/config.py | 64 +++ optimum_benchmark/backends/openvino/utils.py | 3 + .../backends/{utils => }/optimum_utils.py | 232 ++++---- optimum_benchmark/backends/pytorch.py | 451 ---------------- .../backends/pytorch/__init__.py | 0 optimum_benchmark/backends/pytorch/backned.py | 265 +++++++++ optimum_benchmark/backends/pytorch/config.py | 143 +++++ optimum_benchmark/backends/pytorch/utils.py | 35 ++ optimum_benchmark/backends/utils.py | 176 ++++++ .../backends/utils/base_utils.py | 92 ---- .../backends/utils/neural_compressor_utils.py | 39 -- .../backends/utils/onnxruntime_utils.py | 94 ---- .../backends/utils/openvino_utils.py | 14 - .../backends/utils/pytorch_utils.py | 78 --- optimum_benchmark/benchmarks/base.py | 18 +- optimum_benchmark/benchmarks/inference.py | 192 +++---- .../benchmarks/inference_utils.py | 37 -- optimum_benchmark/benchmarks/training.py | 29 +- .../benchmarks/training_utils.py | 103 ---- optimum_benchmark/benchmarks/utils.py | 87 +++ optimum_benchmark/env_utils.py | 38 ++ optimum_benchmark/experiment.py | 83 ++- .../generators/dataset_generator.py | 9 +- .../generators/input_generator.py | 50 +- .../generators/model_type_generator.py | 33 +- .../generators/task_generator.py | 27 +- optimum_benchmark/import_utils.py | 4 +- optimum_benchmark/profilers/fx_profiler.py | 13 +- optimum_benchmark/profilers/ort_profiler.py | 12 +- optimum_benchmark/report.py | 77 +-- optimum_benchmark/task_utils.py | 39 ++ optimum_benchmark/trackers/latency.py | 8 +- optimum_benchmark/trackers/memory.py | 28 +- optimum_benchmark/utils.py | 195 ------- pyproject.toml | 22 + requirements.txt | 1 + setup.py | 4 +- tests/configs/base_config.yaml | 17 +- ...stributed_cuda_pytorch_inference_gpt2.yaml | 5 +- ...ibuted_cuda_pytorch_training_bert_ddp.yaml | 8 +- ...ributed_cuda_pytorch_training_bert_dp.yaml | 2 +- tests/test_cli.py | 44 +- 57 files changed, 2212 insertions(+), 2631 deletions(-) delete mode 100644 optimum_benchmark/backends/neural_compressor.py rename optimum_benchmark/backends/{utils => neural_compressor}/__init__.py (100%) create mode 100644 optimum_benchmark/backends/neural_compressor/backend.py create mode 100644 optimum_benchmark/backends/neural_compressor/config.py create mode 100644 optimum_benchmark/backends/neural_compressor/utils.py delete mode 100644 optimum_benchmark/backends/onnxruntime.py create mode 100644 optimum_benchmark/backends/onnxruntime/__init__.py create mode 100644 optimum_benchmark/backends/onnxruntime/backend.py create mode 100644 optimum_benchmark/backends/onnxruntime/config.py create mode 100644 optimum_benchmark/backends/onnxruntime/utils.py delete mode 100644 optimum_benchmark/backends/openvino.py create mode 100644 optimum_benchmark/backends/openvino/__init__.py create mode 100644 optimum_benchmark/backends/openvino/backend.py create mode 100644 optimum_benchmark/backends/openvino/config.py create mode 100644 optimum_benchmark/backends/openvino/utils.py rename optimum_benchmark/backends/{utils => }/optimum_utils.py (67%) delete mode 100644 optimum_benchmark/backends/pytorch.py create mode 100644 optimum_benchmark/backends/pytorch/__init__.py create mode 100644 optimum_benchmark/backends/pytorch/backned.py create mode 100644 optimum_benchmark/backends/pytorch/config.py create mode 100644 optimum_benchmark/backends/pytorch/utils.py create mode 100644 optimum_benchmark/backends/utils.py delete mode 100644 optimum_benchmark/backends/utils/base_utils.py delete mode 100644 
optimum_benchmark/backends/utils/neural_compressor_utils.py delete mode 100644 optimum_benchmark/backends/utils/onnxruntime_utils.py delete mode 100644 optimum_benchmark/backends/utils/openvino_utils.py delete mode 100644 optimum_benchmark/backends/utils/pytorch_utils.py delete mode 100644 optimum_benchmark/benchmarks/inference_utils.py delete mode 100644 optimum_benchmark/benchmarks/training_utils.py create mode 100644 optimum_benchmark/benchmarks/utils.py create mode 100644 optimum_benchmark/env_utils.py create mode 100644 optimum_benchmark/task_utils.py delete mode 100644 optimum_benchmark/utils.py create mode 100644 pyproject.toml diff --git a/.gitignore b/.gitignore index 373be35de..dd49b40dd 100644 --- a/.gitignore +++ b/.gitignore @@ -159,6 +159,7 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ +.ruff_cache/ .vscode/ *.ipynb runs/ diff --git a/optimum_benchmark/backends/base.py b/optimum_benchmark/backends/base.py index 747e9ac37..6c71d0357 100644 --- a/optimum_benchmark/backends/base.py +++ b/optimum_benchmark/backends/base.py @@ -1,55 +1,52 @@ -from typing import Any, ClassVar, Dict, List, Optional, Union, TYPE_CHECKING -from multiprocessing import Process -from abc import abstractmethod, ABC -from dataclasses import dataclass -from logging import getLogger -import os import gc - - +import os +import random import shutil -from psutil import cpu_count -from diffusers import DiffusionPipeline -from optimum.exporters import TasksManager -from transformers import ( - AutoConfig, - AutoProcessor, - ProcessorMixin, - PreTrainedModel, - PretrainedConfig, - PreTrainedTokenizer, - ImageProcessingMixin, - FeatureExtractionMixin, +from abc import ABC +from dataclasses import dataclass +from logging import getLogger +from multiprocessing import Process +from typing import ( + TYPE_CHECKING, + Any, + Callable, + ClassVar, + Dict, + Generic, + List, + Optional, + TypeVar, + Union, ) +import numpy as np +import torch +from optimum.exporters import TasksManager +from psutil import cpu_count +from transformers import AutoConfig, AutoProcessor if TYPE_CHECKING: + from datasets import Dataset + from transformers import ( + Pipeline, + PretrainedConfig, + PreTrainedModel, + TrainerCallback, + TrainerState, + ) from transformers.utils import ModelOutput - from transformers import TrainerState + from .utils import PreTrainedProcessor -from .utils.base_utils import ( - extract_shapes_from_diffusion_pipeline, - extract_shapes_from_model_artifacts, -) -from ..utils import ( - DIFFUSION_TASKS, - TEXT_GENERATION_TASKS, +from ..task_utils import DIFFUSION_TASKS, TEXT_GENERATION_TASKS +from .utils import ( check_no_process_is_running_on_cuda_device, check_only_this_process_is_running_on_cuda_device, + extract_shapes_from_diffusion_pipeline, + extract_shapes_from_model_artifacts, ) -LOGGER = getLogger("backend") - -PreTrainedProcessor = Union[ - PreTrainedTokenizer, - ImageProcessingMixin, - FeatureExtractionMixin, - ProcessorMixin, -] - - @dataclass class BackendConfig(ABC): name: str @@ -57,6 +54,7 @@ class BackendConfig(ABC): _target_: str # backend options + seed: int = 42 inter_op_num_threads: Optional[int] = None intra_op_num_threads: Optional[int] = None @@ -77,19 +75,25 @@ def __post_init__(self): self.intra_op_num_threads = cpu_count() -class Backend(ABC): - name: str - config: ClassVar[BackendConfig] +LOGGER = getLogger("backend") + +BackendConfigT = TypeVar("BackendConfigT", bound=BackendConfig) + + +class 
Backend(Generic[BackendConfigT], ABC): + NAME: ClassVar[str] - pretrained_model: Union[PreTrainedModel, DiffusionPipeline] - pretrained_processor: Optional[PreTrainedProcessor] - pretrained_config: Optional[PretrainedConfig] + # instance variables withouth default values https://stackoverflow.com/a/44962662 + config: BackendConfigT + pretrained_model: Union["PreTrainedModel", "Pipeline"] + pretrained_processor: Optional["PreTrainedProcessor"] + pretrained_config: Optional["PretrainedConfig"] def __init__(self, model: str, task: str, device: str, hub_kwargs: Dict[str, Any]): - self.model = model self.task = task - self.device = device + self.model = model self.hub_kwargs = hub_kwargs + self.device = torch.device(device) if self.is_diffusion_pipeline(): # for pipelines @@ -99,8 +103,7 @@ def __init__(self, model: str, task: str, device: str, hub_kwargs: Dict[str, Any else: # for models self.pretrained_config = AutoConfig.from_pretrained( - pretrained_model_name_or_path=self.model, - **self.hub_kwargs, + pretrained_model_name_or_path=self.model, **self.hub_kwargs ) self.model_type = self.pretrained_config.model_type @@ -108,18 +111,15 @@ def __init__(self, model: str, task: str, device: str, hub_kwargs: Dict[str, Any # the processor sometimes contains information about the model's # input shapes that's not available in the config self.pretrained_processor = AutoProcessor.from_pretrained( - pretrained_model_name_or_path=self.model, - **self.hub_kwargs, + pretrained_model_name_or_path=self.model, **self.hub_kwargs ) except ValueError: LOGGER.warning("Could not find the model's preprocessor") self.pretrained_processor = None - # we're using this one as the default model_class which is used - # for exporting the model to onnx for example. Although does suppose that - # the model weights are pytorch weights so we might need to change somehow. self.automodel_class = TasksManager.get_model_class_for_task( task=self.task, + framework="pt", model_type=self.model_type, ) @@ -131,18 +131,15 @@ def is_diffusion_pipeline(self) -> bool: def check_initial_isolation(self) -> None: if self.device.type == "cuda": - cuda_devices = os.environ.get("CUDA_VISIBLE_DEVICES") + cuda_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None) if cuda_devices is None: LOGGER.warning( - "Asked to check the initial device isolation, " - "but the variable CUDA_VISIBLE_DEVICES was not set. " - "Defaulting to checking on the first device." + "Asked to check the initial device(s) isolation, but the variable CUDA_VISIBLE_DEVICES was not set. " + "Defaulting to checking the main device only." ) device_ids = {self.device.index if self.device.index is not None else 0} else: - device_ids = { - int(device_index) for device_index in cuda_devices.split(",") - } + device_ids = {int(device_index) for device_index in cuda_devices.split(",")} check_no_process_is_running_on_cuda_device(device_ids) def check_continuous_isolation(self) -> None: @@ -150,15 +147,12 @@ def check_continuous_isolation(self) -> None: cuda_devices = os.environ.get("CUDA_VISIBLE_DEVICES") if cuda_devices is None: LOGGER.warning( - "Asked to check the continuous device isolation, " - "but the variable CUDA_VISIBLE_DEVICES was not set. " - "Defaulting to checking on the first device." + "Asked to check the continuous device(s) isolation, but the variable CUDA_VISIBLE_DEVICES was not set. " + "Defaulting to checking the main device only." 
) device_ids = {self.device.index if self.device.index is not None else 0} else: - device_ids = { - int(device_index) for device_index in cuda_devices.split(",") - } + device_ids = {int(device_index) for device_index in cuda_devices.split(",")} self.isolation_thread = Process( target=check_only_this_process_is_running_on_cuda_device, @@ -167,23 +161,36 @@ def check_continuous_isolation(self) -> None: ) self.isolation_thread.start() - @abstractmethod - def configure(self, config: BackendConfig) -> None: - LOGGER.info(f"Configuring {config.name} backend") + def configure(self, config: BackendConfigT) -> None: + LOGGER.info(f"Configuring {self.NAME} backend") + # storing config self.config = config + # seeding backend + self.seed() + # isolation options if self.config.initial_isolation_check: - LOGGER.info("\t+ Checking initial device isolation") + LOGGER.info("\t+ Checking initial device(s) isolation") self.check_initial_isolation() if self.config.continous_isolation_check: - LOGGER.info("\t+ Checking contineous device isolation") + LOGGER.info("\t+ Checking contineous device(s) isolation") self.check_continuous_isolation() # clean up options if self.config.delete_cache: LOGGER.info("\t+ Model cache will be deleted after benchmark") + def seed(self) -> None: + # https://pytorch.org/docs/stable/notes/randomness.html + random.seed(self.config.seed) + np.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + torch.cuda.manual_seed_all(self.config.seed) # safe to call + # torch.use_deterministic_algorithms() # might throw an error + # torch.backends.cudnn.deterministic = True # same as above + # torch.backends.cudnn.benchmark = False # might reduce performance + # compiling in openvino requires input shapes def prepare_for_inference(self, input_shapes: Dict[str, int]) -> Dict[str, Any]: pass @@ -193,37 +200,20 @@ def prepare_for_profiling(self, input_names: List[str]) -> Dict[str, Any]: pass def forward(self, input: Dict[str, Any], **kwargs) -> "ModelOutput": - raise NotImplementedError("Backend must implement forward method") + return self.pretrained_model(**input, **kwargs) def generate(self, input: Dict[str, Any], **kwargs) -> "ModelOutput": - raise NotImplementedError("Backend must implement generate method") - - def train(self) -> "TrainerState": + return self.pretrained_model.generate(**input, **kwargs) + + def train( + self, + training_dataset: "Dataset", + training_arguments: Dict[str, Any], + training_callbacks: List["TrainerCallback"], + training_data_collator: Callable, + ) -> "TrainerState": raise NotImplementedError("Backend must implement train method") - def delete_pretrained_model(self) -> None: - try: - del self.pretrained_model - except AttributeError: - # benchmark might fail before the model is loaded - pass - - gc.collect() - - def delete_model_cache(self) -> None: - model_cache_path = "models--" + self.model.replace("/", "--") - model_cache_path = os.path.join( - os.path.expanduser("~/.cache/huggingface/hub"), model_cache_path - ) - shutil.rmtree(model_cache_path, ignore_errors=True) - - def clean(self) -> None: - LOGGER.info(f"Cleaning {self.config.name} backend") - self.delete_pretrained_model() - - if self.config.delete_cache: - self.delete_model_cache() - @property def model_shapes(self) -> Dict[str, int]: if self.is_diffusion_pipeline(): @@ -237,3 +227,22 @@ def model_shapes(self) -> Dict[str, int]: ) return model_shapes + + def delete_pretrained_model(self) -> None: + if hasattr(self, "pretrained_model"): + del self.pretrained_model + + gc.collect() + + 
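
To make the new generic base class above easier to follow, here is a minimal sketch of how a concrete backend would plug into it. It is illustrative only and not part of the patch: `DummyBackend`, `DummyConfig` and the `_target_` path are hypothetical names, while the `configure` pattern mirrors the one used by the real backends in this patch.

# Illustrative sketch only (not part of this patch): a minimal concrete backend
# built against the refactored Backend(Generic[BackendConfigT], ABC) base class.
from dataclasses import dataclass
from typing import Any, Dict

from optimum_benchmark.backends.base import Backend, BackendConfig


@dataclass
class DummyConfig(BackendConfig):
    name: str = "dummy"
    version: str = "0.0.0"
    _target_: str = "my_package.dummy_backend.DummyBackend"  # hypothetical import path


class DummyBackend(Backend[DummyConfig]):
    NAME = "dummy"

    def __init__(self, model: str, task: str, device: str, hub_kwargs: Dict[str, Any]) -> None:
        # the base class resolves the config, processor and automodel class for the task
        super().__init__(model, task, device, hub_kwargs)

    def configure(self, config: DummyConfig) -> None:
        # the base configure() stores the config, seeds the RNGs and runs the isolation checks
        super().configure(config)
        # load the model with the automodel class inferred by the base class
        self.pretrained_model = self.automodel_class.from_pretrained(self.model, **self.hub_kwargs)

    # forward() and generate() fall back to the base implementations,
    # which call self.pretrained_model directly
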
def delete_model_cache(self) -> None: + LOGGER.info("\t+ Deleting model cache") + model_cache_path = f"models/{self.model}".replace("/", "--") + model_cache_path = os.path.join(os.path.expanduser("~/.cache/huggingface/hub"), model_cache_path) + shutil.rmtree(model_cache_path, ignore_errors=True) + + def clean(self) -> None: + LOGGER.info(f"Cleaning {self.NAME} backend") + self.delete_pretrained_model() + + if self.config.delete_cache: + self.delete_model_cache() diff --git a/optimum_benchmark/backends/neural_compressor.py b/optimum_benchmark/backends/neural_compressor.py deleted file mode 100644 index a1ac95f73..000000000 --- a/optimum_benchmark/backends/neural_compressor.py +++ /dev/null @@ -1,195 +0,0 @@ -from typing import Dict, Optional, Any, TYPE_CHECKING -from tempfile import TemporaryDirectory -from dataclasses import dataclass -from logging import getLogger - -import torch -from torch import Tensor -from hydra.utils import get_class -from omegaconf import DictConfig, OmegaConf -from optimum.intel.neural_compressor.quantization import INCQuantizer -from optimum.intel.neural_compressor.utils import _HEAD_TO_AUTOMODELS -from neural_compressor import __version__ as neural_compressor_version -from neural_compressor.config import ( - AccuracyCriterion, - TuningCriterion, - PostTrainingQuantConfig, -) - -if TYPE_CHECKING: - from transformers.utils import ModelOutput - -from .base import Backend, BackendConfig -from .utils.neural_compressor_utils import ( - DEFAULT_QUANTIZATION_CONFIG, - DEFAULT_CALIBRATION_CONFIG, -) - - -LOGGER = getLogger("neural_compressor") - -OmegaConf.register_new_resolver("ptq_is_static", lambda approach: approach == "static") - - -@dataclass -class INCConfig(BackendConfig): - name: str = "neural_compressor" - version: str = neural_compressor_version - _target_: str = "optimum_benchmark.backends.neural_compressor.INCBackend" - - # export options - no_weights: bool = False - - # quantization options - quantization: bool = False - quantization_config: Optional[Dict[str, Any]] = None - - # calibration options - calibration: bool = False - calibration_config: Optional[Dict[str, Any]] = None - - def __post_init__(self): - if self.no_weights: - # TODO: implement no_weights for neural_compressor backend if possible - raise NotImplementedError( - "no_weights is not supported for neural_compressor backend" - ) - - if self.quantization: - self.quantization_config = OmegaConf.merge( - self.quantization_config if self.quantization_config else {}, - DEFAULT_QUANTIZATION_CONFIG, - ) - if self.calibration_config["approach"] == "static": - self.calibration = True - - if self.calibration: - self.calibration_config = OmegaConf.merge( - self.calibration_config if self.calibration_config else {}, - DEFAULT_CALIBRATION_CONFIG, - ) - - -class INCBackend(Backend): - name: str = "neural_compressor" - config: INCConfig - - def __init__( - self, model: str, task: str, device: str, hub_kwargs: DictConfig - ) -> None: - super().__init__(model, task, device, hub_kwargs) - self.device = torch.device(device) - - assert self.task in _HEAD_TO_AUTOMODELS, ( - f"INCBackend does not support task {self.task} yet. 
" - f"Supported tasks are: {list(_HEAD_TO_AUTOMODELS.keys())}" - ) - - self.incmodel_class = get_class( - f"optimum.intel.neural_compressor.{_HEAD_TO_AUTOMODELS[self.task]}" - ) - LOGGER.info( - f"\t+ Infered INCModel class {self.incmodel_class.__name__} " - f"for task {self.task} and model_type {self.model_type}" - ) - - def configure(self, config: INCConfig) -> None: - super().configure(config) - - if self.config.quantization: - self.config.quantization_config["accuracy_criterion"] = AccuracyCriterion( - **self.config.quantization_config["accuracy_criterion"] - ) - self.config.quantization_config["tuning_criterion"] = TuningCriterion( - **self.config.quantization_config["tuning_criterion"] - ) - self.quantization_config = PostTrainingQuantConfig( - **self.config.quantization_config - ) - - if self.config.calibration: - self.config.calibration_config["preprocess_class"] = get_class( - self.config.calibration_config["preprocess_class"] - ) - self.config.calibration_config[ - "preprocess_function" - ] = self.config.calibration_config["preprocess_class"]( - model_name_or_path=self.model - ) - self.config.calibration_config.pop("preprocess_class") - - with TemporaryDirectory() as tmpdirname: - if self.config.quantization: - self.load_and_quantize_automodel(tmpdirname) - else: - self.load_incmodel() - - def load_and_quantize_automodel(self, tmpdirname: str) -> None: - LOGGER.info("\t+ Loading pretrained AutoModel") - model = self.automodel_class.from_pretrained(self.model, **self.hub_kwargs) - LOGGER.info("\t+ Creating quantizer") - quantizer = INCQuantizer.from_pretrained( - model, - eval_fn=None, - calibration_fn=None, - task=self.task, - ) - - if self.config.calibration: - LOGGER.info("\t+ Loading calibration dataset") - calibration_dataset = quantizer.get_calibration_dataset( - **self.config.calibration_config - ) - else: - calibration_dataset = None - - LOGGER.info("\t+ Attempting quantization") - quantizer.quantize( - quantization_config=self.config.quantization_config, - save_directory=f"{tmpdirname}/quantized", - calibration_dataset=calibration_dataset, - # default values - batch_size=8, - data_collator=None, - remove_unused_columns=True, - file_name=None, - ) - - LOGGER.info("\t+ Loading quantized INCModel") - self.pretrained_model = self.incmodel_class.from_pretrained( - model_name_or_path=f"{tmpdirname}/quantized", - ) - - def load_incmodel(self) -> None: - if self.is_diffusion_pipeline(): - self.pretrained_model = self.incmodel_class.from_pretrained( - model_name_or_path=self.model, - **self.hub_kwargs, - ) - self.pretrained_model.to(self.device) - elif self.is_text_generation_model(): - self.pretrained_model = self.incmodel_class.from_pretrained( - # for some reason only causalLM expects - # model_id instead of model_name_or_path - model_id=self.model, - device_map=self.device, - **self.hub_kwargs, - ) - else: - self.pretrained_model = self.incmodel_class.from_pretrained( - # for some reason only causalLM expects - # model_id instead of model_name_or_path - model_name_or_path=self.model, - device_map=self.device, - **self.hub_kwargs, - ) - - def forward(self, input: Dict[str, Tensor], **kwargs) -> "ModelOutput": - output = self.pretrained_model(**input, **kwargs) - - return output - - def generate(self, input: Dict[str, Tensor], **kwargs) -> "ModelOutput": - output = self.pretrained_model.generate(**input, **kwargs) - - return output diff --git a/optimum_benchmark/backends/utils/__init__.py b/optimum_benchmark/backends/neural_compressor/__init__.py similarity index 100% rename 
from optimum_benchmark/backends/utils/__init__.py rename to optimum_benchmark/backends/neural_compressor/__init__.py diff --git a/optimum_benchmark/backends/neural_compressor/backend.py b/optimum_benchmark/backends/neural_compressor/backend.py new file mode 100644 index 000000000..5e35dffb4 --- /dev/null +++ b/optimum_benchmark/backends/neural_compressor/backend.py @@ -0,0 +1,107 @@ +from logging import getLogger +from tempfile import TemporaryDirectory +from typing import Any, Dict + +from hydra.utils import get_class +from neural_compressor.config import ( + AccuracyCriterion, + PostTrainingQuantConfig, + TuningCriterion, +) +from optimum.intel.neural_compressor.quantization import INCQuantizer + +from ..base import Backend +from .config import INCConfig +from .utils import TASKS_TO_INCMODELS + +LOGGER = getLogger("neural_compressor") + + +class INCBackend(Backend[INCConfig]): + NAME: str = "neural_compressor" + + def __init__(self, model: str, task: str, device: str, hub_kwargs: Dict[str, Any]) -> None: + super().__init__(model, task, device, hub_kwargs) + self.validate_device() + self.validate_task() + + self.incmodel_class = get_class(TASKS_TO_INCMODELS[self.task]) + LOGGER.info( + f"\t+ Infered INCModel {self.incmodel_class.__name__} for task {self.task} and model_type {self.model_type}" + ) + + def validate_device(self) -> None: + if self.device.type != "cpu": + raise ValueError(f"INCBackend only supports CPU devices, got {self.device.type}") + + def validate_task(self) -> None: + if self.task not in TASKS_TO_INCMODELS: + raise NotImplementedError(f"INCBackend does not support task {self.task}") + + def configure(self, config: INCConfig) -> None: + super().configure(config) + + self.tmpdir = TemporaryDirectory() + + if self.config.ptq_quantization: + self.load_automodel_from_pretrained() + self.quantize_automodel() + self.delete_pretrained_model() + + self.load_incmodel_from_pretrained() + + def load_automodel_from_pretrained(self) -> None: + LOGGER.info("\t+ Loading AutoModel") + self.pretrained_model = self.automodel_class.from_pretrained(self.model, **self.hub_kwargs) + + def load_incmodel_from_pretrained(self) -> None: + LOGGER.info("\t+ Loading INCModel") + self.pretrained_model = self.incmodel_class.from_pretrained(self.model, **self.hub_kwargs) + + def quantize_automodel(self) -> None: + LOGGER.info("\t+ Attempting to quantize model") + quantized_model_path = f"{self.tmpdir.name}/quantized" + LOGGER.info("\t+ Processing quantization config") + ptq_quantization_config = self.config.ptq_quantization_config.copy() + ptq_quantization_config["accuracy_criterion"] = AccuracyCriterion( + **ptq_quantization_config["accuracy_criterion"] + ) + ptq_quantization_config["tuning_criterion"] = TuningCriterion(**ptq_quantization_config["tuning_criterion"]) + ptq_quantization_config = PostTrainingQuantConfig(**ptq_quantization_config) + LOGGER.info("\t+ Creating quantizer") + quantizer = INCQuantizer.from_pretrained( + self.pretrained_model, + task=self.task, + seed=self.config.seed, + # TODO: add support for these + eval_fn=None, + calibration_fn=None, + ) + + if self.config.calibration: + LOGGER.info("\t+ Processing calibration config") + calibration_config = self.config.calibration_config.copy() + preprocess_class = get_class(calibration_config.pop("preprocess_class")) + calibration_config["preprocess_function"] = preprocess_class(model_name_or_path=self.model) + LOGGER.info("\t+ Loading calibration dataset") + calibration_dataset = quantizer.get_calibration_dataset(**calibration_config) + 
else: + calibration_dataset = None + + LOGGER.info("\t+ Quantizing model") + quantizer.quantize( + quantization_config=ptq_quantization_config, + save_directory=quantized_model_path, + calibration_dataset=calibration_dataset, + # TODO: add support for these + remove_unused_columns=True, + data_collator=None, + file_name=None, + batch_size=8, + ) + self.model = quantized_model_path + + def clean(self) -> None: + super().clean() + if hasattr(self, "tmpdir"): + self.tmpdir.cleanup() diff --git a/optimum_benchmark/backends/neural_compressor/config.py b/optimum_benchmark/backends/neural_compressor/config.py new file mode 100644 index 000000000..1a1fcb845 --- /dev/null +++ b/optimum_benchmark/backends/neural_compressor/config.py @@ -0,0 +1,88 @@ +import importlib.metadata +from dataclasses import dataclass, field +from typing import Any, Dict + +from omegaconf import OmegaConf + +from ..base import BackendConfig + +OmegaConf.register_new_resolver( + "neural_compressor_version", + lambda: importlib.metadata.version("neural_compressor"), +) + +# https://github.com/intel/neural-compressor/blob/master/neural_compressor/config.py#L490 +ACCURACY_CRITERION_CONFIG = { + "higher_is_better": True, + "criterion": "relative", + "tolerable_loss": 0.01, +} + +# https://github.com/intel/neural-compressor/blob/master/neural_compressor/config.py#L593 +TUNING_CRITERION_CONFIG = { + "strategy": "basic", + "strategy_kwargs": None, + "timeout": 0, + "max_trials": 100, + "objective": "performance", +} + +# https://github.com/intel/neural-compressor/blob/master/neural_compressor/config.py#L1242 +PTQ_QUANTIZATION_CONFIG = { + "device": "cpu", + "backend": "default", + "domain": "auto", + "recipes": {}, + "quant_format": "default", + "inputs": [], + "outputs": [], + "approach": "static", + "calibration_sampling_size": [100], + "op_type_dict": None, + "op_name_dict": None, + "reduce_range": None, + "example_inputs": None, + "excluded_precisions": [], + "quant_level": "auto", + "accuracy_criterion": ACCURACY_CRITERION_CONFIG, + "tuning_criterion": TUNING_CRITERION_CONFIG, + "diagnosis": False, +} + + +CALIBRATION_CONFIG = { + "dataset_name": "glue", + "num_samples": 300, + "dataset_config_name": "sst2", + "dataset_split": "train", + "preprocess_batch": True, + "preprocess_class": "optimum_benchmark.preprocessors.glue.GluePreprocessor", +} + + +@dataclass +class INCConfig(BackendConfig): + name: str = "neural_compressor" + version: str = "${neural_compressor_version:}" + _target_: str = "optimum_benchmark.backends.neural_compressor.backend.INCBackend" + + # post-training quantization options + ptq_quantization: bool = False + ptq_quantization_config: Dict[str, Any] = field(default_factory=dict) + + # calibration options + calibration: bool = False + calibration_config: Dict[str, Any] = field(default_factory=dict) + + def __post_init__(self): + if self.ptq_quantization: + self.ptq_quantization_config = OmegaConf.to_container( + OmegaConf.merge(PTQ_QUANTIZATION_CONFIG, self.ptq_quantization_config) + ) + if self.ptq_quantization_config["approach"] == "static" and not self.calibration: + raise ValueError("Calibration must be enabled when using static quantization.") + + if self.calibration: + self.calibration_config = OmegaConf.to_container( + OmegaConf.merge(CALIBRATION_CONFIG, self.calibration_config) + ) diff --git a/optimum_benchmark/backends/neural_compressor/utils.py b/optimum_benchmark/backends/neural_compressor/utils.py new file mode 100644 index 000000000..beb999771 --- /dev/null +++ 
b/optimum_benchmark/backends/neural_compressor/utils.py @@ -0,0 +1,5 @@ +from optimum.intel.neural_compressor.utils import _HEAD_TO_AUTOMODELS + +TASKS_TO_INCMODELS = { + task: f"optimum.intel.neural_compressor.{incmodel_name}" for task, incmodel_name in _HEAD_TO_AUTOMODELS.items() +} diff --git a/optimum_benchmark/backends/onnxruntime.py b/optimum_benchmark/backends/onnxruntime.py deleted file mode 100644 index 57e811706..000000000 --- a/optimum_benchmark/backends/onnxruntime.py +++ /dev/null @@ -1,505 +0,0 @@ -from typing import Any, Callable, Dict, List, Optional, TYPE_CHECKING -from tempfile import TemporaryDirectory -from dataclasses import dataclass -from logging import getLogger -from datasets import Dataset -import os - - -import torch -from torch import Tensor -from omegaconf import OmegaConf -from hydra.utils import get_class -from onnxruntime import SessionOptions -from accelerate import init_empty_weights -from optimum.pipelines import ORT_SUPPORTED_TASKS -from onnxruntime import __version__ as onnxruntime_version -from optimum.onnxruntime.configuration import ( - OptimizationConfig, - QuantizationConfig, - AutoCalibrationConfig, - AutoOptimizationConfig, - AutoQuantizationConfig, -) -from optimum.onnxruntime import ( - ORTOptimizer, - ORTQuantizer, - ORTTrainer, - ORTTrainingArguments, -) - -if TYPE_CHECKING: - from transformers import TrainerCallback, TrainerState - from transformers.modeling_outputs import ModelOutput - - -from .base import Backend, BackendConfig -from .utils.optimum_utils import main_export -from .utils.pytorch_utils import randomize_weights -from ..profilers.ort_profiler import ORTProfilingWrapper -from .utils.onnxruntime_utils import ( - format_ort_quantization_dict, - infer_device_id, - DEFAULT_OPTIMIZATION_CONFIG, - DEFAULT_QUANTIZATION_CONFIG, - DEFAULT_CALIBRATION_CONFIG, -) - - -OmegaConf.register_new_resolver( - "is_gpu", - lambda device: "cuda" in device.lower() or "tensorrt" in device.lower(), -) -OmegaConf.register_new_resolver( - "is_profiling", - lambda benchmark_name: benchmark_name == "profiling", -) -OmegaConf.register_new_resolver( - "infer_provider", - lambda device: f"{torch.device(device).type.upper()}ExecutionProvider", -) -OmegaConf.register_new_resolver( - "infer_device_id", - lambda device: infer_device_id(device), -) - -LOGGER = getLogger("onnxruntime") - - -@dataclass -class ORTConfig(BackendConfig): - name: str = "onnxruntime" - version: str = onnxruntime_version - _target_: str = "optimum_benchmark.backends.onnxruntime.ORTBackend" - - # export options - export: bool = True - no_weights: bool = False - use_merged: bool = False - use_cache: bool = True - torch_dtype: Optional[str] = None - - # provider options - provider: str = "${infer_provider:${device}}" - provider_options: Optional[Dict] = None - # TODO: deprecate device_id in favor of provider_options - device_id: Optional[int] = "${infer_device_id:${device}}" - - # inference options - use_io_binding: bool = "${is_gpu:${device}}" - session_options: Optional[Dict] = None - # TODO: deprecate enable_profiling in favor of session_options - enable_profiling: bool = "${is_profiling:${benchmark.name}}" - - # optimization options - optimization: bool = False - optimization_config: Optional[Dict] = None - - # O1, O2, O3, O4 - auto_optimization: Optional[str] = None - auto_optimization_config: Optional[Dict] = None - - # quantization options - quantization: bool = False - quantization_config: Optional[Dict] = None - - # arm64,avx2,avx512,avx512_vnni,tensorrt - auto_quantization: 
Optional[str] = None - auto_quantization_config: Optional[Dict] = None - - # calibration options - calibration: bool = False - calibration_config: Optional[Dict] = None - - # this will skip exporting the model and will use automodel with trainer - use_ortmodel: bool = "${is_inference:${benchmark.name}}" - - def __post_init__(self): - if self.optimization: - self.optimization_config = OmegaConf.merge( - self.optimization_config or {}, - DEFAULT_OPTIMIZATION_CONFIG, - ) - - if self.auto_optimization is not None: - self.auto_optimization_config = OmegaConf.merge( - self.auto_optimization_config or {}, - DEFAULT_OPTIMIZATION_CONFIG, - ) - self.auto_optimization_config.pop("optimization_level", None) - self.auto_optimization_config[ - "for_gpu" - ] = self.auto_optimization_config.pop("optimize_for_gpu") - - if self.quantization: - self.quantization_config = OmegaConf.merge( - self.quantization_config or {}, - DEFAULT_QUANTIZATION_CONFIG, - ) - - # auto quantization is needs specific config for each type - # if self.auto_quantization is not None: - # self.auto_quantization_config = OmegaConf.merge( - # self.auto_quantization_config or {}, - # DEFAULT_QUANTIZATION_CONFIG, - # ) - - if self.quantization_config is not None: - self.calibration = self.quantization_config["is_static"] - - if self.auto_quantization_config is not None: - self.calibration = self.auto_quantization_config["is_static"] - - if self.calibration: - self.calibration_config = OmegaConf.merge( - self.calibration_config or {}, - DEFAULT_CALIBRATION_CONFIG, - ) - - if self.device_id is not None: - LOGGER.warning( - "device_id is deprecated, please use provider_options instead" - ) - self.provider_options = OmegaConf.merge( - self.provider_options or {}, - {"device_id": self.device_id}, - ) - - if self.enable_profiling is not None: - LOGGER.warning( - "enable_profiling is deprecated, please use session_options instead" - ) - self.session_options = OmegaConf.merge( - self.session_options or {}, - {"enable_profiling": self.enable_profiling}, - ) - - -class ORTBackend(Backend): - name: str = "onnxruntime" - config: ORTConfig - - def __init__( - self, model: str, task: str, device: str, hub_kwargs: Dict[str, Any] - ) -> None: - super().__init__(model, task, device, hub_kwargs) - self.device = torch.device(device) - - if self.task == "stable-diffusion": - self.ortmodel_class = get_class( - "optimum.onnxruntime.ORTStableDiffusionPipeline" - ) - elif self.task == "stable-diffusion-xl": - self.ortmodel_class = get_class( - "optimum.onnxruntime.ORTStableDiffusionXLPipeline" - ) - elif self.task in ORT_SUPPORTED_TASKS: - self.ortmodel_class = ORT_SUPPORTED_TASKS[self.task]["class"][0] - else: - raise ValueError(f"Unsupported task {self.task}") - - LOGGER.info( - f"\t+ Infered ORTModel class {self.ortmodel_class.__name__} " - f"for task {self.task} and model_type {self.model_type}" - ) - - def configure(self, config: ORTConfig) -> None: - super().configure(config) - - # session options - session_options = SessionOptions() - if self.config.intra_op_num_threads is not None: - LOGGER.info( - f"\t+ Setting intra_op_num_threads({config.intra_op_num_threads})" - ) - self.config.session_options.intra_op_num_threads = ( - self.config.intra_op_num_threads - ) - if self.config.inter_op_num_threads is not None: - LOGGER.info( - f"\t+ Setting inter_op_num_threads({config.inter_op_num_threads})" - ) - self.config.session_options.inter_op_num_threads = ( - self.config.inter_op_num_threads - ) - for key, value in self.config.session_options.items(): - 
setattr(session_options, key, value) - self.config.session_options = session_options - - # Set torch dtype - self.config.torch_dtype = ( - getattr(torch, self.config.torch_dtype) # in case of torch.dtype - if self.config.torch_dtype is not None - and hasattr(torch, self.config.torch_dtype) - else self.config.torch_dtype - ) - - with TemporaryDirectory() as tmpdirname: - if self.config.use_ortmodel: - if self.config.no_weights: - self.load_ortmodel_from_config(tmpdirname) - else: - self.load_ortmodel_from_pretrained(tmpdirname) - else: - if self.config.no_weights: - self.load_automodel_from_config() - else: - self.load_automodel_from_pretrained() - - def load_ortmodel_from_config(self, tmpdirname: str) -> None: - LOGGER.info("\t+ Creating random weights model") - self.load_automodel_from_config() - - LOGGER.info("\t+ Exporting model to onnx") - main_export( - model_name_or_path=self.model, - output=f"{tmpdirname}/exported_model", - # with "auto" the taks manager will infer the same task - # we're using but will add "-with-past" when possible - task="auto", - device=self.device.type, - fp16=self.config.torch_dtype == torch.float16, - optimize=self.config.auto_optimization, - no_post_process=not self.config.use_merged, - do_validation=False, - **self.hub_kwargs, - # we hijack the model instantiation and use our random weights model - model=self.pretrained_model, - ) - self.delete_pretrained_model() - - LOGGER.info("\t+ Loading exported model with ORTModel") - self.pretrained_model = self.ortmodel_class.from_pretrained( - model_id=f"{tmpdirname}/exported_model", - session_options=self.config.session_options, - use_io_binding=self.config.use_io_binding, - provider=self.config.provider, - provider_options=self.config.provider_options, - **( - { - "use_merged": self.config.use_merged, - "use_cache": self.config.use_cache, - } - if self.is_text_generation_model() - else {} - ), - export=False, - **self.hub_kwargs, - ) - - if self.config.optimization: - raise NotImplementedError( - "Only AutoOptimization is supported when " - "loading a model with random weights" - ) - - if self.config.quantization or self.config.auto_quantization is not None: - self.quantize(tmpdirname) - - def load_ortmodel_from_pretrained(self, tmpdirname: str) -> None: - if ( - self.config.torch_dtype is not None - and self.config.torch_dtype != torch.float32 - ): - raise NotImplementedError( - "Loading with ORTModel is only supported " - "with torch_dtype float32 for now" - ) - - self.pretrained_model = self.ortmodel_class.from_pretrained( - model_id=self.model, - session_options=self.config.session_options, - use_io_binding=self.config.use_io_binding, - provider=self.config.provider, - provider_options=self.config.provider_options, - export=self.config.export, - **( - { - "use_merged": self.config.use_merged, - "use_cache": self.config.use_cache, - } - if self.is_text_generation_model() - else {} - ), - **self.hub_kwargs, - ) - - if self.config.optimization or self.config.auto_optimization is not None: - self.optimize(tmpdirname) - - if self.config.quantization or self.config.auto_quantization is not None: - self.quantize(tmpdirname) - - def optimize(self, tmpdirname: str) -> None: - if self.config.auto_optimization is not None: - LOGGER.info(f"\t+ Using auto optimization {self.config.auto_optimization}") - optimization_dict = OmegaConf.to_container( - self.config.auto_optimization_config, resolve=True - ) - LOGGER.info("\t+ Setting auto optimization parameters:") - for key, value in optimization_dict.items(): # type: ignore 
- LOGGER.info(f"\t\t+ {key}: {value}") - - optimization_config = AutoOptimizationConfig.with_optimization_level( - optimization_level=self.config.auto_optimization, **optimization_dict - ) - else: - optimization_dict = OmegaConf.to_container( - self.config.optimization_config, resolve=True - ) - LOGGER.info("\t+ Setting optimization parameters:") - for key, value in optimization_dict.items(): # type: ignore - LOGGER.info(f"\t\t+ {key}: {value}") - optimization_config = OptimizationConfig(**optimization_dict) - - LOGGER.info("\t+ Attempting optimization") - optimizer = ORTOptimizer.from_pretrained(self.pretrained_model) - optimizer.optimize( - save_dir=f"{tmpdirname}/optimized", - optimization_config=optimization_config, - ) - self.delete_pretrained_model() - - LOGGER.info("\t+ Loading optimized model") - self.pretrained_model = self.ortmodel_class.from_pretrained( - model_id=f"{tmpdirname}/optimized", - session_options=self.config.session_options, - use_io_binding=self.config.use_io_binding, - provider=self.config.provider, - provider_options=self.config.provider_options, - ) - - def quantize(self, tmpdirname: str) -> None: - if self.config.auto_quantization is not None: - LOGGER.info(f"\t+ Using auto quantization {self.config.auto_quantization}") - auto_quantization_config_class = getattr( - AutoQuantizationConfig, self.config.auto_quantization - ) - quantization_dict = OmegaConf.to_container( - self.config.auto_quantization_config, resolve=True - ) - quantization_dict = format_ort_quantization_dict(quantization_dict) - quantization_config = auto_quantization_config_class(**quantization_dict) - - else: - LOGGER.info("\t+ Using manual quantization") - quantization_dict = OmegaConf.to_container( - self.config.quantization_config, resolve=True - ) - quantization_dict = format_ort_quantization_dict(quantization_dict) - quantization_config = QuantizationConfig(**quantization_dict) - - LOGGER.info("\t+ Attempting quantization") - model_dir = self.pretrained_model.model_save_dir - components = [file for file in os.listdir(model_dir) if file.endswith(".onnx")] - for component in components: - LOGGER.info(f"\t+ Quantizing {component}") - quantizer = ORTQuantizer.from_pretrained(model_dir, file_name=component) - - if self.config.calibration: - preprocess_class = get_class( - self.config.calibration_config.preprocess_class - ) - preprocess_function = preprocess_class(model_name_or_path=self.model) - - calibration_dataset = quantizer.get_calibration_dataset( - dataset_name=self.config.calibration_config.dataset_name, - num_samples=self.config.calibration_config.num_samples, - dataset_config_name=self.config.calibration_config.dataset_config_name, - dataset_split=self.config.calibration_config.dataset_split, - preprocess_function=preprocess_function, - ) - - # Create the calibration configuration - # containing the parameters related to calibration. 
- calibration_config = AutoCalibrationConfig.minmax(calibration_dataset) - - # Perform the calibration step: - # computes the activations quantization ranges - calibration_tensors_range = quantizer.fit( - dataset=calibration_dataset, - calibration_config=calibration_config, - operators_to_quantize=quantization_config.operators_to_quantize, - ) - - quantizer.quantize( - save_dir=f"{tmpdirname}/quantized", - calibration_tensors_range=calibration_tensors_range, - quantization_config=quantization_config, - ) - self.delete_pretrained_model() - - LOGGER.info("\t+ Loading quantized model") - self.pretrained_model = self.ortmodel_class.from_pretrained( - model_id=f"{tmpdirname}/quantized", - session_options=self.config.session_options, - use_io_binding=self.config.use_io_binding, - provider=self.config.provider, - provider_options=self.config.provider_options, - ) - - def load_automodel_from_config(self) -> None: - with init_empty_weights(): - self.pretrained_model = self.automodel_class.from_config( - config=self.pretrained_config, - torch_dtype=self.config.torch_dtype, - trust_remote_code=self.hub_kwargs.get("trust_remote_code", False), - ) - self.pretrained_model.to_empty(device=self.device) - randomize_weights(self.pretrained_model) - - def load_automodel_from_pretrained(self) -> None: - with self.device: - self.pretrained_model = self.automodel_class.from_pretrained( - pretrained_model_name_or_path=self.model, - torch_dtype=self.config.torch_dtype, - **self.hub_kwargs, - ) - - def prepare_for_profiling(self, input_names: List[str]) -> None: - LOGGER.info("Preparing model for profiling") - LOGGER.info("\t+ Wrapping model inside profiler") - self.pretrained_model = ORTProfilingWrapper(self.pretrained_model) - - def forward(self, input: Dict[str, Tensor], **kwargs) -> "ModelOutput": - output = self.pretrained_model(**input, **kwargs) - - return output - - def generate(self, input: Dict[str, Tensor], **kwargs) -> "ModelOutput": - output = self.pretrained_model.generate(**input, **kwargs) - return output - - def train( - self, - training_dataset: "Dataset", - training_arguments: Dict[str, Any], - training_callbacks: List["TrainerCallback"], - training_data_collator: Callable, - ) -> "TrainerState": - LOGGER.info("\t+ Setting dataset format to `torch`.") - training_dataset.set_format( - type="torch", columns=list(training_dataset.features.keys()) - ) - - LOGGER.info( - "\t+ Wrapping training arguments with " - "optimum.onnxruntime.ORTTrainingArguments" - ) - training_arguments = ORTTrainingArguments(**training_arguments) - - LOGGER.info("\t+ Wrapping model with optimum.onnxruntime.ORTTrainer") - trainer = ORTTrainer( - model=self.pretrained_model, - args=training_arguments, - callbacks=training_callbacks, - train_dataset=training_dataset, - data_collator=training_data_collator, - ) - - LOGGER.info("\t+ Starting training") - trainer.train() - LOGGER.info("\t+ Training finished successfully") - trainer_state = trainer.state - - return trainer_state diff --git a/optimum_benchmark/backends/onnxruntime/__init__.py b/optimum_benchmark/backends/onnxruntime/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/optimum_benchmark/backends/onnxruntime/backend.py b/optimum_benchmark/backends/onnxruntime/backend.py new file mode 100644 index 000000000..a77ad8ad9 --- /dev/null +++ b/optimum_benchmark/backends/onnxruntime/backend.py @@ -0,0 +1,332 @@ +import os +from logging import getLogger +from tempfile import TemporaryDirectory +from typing import TYPE_CHECKING, Any, Callable, Dict, List + 
+import torch +from accelerate import init_empty_weights +from hydra.utils import get_class +from onnxruntime import SessionOptions +from optimum.onnxruntime import ( + ONNX_DECODER_NAME, + ONNX_DECODER_WITH_PAST_NAME, + ORTOptimizer, + ORTQuantizer, + ORTTrainer, + ORTTrainingArguments, +) +from optimum.onnxruntime.configuration import ( + AutoCalibrationConfig, + AutoOptimizationConfig, + AutoQuantizationConfig, + OptimizationConfig, + QuantizationConfig, +) + +if TYPE_CHECKING: + from datasets import Dataset + from transformers import TrainerCallback, TrainerState + +from ...profilers.ort_profiler import ORTProfilingWrapper +from ..base import Backend +from ..optimum_utils import main_export +from ..pytorch.utils import randomize_weights +from .config import ORTConfig +from .utils import TASKS_TO_ORTMODELS, TASKS_TO_ORTSD, format_quantization_config + +LOGGER = getLogger("onnxruntime") + + +class ORTBackend(Backend[ORTConfig]): + NAME: str = "onnxruntime" + + def __init__(self, model: str, task: str, device: str, hub_kwargs: Dict[str, Any]) -> None: + super().__init__(model, task, device, hub_kwargs) + self.validate_device() + self.validate_task() + + if self.is_diffusion_pipeline(): + self.ortmodel_class = get_class(TASKS_TO_ORTSD[self.task]) + elif self.task in TASKS_TO_ORTMODELS: + self.ortmodel_class = TASKS_TO_ORTMODELS[self.task] + + ortmodel_name = self.ortmodel_class.__name__ + LOGGER.info( + f"\t+ Inferred ORTModel class {ortmodel_name} for task {self.task} and model_type {self.model_type}" + ) + + def validate_device(self) -> None: + if self.device.type not in ["cpu", "cuda"]: + raise ValueError(f"ORTBackend only supports CPU and CUDA devices, got {self.device.type}") + + def validate_task(self) -> None: + if self.task not in TASKS_TO_ORTMODELS and self.task not in TASKS_TO_ORTSD: + raise NotImplementedError(f"ORTBackend does not support task {self.task}") + + def configure(self, config: ORTConfig) -> None: + super().configure(config) + + # Process torch dtype + self.torch_dtype = getattr(torch, self.config.torch_dtype) if self.config.torch_dtype is not None else None + + ###### Training with ORTModule ###### + # ort-training is basically a different package so we might need to separate these two backends in the future + if not self.config.use_inference_session: + if self.config.no_weights: + self.load_automodel_from_config() + else: + self.load_automodel_from_pretrained() + return + + ###### Inference with ORTModelForxxx ###### + # Inference session options + self.session_options = SessionOptions() + for key, value in self.config.session_options.items(): + setattr(self.session_options, key, value) + + # Exporting, optimizing, post-processing and quantizing with ORTModelForxxx + self.tmpdir = TemporaryDirectory() + + # Some statefulness to handle the different combinations of options + self.export = self.config.export + self.use_merged = self.config.use_merged + + if self.is_diffusion_pipeline(): + self.load_ortmodel() + # early exit because nothing of the following can be applied to diffusion pipelines + return + + if self.config.no_weights: + self.load_automodel_from_config() # creates dummy automodel + self.export_automodel() # exports automodel + self.export = False + else: + if self.config.export: + self.use_merged = False # merging is handled separately + self.load_automodel_from_pretrained() # creates automodel from pretrained + self.export_automodel() # exports automodel + self.export = False + + self.delete_pretrained_model() # deletes automodel + + if
self.config.auto_optimization or self.config.optimization: + self.optimize_onnx_files() + + if self.config.use_merged: + self.merge_onnx_files() + self.use_merged = True + + if self.config.auto_quantization or self.config.quantization: + self.quantize_onnx_files() + + self.load_ortmodel() + self.tmpdir.cleanup() + + def load_automodel_from_config(self) -> None: + LOGGER.info("\t+ Loading AutoModel from config") + with init_empty_weights(): + self.pretrained_model = self.automodel_class.from_config( + self.pretrained_config, + torch_dtype=self.torch_dtype, + trust_remote_code=self.hub_kwargs.get("trust_remote_code", False), + ) + self.pretrained_model.to_empty(device=self.device) + randomize_weights(self.pretrained_model) + + def load_automodel_from_pretrained(self) -> None: + LOGGER.info("\t+ Loading AutoModel from pretrained") + with self.device: + self.pretrained_model = self.automodel_class.from_pretrained( + self.model, + torch_dtype=self.torch_dtype, + **self.hub_kwargs, + ) + + def load_ortmodel(self) -> None: + LOGGER.info("\t+ Loading ORTModel") + self.pretrained_model = self.ortmodel_class.from_pretrained( + self.model, + export=self.export, + provider=self.config.provider, + session_options=self.session_options, + use_io_binding=self.config.use_io_binding, + provider_options=self.config.provider_options, + **self.ortmodel_kwargs, + **self.hub_kwargs, + ) + # exported or not, the onnx model is/was here + self.model = self.pretrained_model.model_save_dir + + @property + def ortmodel_kwargs(self) -> Dict[str, Any]: + if self.is_text_generation_model(): + return {"use_cache": self.config.use_cache, "use_merged": self.use_merged} + else: + return {} + + @property + def true_task(self) -> str: + return self.task + "-with-past" if self.config.use_cache and self.is_text_generation_model() else self.task + + def export_automodel(self) -> None: + LOGGER.info("\t+ Exporting AutoModel to ONNX") + exported_model_dir = f"{self.tmpdir.name}/exported_model" + self.merging_config, self.models_and_onnx_configs = main_export( + self.model, + output=exported_model_dir, + task=self.true_task, + device=self.device.type, + fp16=self.torch_dtype == torch.float16, + **self.hub_kwargs, + # we hijack the model instantiation and use our random weights model + model=self.pretrained_model, + ) + self.model = exported_model_dir + + def merge_onnx_files(self) -> None: + LOGGER.info("\t+ Post-processing the exported model") + self.merging_config.post_process_exported_models(self.model, self.models_and_onnx_configs, None) + + @property + def onnx_files_names(self): + assert os.path.isdir(self.model), f"{self.model} is not a directory" + return [file for file in os.listdir(self.model) if file.endswith(".onnx")] + + def optimize_onnx_files(self) -> None: + LOGGER.info("\t+ Attempting optimization") + optimized_model_path = f"{self.tmpdir.name}/optimized" + LOGGER.info("\t+ Processing optimization config") + if self.config.auto_optimization is not None: + optimization_config = AutoOptimizationConfig.with_optimization_level( + optimization_level=self.config.auto_optimization, **self.config.auto_optimization_config + ) + elif self.config.optimization: + optimization_config = OptimizationConfig(**self.config.optimization_config) + LOGGER.info("\t+ Creating optimizer") + optimizer = ORTOptimizer.from_pretrained(self.model, file_names=self.onnx_files_names) + LOGGER.info("\t+ Optimizing ORTModel") + optimizer.optimize( + optimization_config, + save_dir=optimized_model_path, + file_suffix="", + # TODO: add support for 
these + use_external_data_format=None, + one_external_file=True, + ) + self.model = optimized_model_path + + @property + def onnx_files_names_to_quantize(self): + assert os.path.isdir(self.model), f"{self.model} is not a directory" + if self.config.use_merged: + # we filter merging components since they're not used for inference + # this also allows for calibration of one merged component models (like gpt2) + return [ + model + for model in self.onnx_files_names + if model not in [ONNX_DECODER_NAME, ONNX_DECODER_WITH_PAST_NAME] + ] + else: + return self.onnx_files_names + + def quantize_onnx_files(self) -> None: + LOGGER.info("\t+ Attempting quantization") + quantized_model_path = f"{self.tmpdir.name}/quantized" + LOGGER.info("\t+ Processing quantization config") + if self.config.calibration and len(self.onnx_files_names_to_quantize) > 1: + raise NotImplementedError("Calibration is not supported for models with multiple components") + if self.config.auto_quantization is not None: + self.config.auto_quantization_config = format_quantization_config(self.config.auto_quantization_config) + auto_quantization_config_class = getattr(AutoQuantizationConfig, self.config.auto_quantization) + quantization_config = auto_quantization_config_class(**self.config.auto_quantization_config) + elif self.config.quantization: + self.config.quantization_config = format_quantization_config(self.config.quantization_config) + quantization_config = QuantizationConfig(**self.config.quantization_config) + LOGGER.info(f"\t+ Model has {len(self.onnx_files_names_to_quantize)} components to quantize") + if len(self.onnx_files_names_to_quantize) == 1: + LOGGER.info("\t+ Creating quantizer") + quantizer = ORTQuantizer.from_pretrained(self.model, file_name=self.onnx_files_names_to_quantize[0]) + if self.config.calibration: + LOGGER.info("\t+ Processing calibration config") + preprocess_class = get_class(self.config.calibration_config.pop("preprocess_class")) + self.config.calibration_config["preprocess_function"] = preprocess_class(model_name_or_path=self.model) + LOGGER.info("\t+ Loading calibration dataset") + calibration_dataset = quantizer.get_calibration_dataset(**self.config.calibration_config) + LOGGER.info("\t+ Creating calibration config") + calibration_config = AutoCalibrationConfig.minmax(calibration_dataset) + LOGGER.info("\t+ Fitting calibration tensors range") + calibration_tensors_range = quantizer.fit( + dataset=calibration_dataset, + calibration_config=calibration_config, + operators_to_quantize=quantization_config.operators_to_quantize, + use_gpu=self.device.type == "cuda", + # TODO: add support for these + batch_size=1, + use_external_data_format=False, + force_symmetric_range=False, + ) + else: + calibration_tensors_range = None + LOGGER.info("\t+ Quantizing model") + quantizer.quantize( + save_dir=quantized_model_path, + quantization_config=quantization_config, + calibration_tensors_range=calibration_tensors_range, + # TODO: add support for these + use_external_data_format=False, + preprocessor=None, + ) + else: + for onnx_file_name_to_quantize in self.onnx_files_names_to_quantize: + LOGGER.info(f"\t+ Creating quantizer for {onnx_file_name_to_quantize}") + quantizer = ORTQuantizer.from_pretrained(self.model, file_name=onnx_file_name_to_quantize) + LOGGER.info(f"\t+ Quantizing {onnx_file_name_to_quantize}") + quantizer.quantize( + save_dir=quantized_model_path, + quantization_config=quantization_config, + calibration_tensors_range=None, + file_suffix="", + # TODO: add support for these + 
use_external_data_format=False, + preprocessor=None, + ) + self.model = quantized_model_path + + def prepare_for_profiling(self, input_names: List[str]) -> None: + LOGGER.info("Preparing model for profiling") + LOGGER.info("\t+ Wrapping model inside profiler") + self.pretrained_model = ORTProfilingWrapper(self.pretrained_model) + + def train( + self, + training_dataset: "Dataset", + training_data_collator: Callable, + training_arguments: Dict[str, Any], + training_callbacks: List["TrainerCallback"], + ) -> "TrainerState": + LOGGER.info("\t+ Setting dataset format to `torch`") + training_dataset.set_format(type="torch", columns=list(training_dataset.features.keys())) + LOGGER.info("\t+ Wrapping training arguments with optimum.onnxruntime.ORTTrainingArguments") + training_arguments = ORTTrainingArguments(**training_arguments) + LOGGER.info("\t+ Wrapping model with optimum.onnxruntime.ORTTrainer") + trainer = ORTTrainer( + model=self.pretrained_model, + feature=self.task, + args=training_arguments, + data_collator=training_data_collator, + train_dataset=training_dataset, + callbacks=training_callbacks, + # TODO: add support for optimizers + optimizers=(None, None), + ) + LOGGER.info("\t+ Launching training") + trainer.train() + LOGGER.info("\t+ Training finished successfully") + trainer_state = trainer.state + + return trainer_state + + def clean(self) -> None: + super().clean() + if hasattr(self, "tmpdir"): + self.tmpdir.cleanup() diff --git a/optimum_benchmark/backends/onnxruntime/config.py b/optimum_benchmark/backends/onnxruntime/config.py new file mode 100644 index 000000000..decf5d482 --- /dev/null +++ b/optimum_benchmark/backends/onnxruntime/config.py @@ -0,0 +1,185 @@ +import importlib.metadata +import importlib.util +from dataclasses import dataclass, field +from typing import Any, Dict, Optional + +from omegaconf import OmegaConf + +from ..base import BackendConfig +from .utils import infer_device_id + + +def onnxruntime_version(): + try: + return "ort:" + importlib.metadata.version("onnxruntime") + except importlib.metadata.PackageNotFoundError: + try: + return "ort-gpu:" + importlib.metadata.version("onnxruntime-gpu") + except importlib.metadata.PackageNotFoundError: + return "ort:unknown" + + +OmegaConf.register_new_resolver( + "is_gpu", + lambda device: "cuda" in device.lower(), +) +OmegaConf.register_new_resolver( + "is_profiling", + lambda benchmark_name: benchmark_name == "profiling", +) +OmegaConf.register_new_resolver( + "infer_provider", + lambda device: "CPUExecutionProvider" if device == "cpu" else "CUDAExecutionProvider", +) +OmegaConf.register_new_resolver( + "infer_device_id", + lambda device: infer_device_id(device), +) +OmegaConf.register_new_resolver( + "onnxruntime_version", + lambda: onnxruntime_version(), +) + +OPTIMIZATION_CONFIG = { + "optimization_level": 1, # 0, 1, 2, 99 + "optimize_for_gpu": "${is_gpu:${device}}", + "fp16": False, + "enable_transformers_specific_optimizations": True, + "enable_gelu_approximation": False, + "disable_gelu_fusion": False, + "disable_layer_norm_fusion": False, + "disable_attention_fusion": False, + "disable_skip_layer_norm_fusion": True, + "disable_bias_skip_layer_norm_fusion": False, + "disable_bias_gelu_fusion": False, + "use_mask_index": False, + "no_attention_mask": False, + "disable_embed_layer_norm_fusion": True, + "disable_shape_inference": False, + "use_multi_head_attention": False, + "enable_gemm_fast_gelu_fusion": False, + "use_raw_attention_mask": False, + "disable_group_norm_fusion": True, + "disable_packed_kv": 
True, +} + +AUTO_OPTIMIZATION_CONFIG = { + "for_gpu": "${is_gpu:${device}}", + # full auto optimization config depends on the level so we keep it minimal +} + +QUANTIZATION_CONFIG = { + "is_static": False, + "format": "QOperator", # QOperator, QDQ + "mode": "IntegerOps", # QLinearOps, IntegerOps + "activations_dtype": "QUInt8", # QInt8, QUInt8 + "activations_symmetric": False, + "weights_dtype": "QInt8", # QInt8, QUInt8 + "weights_symmetric": True, + "per_channel": False, + "reduce_range": False, + "operators_to_quantize": [ + "MatMul", + "Add", + ], +} + +AUTO_QUANTIZATION_CONFIG = { + "is_static": False, + # full auto quantization config depends on the strategy so we keep it minimal +} + +CALIBRATION_CONFIG = { + "dataset_name": "glue", + "num_samples": 300, + "dataset_config_name": "sst2", + "dataset_split": "train", + "preprocess_batch": True, + "preprocess_class": "optimum_benchmark.preprocessors.glue.GluePreprocessor", +} +PROVIDER_OPTIONS = {"device_id": "${infer_device_id:${device}}"} +SESSION_OPTIONS = {"enable_profiling": "${is_profiling:${benchmark.name}}"} + + +@dataclass +class ORTConfig(BackendConfig): + name: str = "onnxruntime" + version: str = "${onnxruntime_version:}" + _target_: str = "optimum_benchmark.backends.onnxruntime.backend.ORTBackend" + + no_weights: bool = False + + # export options + export: bool = True + use_cache: bool = True + use_merged: bool = False + torch_dtype: Optional[str] = None + + # provider options + provider: str = "${infer_provider:${device}}" + device_id: Optional[int] = "${oc.deprecated:backend.provider_options.device_id}" + provider_options: Dict[str, Any] = field(default_factory=lambda: PROVIDER_OPTIONS) + + # inference options + use_io_binding: bool = "${is_gpu:${device}}" + enable_profiling: bool = "${oc.deprecated:backend.session_options.enable_profiling}" + session_options: Dict[str, Any] = field(default_factory=lambda: SESSION_OPTIONS) + + # optimization options + optimization: bool = False + optimization_config: Dict[str, Any] = field(default_factory=dict) + + # quantization options + quantization: bool = False + quantization_config: Dict[str, Any] = field(default_factory=dict) + + # calibration options + calibration: bool = False + calibration_config: Dict[str, Any] = field(default_factory=dict) + + # null, O1, O2, O3, O4 + auto_optimization: Optional[str] = None + auto_optimization_config: Dict[str, Any] = field(default_factory=dict) + + # null, arm64, avx2, avx512, avx512_vnni, tensorrt + auto_quantization: Optional[str] = None + auto_quantization_config: Dict[str, Any] = field(default_factory=dict) + + # ort-training is basically a different package so we might need to seperate these two backends in the future + use_inference_session: bool = "${is_inference:${benchmark.name}}" + + def __post_init__(self): + if not self.no_weights and not self.export and self.torch_dtype is not None: + raise NotImplementedError("Can't convert an exported model's weights to a different dtype.") + + if self.optimization: + self.optimization_config = OmegaConf.to_container( + OmegaConf.merge(OPTIMIZATION_CONFIG, self.optimization_config) + ) + if self.quantization: + self.quantization_config = OmegaConf.to_container( + OmegaConf.merge(QUANTIZATION_CONFIG, self.quantization_config) + ) + # raise ValueError if the quantization is static but calibration is not enabled + if self.quantization_config["is_static"] and not self.calibration: + raise ValueError( + "Quantization is static but calibration is not enabled. 
Please enable calibration or disable static quantization." + ) + + if self.auto_optimization is not None: + self.auto_optimization_config = OmegaConf.to_container( + OmegaConf.merge(AUTO_OPTIMIZATION_CONFIG, self.auto_optimization_config) + ) + if self.auto_quantization is not None: + self.auto_quantization_config = OmegaConf.to_container( + OmegaConf.merge(AUTO_QUANTIZATION_CONFIG, self.auto_quantization_config) + ) + if self.auto_quantization_config["is_static"] and not self.calibration: + raise ValueError( + "Quantization is static but calibration is not enabled. Please enable calibration or disable static quantization." + ) + + if self.calibration: + self.calibration_config = OmegaConf.to_container( + OmegaConf.merge(CALIBRATION_CONFIG, self.calibration_config) + ) diff --git a/optimum_benchmark/backends/onnxruntime/utils.py b/optimum_benchmark/backends/onnxruntime/utils.py new file mode 100644 index 000000000..be63fef8a --- /dev/null +++ b/optimum_benchmark/backends/onnxruntime/utils.py @@ -0,0 +1,40 @@ +from typing import Any, Dict + +from onnxruntime.quantization import QuantFormat, QuantizationMode, QuantType +from optimum.pipelines import ORT_SUPPORTED_TASKS + +TASKS_TO_ORTSD = { + "stable-diffusion": "optimum.onnxruntime.ORTStableDiffusionPipeline", + "stable-diffusion-xl": "optimum.onnxruntime.ORTStableDiffusionXLPipeline", +} + +TASKS_TO_ORTMODELS = {task: task_dict["class"][0] for task, task_dict in ORT_SUPPORTED_TASKS.items()} + + +def infer_device_id(device: str) -> int: + """Infer the device id from the given device string.""" + if device == "cuda": + # torch.cuda.current_device() will always return 0 + # unless torch.cuda.set_device() is called somewhere + return 0 + elif "cuda" in device: + return int(device.split(":")[1]) + elif device == "cpu": + return -1 + else: + raise ValueError(f"Unknown device: {device}") + + +def format_quantization_config(quantization_config: Dict[str, Any]) -> Dict[str, Any]: + """Format the quantization dictionary for onnxruntime.""" + # the conditionals are here because some quantization strategies don't have all the options + if quantization_config.get("format", None) is not None: + quantization_config["format"] = QuantFormat.from_string(quantization_config["format"]) + if quantization_config.get("mode", None) is not None: + quantization_config["mode"] = QuantizationMode.from_string(quantization_config["mode"]) + if quantization_config.get("activations_dtype", None) is not None: + quantization_config["activations_dtype"] = QuantType.from_string(quantization_config["activations_dtype"]) + if quantization_config.get("weights_dtype", None) is not None: + quantization_config["weights_dtype"] = QuantType.from_string(quantization_config["weights_dtype"]) + + return quantization_config diff --git a/optimum_benchmark/backends/openvino.py b/optimum_benchmark/backends/openvino.py deleted file mode 100644 index 6e83ed756..000000000 --- a/optimum_benchmark/backends/openvino.py +++ /dev/null @@ -1,190 +0,0 @@ -from typing import Dict, Optional, Any, TYPE_CHECKING -from tempfile import TemporaryDirectory -from dataclasses import dataclass -from logging import getLogger - - -import torch -import inspect -from torch import Tensor -from omegaconf import OmegaConf -from hydra.utils import get_class -from optimum.intel.openvino.utils import _HEAD_TO_AUTOMODELS -from openvino.runtime import __version__ as openvino_version -from optimum.intel import OVConfig as OVQuantizationConfig, OVQuantizer - -if TYPE_CHECKING: - from transformers.modeling_outputs import ModelOutput -
- -from .base import Backend, BackendConfig -from .utils.openvino_utils import ( - DEFAULT_QUANTIZATION_CONFIG, - DEFAULT_CALIBRATION_CONFIG, -) - - -LOGGER = getLogger("openvino") - - -@dataclass -class OVConfig(BackendConfig): - name: str = "openvino" - version: str = openvino_version - _target_: str = "optimum_benchmark.backends.openvino.OVBackend" - - # export options - export: bool = True - no_weights: bool = False - use_merged: Optional[bool] = None - torch_dtype: Optional[str] = None - - # compiling options - reshape: bool = False - half: bool = False - - # quantization options - quantization: bool = False - quantization_config: Optional[Dict[str, Any]] = None - - # calibration options - calibration: bool = True - calibration_config: Optional[Dict[str, Any]] = None - - def __post_init__(self): - assert self.torch_dtype is None or self.torch_dtype == "float32", ( - "Only float32 is supported for torch_dtype in openvino backend. " - f"Got {self.torch_dtype}" - ) - - if self.quantization: - self.quantization_config = OmegaConf.merge( - self.quantization_config or {}, - DEFAULT_QUANTIZATION_CONFIG, - ) - - if self.calibration: - self.calibration_config = OmegaConf.merge( - self.calibration_config or {}, - DEFAULT_CALIBRATION_CONFIG, - ) - - -class OVBackend(Backend): - name: str = "openvino" - config: OVConfig - - def __init__( - self, model: str, task: str, device: str, hub_kwargs: Dict[str, Any] - ) -> None: - super().__init__(model, task, device, hub_kwargs) - self.device = torch.device(device) - - self.ovmodel_class = get_class( - f"optimum.intel.openvino.{_HEAD_TO_AUTOMODELS[self.task]}" - ) - - LOGGER.info( - f"\t+ Infered OVModel class {self.ovmodel_class.__name__} " - f"for task {self.task} and model_type {self.model_type}" - ) - - def configure(self, config: OVConfig) -> None: - super().configure(config) - - # Set torch dtype - self.config.torch_dtype = ( - getattr(torch, self.config.torch_dtype) - if self.config.torch_dtype is not None - else None - ) - - if self.config.quantization: - self.config.quantization_config = OVQuantizationConfig( - **self.config.quantization_config, - ) - - with TemporaryDirectory() as tmpdirname: - if self.config.no_weights: - raise NotImplementedError( - "no_weights is not supported for openvino backend" - ) - else: - self.load_model_from_pretrained() - - if self.config.quantization: - self.quantize(tmpdirname) - - def load_model_from_pretrained(self) -> None: - self.pretrained_model = self.ovmodel_class.from_pretrained( - model_id=self.model, - use_merged=self.config.use_merged, - export=self.config.export, - **self.hub_kwargs, - ) - - def quantize(self, tmpdirname: str) -> None: - LOGGER.info("\t+ Attempting quantization") - - model = self.automodel_class.from_pretrained(self.model, **self.hub_kwargs) - quantizer = OVQuantizer.from_pretrained(model) - - preprocess_class = get_class(self.config.calibration_config.preprocess_class) - preprocess_function = preprocess_class(model_name_or_path=self.model) - - calibration_dataset = quantizer.get_calibration_dataset( - dataset_name=self.config.calibration_config.dataset_name, - num_samples=self.config.calibration_config.num_samples, - dataset_config_name=self.config.calibration_config.dataset_config_name, - dataset_split=self.config.calibration_config.dataset_split, - preprocess_function=preprocess_function, - ) - - quantizer.quantize( - calibration_dataset=calibration_dataset, - save_directory=f"{tmpdirname}/quantized", - quantization_config=self.config.quantization_config, - # defaults - 
batch_size=1, - data_collator=None, - remove_unused_columns=True, - weights_only=False, - ) - self.delete_pretrained_model() - - LOGGER.info("\t+ Loading quantized model") - self.pretrained_model = self.ovmodel_class.from_pretrained( - model_id=f"{tmpdirname}/quantized", - use_merged=self.config.use_merged, - ) - - def prepare_for_inference(self, input_shapes: Dict[str, int]) -> None: - if self.config.reshape: - static_shapes = { - key: value - for key, value in input_shapes.items() - if key in inspect.getfullargspec(self.pretrained_model.reshape).args - } - LOGGER.info(f"\t+ Reshaping model with static shapes: {static_shapes}") - self.pretrained_model.reshape(**static_shapes) - - if self.config.half: - LOGGER.info("\t+ Converting model to half precision") - self.pretrained_model.half() - - if self.config.reshape or self.config.half: - LOGGER.info("\t+ Compiling model") - self.pretrained_model.compile() - - def forward(self, input: Dict[str, Tensor], **kwargs) -> "ModelOutput": - output = self.pretrained_model(**input, **kwargs) - - return output - - def generate(self, input: Dict[str, Tensor], **kwargs) -> "ModelOutput": - output = self.pretrained_model.generate(**input, **kwargs) - - return output - - def train(self, **kwargs) -> None: - pass diff --git a/optimum_benchmark/backends/openvino/__init__.py b/optimum_benchmark/backends/openvino/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/optimum_benchmark/backends/openvino/backend.py b/optimum_benchmark/backends/openvino/backend.py new file mode 100644 index 000000000..28b354c54 --- /dev/null +++ b/optimum_benchmark/backends/openvino/backend.py @@ -0,0 +1,119 @@ +import inspect +from logging import getLogger +from tempfile import TemporaryDirectory +from typing import Any, Dict + +from hydra.utils import get_class +from optimum.intel.openvino import OVConfig as OVQuantizationConfig # naming conflict +from optimum.intel.openvino import OVQuantizer + +from ..base import Backend +from .config import OVConfig +from .utils import TASKS_TO_OVMODEL + +LOGGER = getLogger("openvino") + + +class OVBackend(Backend[OVConfig]): + NAME: str = "openvino" + + def __init__(self, model: str, task: str, device: str, hub_kwargs: Dict[str, Any]) -> None: + super().__init__(model, task, device, hub_kwargs) + self.validate_device() + self.validate_task() + + self.ovmodel_class = get_class(TASKS_TO_OVMODEL[self.task]) + ortmodel_name = self.ovmodel_class.__name__ + LOGGER.info(f"\t+ Infered OVModel class {ortmodel_name} for task {self.task} and model_type {self.model_type}") + + def validate_task(self) -> None: + if self.task not in TASKS_TO_OVMODEL: + raise NotImplementedError(f"OVBackend does not support task {self.task}") + + def validate_device(self) -> None: + if self.device.type != "cpu": + raise ValueError(f"OVBackend only supports CPU devices, got {self.device.type}") + + def configure(self, config: OVConfig) -> None: + super().configure(config) + + self.tmpdir = TemporaryDirectory() + + if self.config.quantization: + self.load_automodel() + self.quantize_automodel() + self.delete_pretrained_model() # deletes automodel + self.export = False # quantized model is already exported + else: + self.export = self.config.export # to not change the config's values + + self.load_ovmodel() + self.tmpdir.cleanup() + + def load_automodel(self) -> None: + self.pretrained_model = self.automodel_class.from_pretrained(self.model, **self.hub_kwargs) + + @property + def ovmodel_kwargs(self) -> Dict[str, Any]: + if self.is_text_generation_model(): + 
return {"use_cache": self.config.use_cache, "use_merged": self.config.use_merged} + else: + return {} + + def load_ovmodel(self) -> None: + self.pretrained_model = self.ovmodel_class.from_pretrained( + self.model, + export=self.export, + **self.ovmodel_kwargs, + **self.hub_kwargs, + ) + + def quantize_automodel(self) -> None: + LOGGER.info("\t+ Attempting quantization") + quantized_model_path = f"{self.tmpdir.name}/quantized" + LOGGER.info("\t+ Processing quantization config") + quantization_config = OVQuantizationConfig(**self.config.quantization_config) + LOGGER.info("\t+ Creating quantizer") + quantizer = OVQuantizer.from_pretrained(self.pretrained_model, task=self.task, seed=self.config.seed) + LOGGER.info("\t+ Processing calibration config") + calibration_config = self.config.calibration_config.copy() + preprocess_class = get_class(calibration_config.pop("preprocess_class")) + calibration_config["preprocess_function"] = preprocess_class(model_name_or_path=self.model) + LOGGER.info("\t+ Loading calibration dataset") + calibration_dataset = quantizer.get_calibration_dataset(**calibration_config) + LOGGER.info("\t+ Quantizing model") + quantizer.quantize( + quantization_config=quantization_config, + save_directory=quantized_model_path, + calibration_dataset=calibration_dataset, + # TODO: add support for these + remove_unused_columns=True, + data_collator=None, + weights_only=False, + file_name=None, + batch_size=1, + ) + self.model = quantized_model_path + + def prepare_for_inference(self, input_shapes: Dict[str, int]) -> None: + if self.config.reshape: + static_shapes = { + key: value + for key, value in input_shapes.items() + if key in inspect.getfullargspec(self.pretrained_model.reshape).args + } + LOGGER.info(f"\t+ Reshaping model with static shapes: {static_shapes}") + self.pretrained_model.reshape(**static_shapes) + + if self.config.half: + LOGGER.info("\t+ Converting model to half precision") + self.pretrained_model.half() + + if self.config.reshape or self.config.half: + LOGGER.info("\t+ Compiling model") + self.pretrained_model.compile() + + def clean(self) -> None: + super().clean() + if hasattr(self, "tmpdir"): + self.tmpdir.cleanup() diff --git a/optimum_benchmark/backends/openvino/config.py b/optimum_benchmark/backends/openvino/config.py new file mode 100644 index 000000000..e54c2aefd --- /dev/null +++ b/optimum_benchmark/backends/openvino/config.py @@ -0,0 +1,64 @@ +import importlib.metadata +from dataclasses import dataclass, field +from typing import Any, Dict + +from omegaconf import OmegaConf + +from ..base import BackendConfig + +OmegaConf.register_new_resolver( + "openvino_version", + lambda: importlib.metadata.version("openvino"), +) + +# https://github.com/huggingface/optimum-intel/blob/main/optimum/intel/openvino/configuration.py#L81 +QUANTIZATION_CONFIG = { + "compression": None, + "input_info": None, + "save_onnx_model": False, +} + +CALIBRATION_CONFIG = { + "dataset_name": "glue", + "num_samples": 300, + "dataset_config_name": "sst2", + "dataset_split": "train", + "preprocess_batch": True, + "preprocess_class": "optimum_benchmark.preprocessors.glue.GluePreprocessor", +} + + +@dataclass +class OVConfig(BackendConfig): + name: str = "openvino" + version: str = "${openvino_version:}" + _target_: str = "optimum_benchmark.backends.openvino.backend.OVBackend" + + # export options + export: bool = True + use_cache: bool = True + use_merged: bool = False + + # compiling options + reshape: bool = False + half: bool = False + + # quantization options + quantization: bool 
= False + quantization_config: Dict[str, Any] = field(default_factory=dict) + + # calibration options + calibration: bool = False + calibration_config: Dict[str, Any] = field(default_factory=dict) + + def __post_init__(self): + if self.quantization: + self.quantization_config = OmegaConf.to_container( + OmegaConf.merge(QUANTIZATION_CONFIG, self.quantization_config) + ) + if not self.calibration: + raise ValueError("OpenVINO quantization requires enabling calibration.") + else: + self.calibration_config = OmegaConf.to_container( + OmegaConf.merge(CALIBRATION_CONFIG, self.calibration_config) + ) diff --git a/optimum_benchmark/backends/openvino/utils.py b/optimum_benchmark/backends/openvino/utils.py new file mode 100644 index 000000000..4c13891e5 --- /dev/null +++ b/optimum_benchmark/backends/openvino/utils.py @@ -0,0 +1,3 @@ +from optimum.intel.openvino.utils import _HEAD_TO_AUTOMODELS + +TASKS_TO_OVMODEL = {task: f"optimum.intel.openvino.{ovmodel}" for task, ovmodel in _HEAD_TO_AUTOMODELS.items()} diff --git a/optimum_benchmark/backends/utils/optimum_utils.py b/optimum_benchmark/backends/optimum_utils.py similarity index 67% rename from optimum_benchmark/backends/utils/optimum_utils.py rename to optimum_benchmark/backends/optimum_utils.py index a558f1659..a064cba08 100644 --- a/optimum_benchmark/backends/utils/optimum_utils.py +++ b/optimum_benchmark/backends/optimum_utils.py @@ -1,31 +1,29 @@ -from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Union -from pathlib import Path import os +from pathlib import Path +from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Union import torch from optimum.exporters.onnx.__main__ import ( - logger, - TasksManager, - OnnxConfigWithPast, - _get_submodels_and_onnx_configs, - maybe_save_preprocessors, - validate_models_outputs, - is_torch_available, - export_models, - AutoTokenizer, DEFAULT_DUMMY_SHAPES, ONNX_WEIGHTS_NAME, - UNPICKABLE_ARCHS, + # UNPICKABLE_ARCHS, + # AtolError, + AutoTokenizer, + OnnxConfigWithPast, + # OutputMatchError, RequestsConnectionError, - OutputMatchError, - ShapeError, - AtolError, + # ShapeError, + TasksManager, + _get_submodels_and_onnx_configs, + export_models, + is_torch_available, + logger, + maybe_save_preprocessors, ) - if TYPE_CHECKING: - from transformers import PreTrainedModel from optimum.exporters.onnx import OnnxConfig + from transformers import PreTrainedModel # rewrite of the main_export function from optimum.exporters.onnx.__main__ @@ -39,7 +37,7 @@ def main_export( fp16: Optional[bool] = False, optimize: Optional[str] = None, monolith: bool = False, - no_post_process: bool = False, + # no_post_process: bool = False, framework: Optional[str] = None, atol: Optional[float] = None, cache_dir: Optional[str] = None, @@ -51,11 +49,11 @@ def main_export( local_files_only: bool = False, use_auth_token: Optional[Union[bool, str]] = None, for_ort: bool = False, - do_validation: bool = True, + # do_validation: bool = True, model_kwargs: Optional[Dict[str, Any]] = None, custom_onnx_configs: Optional[Dict[str, "OnnxConfig"]] = None, fn_get_submodels: Optional[Callable] = None, - use_subprocess: bool = False, + # use_subprocess: bool = False, ######################################## model: Optional["PreTrainedModel"] = None, ######################################## @@ -88,17 +86,13 @@ def main_export( original_task = task task = TasksManager.map_from_synonym(task) - framework = TasksManager.determine_framework( - model_name_or_path, subfolder=subfolder, framework=framework - ) + framework = 
TasksManager.determine_framework(model_name_or_path, subfolder=subfolder, framework=framework) # get the shapes to be used to generate dummy inputs input_shapes = {} for input_name in DEFAULT_DUMMY_SHAPES.keys(): input_shapes[input_name] = ( - kwargs_shapes[input_name] - if input_name in kwargs_shapes - else DEFAULT_DUMMY_SHAPES[input_name] + kwargs_shapes[input_name] if input_name in kwargs_shapes else DEFAULT_DUMMY_SHAPES[input_name] ) torch_dtype = None if fp16 is False else torch.float16 @@ -133,11 +127,7 @@ def main_export( custom_architecture = False is_stable_diffusion = "stable-diffusion" in task - model_type = ( - "stable-diffusion" - if is_stable_diffusion - else model.config.model_type.replace("_", "-") - ) + model_type = "stable-diffusion" if is_stable_diffusion else model.config.model_type.replace("_", "-") if not is_stable_diffusion: if model_type in TasksManager._UNSUPPORTED_CLI_MODEL_TYPE: @@ -145,9 +135,9 @@ def main_export( f"{model_type} is not supported yet. Only {TasksManager._SUPPORTED_CLI_MODEL_TYPE} are supported. " f"If you want to support {model_type} please propose a PR or open up an issue." ) - if model.config.model_type.replace( - "-", "_" - ) not in TasksManager.get_supported_model_type_for_task(task, exporter="onnx"): + if model.config.model_type.replace("-", "_") not in TasksManager.get_supported_model_type_for_task( + task, exporter="onnx" + ): custom_architecture = True # TODO: support onnx_config.py in the model repo @@ -164,12 +154,9 @@ def main_export( if ( not custom_architecture and not is_stable_diffusion - and task + "-with-past" - in TasksManager.get_supported_tasks_for_model_type(model_type, "onnx") + and task + "-with-past" in TasksManager.get_supported_tasks_for_model_type(model_type, "onnx") ): - if ( - original_task == "auto" - ): # Make -with-past the default if --task was not explicitely specified + if original_task == "auto": # Make -with-past the default if --task was not explicitely specified task = task + "-with-past" else: logger.info( @@ -197,9 +184,7 @@ def main_export( model=model, task=task, monolith=monolith, - custom_onnx_configs=custom_onnx_configs - if custom_onnx_configs is not None - else {}, + custom_onnx_configs=custom_onnx_configs if custom_onnx_configs is not None else {}, custom_architecture=custom_architecture, fn_get_submodels=fn_get_submodels, ) @@ -257,15 +242,10 @@ def main_export( subcomponent = models_and_onnx_configs[model_name][0] if hasattr(subcomponent, "save_config"): subcomponent.save_config(output / model_name) - elif hasattr(subcomponent, "config") and hasattr( - subcomponent.config, "save_pretrained" - ): + elif hasattr(subcomponent, "config") and hasattr(subcomponent.config, "save_pretrained"): subcomponent.config.save_pretrained(output / model_name) - onnx_files_subpaths = [ - os.path.join(name_dir, ONNX_WEIGHTS_NAME) - for name_dir in models_and_onnx_configs - ] + onnx_files_subpaths = [os.path.join(name_dir, ONNX_WEIGHTS_NAME) for name_dir in models_and_onnx_configs] # Saving the additional components needed to perform inference. 
model.scheduler.save_pretrained(output.joinpath("scheduler")) @@ -294,77 +274,83 @@ def main_export( dtype="fp16" if fp16 is True else None, model_kwargs=model_kwargs, ) - - if optimize is not None: - from optimum.onnxruntime.configuration import AutoOptimizationConfig - from optimum.onnxruntime import ORTOptimizer - - if onnx_files_subpaths is None: - onnx_files_subpaths = [ - key + ".onnx" for key in models_and_onnx_configs.keys() - ] - optimizer = ORTOptimizer.from_pretrained(output, file_names=onnx_files_subpaths) - - optimization_config = AutoOptimizationConfig.with_optimization_level( - optimization_level=optimize - ) - - optimization_config.disable_shape_inference = True - optimizer.optimize( - save_dir=output, optimization_config=optimization_config, file_suffix="" - ) - - # Optionally post process the obtained ONNX file(s), for example to merge the decoder / decoder with past if any - # TODO: treating stable diffusion separately is quite ugly - if not no_post_process and not is_stable_diffusion: - try: - logger.info("Post-processing the exported models...") - ( - models_and_onnx_configs, - onnx_files_subpaths, - ) = onnx_config.post_process_exported_models( - output, models_and_onnx_configs, onnx_files_subpaths - ) - except Exception as e: - raise Exception( - f"The post-processing of the ONNX export failed. The export can still be performed by passing the option --no-post-process. Detailed error: {e}" - ) - - if is_stable_diffusion: - use_subprocess = False # TODO: fix Can't pickle local object 'get_stable_diffusion_models_for_export..' - elif model.config.model_type in UNPICKABLE_ARCHS: - # Pickling is bugged for nn.utils.weight_norm: https://github.com/pytorch/pytorch/issues/102983 - # TODO: fix "Cowardly refusing to serialize non-leaf tensor" error for wav2vec2-conformer - use_subprocess = False - - if do_validation is True: - try: - validate_models_outputs( - models_and_onnx_configs=models_and_onnx_configs, - onnx_named_outputs=onnx_outputs, - atol=atol, - output_dir=output, - onnx_files_subpaths=onnx_files_subpaths, - input_shapes=input_shapes, - device=device, - dtype=torch_dtype, - use_subprocess=use_subprocess, - model_kwargs=model_kwargs, - ) - logger.info( - f"The ONNX export succeeded and the exported model was saved at: {output.as_posix()}" - ) - except ShapeError as e: - raise e - except AtolError as e: - logger.warning( - f"The ONNX export succeeded with the warning: {e}.\n The exported model was saved at: {output.as_posix()}" - ) - except OutputMatchError as e: - logger.warning( - f"The ONNX export succeeded with the warning: {e}.\n The exported model was saved at: {output.as_posix()}" - ) - except Exception as e: - raise Exception( - f"An error occured during validation, but the model was saved nonetheless at {output.as_posix()}. Detailed error: {e}." 
- ) + # for the post processing later we don't wanna keep models + if len(models_and_onnx_configs) == 2: + models_and_onnx_configs = { + "decoder_model": ("dummy_decoder_model_object", models_and_onnx_configs["decoder_model"][1]), + "decoder_with_past_model": ( + "dummy_decoder_with_past_model_object", + models_and_onnx_configs["decoder_with_past_model"][1], + ), + } + else: + models_and_onnx_configs = { + "model": ("dummy_model", models_and_onnx_configs["model"][1]), + } + + return onnx_config, models_and_onnx_configs + + # if optimize is not None: + # from optimum.onnxruntime import ORTOptimizer + # from optimum.onnxruntime.configuration import AutoOptimizationConfig + + # if onnx_files_subpaths is None: + # onnx_files_subpaths = [key + ".onnx" for key in models_and_onnx_configs.keys()] + # optimizer = ORTOptimizer.from_pretrained(output, file_names=onnx_files_subpaths) + + # optimization_config = AutoOptimizationConfig.with_optimization_level(optimization_level=optimize) + + # optimization_config.disable_shape_inference = True + # optimizer.optimize(save_dir=output, optimization_config=optimization_config, file_suffix="") + + # # Optionally post process the obtained ONNX file(s), for example to merge the decoder / decoder with past if any + # # TODO: treating stable diffusion separately is quite ugly + # if not no_post_process and not is_stable_diffusion: + # try: + # logger.info("Post-processing the exported models...") + # (models_and_onnx_configs, onnx_files_subpaths) = onnx_config.post_process_exported_models( + # output, models_and_onnx_configs, onnx_files_subpaths + # ) + # except Exception as e: + # raise Exception( + # f"The post-processing of the ONNX export failed. The export can still be performed by passing the option --no-post-process. Detailed error: {e}" + # ) + + # if is_stable_diffusion: + # use_subprocess = ( + # False # TODO: fix Can't pickle local object 'get_stable_diffusion_models_for_export..' + # ) + # elif model.config.model_type in UNPICKABLE_ARCHS: + # # Pickling is bugged for nn.utils.weight_norm: https://github.com/pytorch/pytorch/issues/102983 + # # TODO: fix "Cowardly refusing to serialize non-leaf tensor" error for wav2vec2-conformer + # use_subprocess = False + + # if do_validation is True: + # try: + # validate_models_outputs( + # models_and_onnx_configs=models_and_onnx_configs, + # onnx_named_outputs=onnx_outputs, + # atol=atol, + # output_dir=output, + # onnx_files_subpaths=onnx_files_subpaths, + # input_shapes=input_shapes, + # device=device, + # dtype=torch_dtype, + # use_subprocess=use_subprocess, + # model_kwargs=model_kwargs, + # ) + # logger.info(f"The ONNX export succeeded and the exported model was saved at: {output.as_posix()}") + # except ShapeError as e: + # raise e + # except AtolError as e: + # logger.warning( + # f"The ONNX export succeeded with the warning: {e}.\n The exported model was saved at: {output.as_posix()}" + # ) + # except OutputMatchError as e: + # logger.warning( + # f"The ONNX export succeeded with the warning: {e}.\n The exported model was saved at: {output.as_posix()}" + # ) + # except Exception as e: + # raise Exception( + # f"An error occured during validation, but the model was saved nonetheless at {output.as_posix()}. Detailed error: {e}." 
+ # ) diff --git a/optimum_benchmark/backends/pytorch.py b/optimum_benchmark/backends/pytorch.py deleted file mode 100644 index b6c84f181..000000000 --- a/optimum_benchmark/backends/pytorch.py +++ /dev/null @@ -1,451 +0,0 @@ -from typing import Any, Callable, Dict, List, Optional, TYPE_CHECKING -from dataclasses import dataclass -from logging import getLogger -import os -import gc - - -import torch -from torch import Tensor -from accelerate import init_empty_weights -from omegaconf import DictConfig, OmegaConf -from torch import __version__ as torch_version -from transformers.utils.fx import symbolic_trace -from transformers import Trainer, TrainingArguments -from optimum.bettertransformer import BetterTransformer -from transformers import BitsAndBytesConfig, GPTQConfig -from torch.distributed.elastic.multiprocessing.errors import record -from torch.distributed.launcher.api import elastic_launch, LaunchConfig - - -if TYPE_CHECKING: - from datasets import Dataset - from transformers.utils import ModelOutput - from transformers import TrainerState, TrainerCallback - - -from .base import Backend, BackendConfig -from ..profilers.fx_profiler import FXProfilingWrapper -from .utils.pytorch_utils import ( - DEFAULT_COMPILE_CONFIG, - DEFAULT_DDP_CONFIG, - randomize_weights, - get_worker_logger, -) - - -# bachend logger -LOGGER = getLogger("pytorch") - -# backend resolvers -OmegaConf.register_new_resolver( - "is_inference", lambda benchmark_name: benchmark_name == "inference" -) - - -@dataclass -class PyTorchConfig(BackendConfig): - name: str = "pytorch" - version: str = torch_version - _target_: str = "optimum_benchmark.backends.pytorch.PyTorchBackend" - - # load options - no_weights: bool = False - device_map: Optional[str] = None - torch_dtype: Optional[str] = None - - # quantization options - quantization_strategy: Optional[str] = None - quantization_config: Optional[Dict[str, Any]] = None - - # optimization options - bettertransformer: bool = False - - # compilation options - torch_compile: bool = False - torch_compile_kwargs: Optional[Dict] = None - - # amp options - amp_autocast: bool = False - amp_dtype: Optional[str] = None - - # inference options - disable_grad: bool = "${is_inference:${benchmark.name}}" # type: ignore - eval_mode: bool = "${is_inference:${benchmark.name}}" # type: ignore - - # training options - use_ddp: bool = False - ddp_config: Optional[Dict[str, Any]] = None - - def __post_init__(self): - """ - Here we perform checks and transformations on the config. - But we never modify the types of the config values. - """ - - CUDA_VISIBLE_DEVICES = os.environ.get("CUDA_VISIBLE_DEVICES", None) - - if self.torch_compile: - self.torch_compile_kwargs = OmegaConf.merge( - self.torch_compile_kwargs - if self.torch_compile_kwargs is not None - else {}, - DEFAULT_COMPILE_CONFIG, - ) - - if self.device_map is not None: - assert self.device_map in ["auto", "sequential"], ( - "`device_map` must be one of ['auto', 'sequential']. " - "are supported in Optimum-Bnechmark. " - f"Got {type(self.device_map)} instead." - ) - assert ( - CUDA_VISIBLE_DEVICES is not None - ), "`device_map` can only be used when CUDA_VISIBLE_DEVICES is set." - - if self.torch_dtype is not None: - assert self.torch_dtype in ["bfloat16", "float16", "float32", "auto"], ( - "`torch_dtype` must be one of ['bfloat16', 'float16', 'float32', " - f"'auto']. Got {self.torch_dtype} instead." 
- ) - - if self.amp_dtype is not None: - assert self.amp_dtype in ["bfloat16", "float16", "float32"], ( - "`amp_dtype` must be one of ['bfloat16', 'float16', 'float32']. " - f"Got {self.amp_dtype} instead." - ) - - if self.quantization_strategy is not None: - assert self.quantization_strategy in ["bnb", "gptq"], ( - "`quantization_strategy` must be one of ['bnb', 'gptq']. " - f"Got {self.quantization_strategy} instead." - ) - if self.quantization_strategy == "gptq": - bits = self.quantization_config.get("bits", None) - assert bits is not None, ( - "`quantization_config.bits` must be provided " - "when using 'gptq' quantization strategy." - ) - else: - self.quantization_config = None - - if self.use_ddp: - self.ddp_config = OmegaConf.merge( - self.ddp_config if self.ddp_config is not None else {}, - DEFAULT_DDP_CONFIG, - ) - - # TODO: support multi-node training. - assert self.ddp_config.max_nodes == 1, ( - "Currently, PyTorch DDP training benchmark " - "only supports training on a single node." - ) - - assert ( - CUDA_VISIBLE_DEVICES is not None - ), "Pytorch DDP training benchmark requires CUDA_VISIBLE_DEVICES to be set." - else: - self.ddp_config = None - - -class PyTorchBackend(Backend): - name: str = "pytorch" - config: PyTorchConfig - - def __init__(self, model: str, task: str, device: str, hub_kwargs: DictConfig): - super().__init__(model, task, device, hub_kwargs) - self.device = torch.device(device) - - LOGGER.info( - f"\t+ Infered AutoModel class {self.automodel_class.__name__} " - f"for task {self.task} and model_type {self.model_type}" - ) - - def configure(self, config: PyTorchConfig) -> None: - super().configure(config) - - # environment options - if self.config.inter_op_num_threads is not None: - LOGGER.info( - "\t+ Setting pytorch " - f"inter_op_num_threads({self.config.inter_op_num_threads}))" - ) - torch.set_num_threads(self.config.inter_op_num_threads) - if self.config.intra_op_num_threads is not None: - LOGGER.info( - "\t+ Setting pytorch " - f"intra_op_num_threads({self.config.intra_op_num_threads}))" - ) - torch.set_num_interop_threads(self.config.intra_op_num_threads) - - # Load config - if self.config.torch_dtype is not None: - if hasattr(torch, self.config.torch_dtype): - self.config.torch_dtype = getattr(torch, self.config.torch_dtype) - - # Inference config - if self.config.disable_grad: - LOGGER.info("\t+ Disabling gradients") - # everything that comes after this will have its gradients disabled - torch.set_grad_enabled(False) - if self.config.amp_dtype is not None: - if hasattr(torch, self.config.amp_dtype): - self.config.amp_dtype = getattr(torch, self.config.amp_dtype) - - # Quantization config - if self.config.quantization_strategy is not None: - if self.config.quantization_strategy == "gptq": - self.config.quantization_config = GPTQConfig( - **self.config.quantization_config - ) - elif self.config.quantization_strategy == "bnb": - self.config.quantization_config = BitsAndBytesConfig( - **self.config.quantization_config - ) - - # Load model - if self.config.no_weights: - self.load_model_from_config() - else: - self.load_model_from_pretrained() - - # Turn on eval mode - if not self.is_diffusion_pipeline() and self.config.eval_mode: - LOGGER.info("\t+ Turning on eval mode") - self.pretrained_model.eval() - - # Turn on BetterTransformer optimizations - if self.config.bettertransformer: - LOGGER.info("\t+ Using optimum.bettertransformer") - self.pretrained_model = BetterTransformer.transform( - self.pretrained_model, - keep_original_model=False, - ) - - # Compile 
model - if self.config.torch_compile: - if self.is_diffusion_pipeline(): - LOGGER.info() - self.pretrained_model.unet = torch.compile( - self.pretrained_model.unet, - **self.config.torch_compile_kwargs, - ) - else: - LOGGER.info("\t+ Using torch.compile on forward pass") - self.pretrained_model.forward = torch.compile( - self.pretrained_model.forward, - **self.config.torch_compile_kwargs, - ) - - # DDP config - if self.config.use_ddp: - self.config.ddp_config = LaunchConfig(**self.config.ddp_config) - - def load_model_from_pretrained(self) -> None: - LOGGER.info(f"\t+ Loading pretrained model weights on device: {self.device}") - if self.is_diffusion_pipeline(): - self.pretrained_model = self.automodel_class.from_pretrained( - pretrained_model_name_or_path=self.model, - torch_dtype=self.config.torch_dtype, - device_map=self.config.device_map, - **self.hub_kwargs, - ) - if self.config.device_map is None: - # Diffusers does not support device_map being a torch.device, - # thus if not provided we move to device here. - self.pretrained_model.to(self.device) - else: - if self.config.device_map is not None: - self.pretrained_model = self.automodel_class.from_pretrained( - pretrained_model_name_or_path=self.model, - quantization_config=self.config.quantization_config, - torch_dtype=self.config.torch_dtype, - device_map=self.config.device_map, - **self.hub_kwargs, - ) - else: - with self.device: - self.pretrained_model = self.automodel_class.from_pretrained( - pretrained_model_name_or_path=self.model, - quantization_config=self.config.quantization_config, - torch_dtype=self.config.torch_dtype, - **self.hub_kwargs, - ) - - def load_model_from_config(self) -> None: - # TODO: create no_weights tests - - LOGGER.info("\t+ Initializing empty weights model on device: meta") - with init_empty_weights(): - self.pretrained_model = self.automodel_class.from_config( - config=self.pretrained_config, - torch_dtype=self.config.torch_dtype, - trust_remote_code=self.hub_kwargs.get("trust_remote_code", False), - ) - - if self.config.quantization_strategy is None: - LOGGER.info(f"\t+ Materializing model on device: {self.device}") - self.pretrained_model.to_empty(device=self.device) - - LOGGER.info("\t+ Randomizing model weights") - randomize_weights(self.pretrained_model) - self.pretrained_model.tie_weights() - else: - LOGGER.info("\t+ Materializing model on device: cpu") - self.pretrained_model.to_empty(device="cpu") - - LOGGER.info("\t+ Randomizing model weights while on device: cpu") - randomize_weights(self.pretrained_model) - self.pretrained_model.tie_weights() - - if self.config.quantization_strategy == "bnb": - quantization_config = BitsAndBytesConfig(**self.quantization_config) - elif self.config.quantization_strategy == "gptq": - raise NotImplementedError( - "GPTQ requires a pretrained model to be loaded. " - "`no_weights` option is not supported with GPTQ." 
- ) - - from accelerate.utils import BnbQuantizationConfig, load_and_quantize_model - - # translating transformers bnb config to accelerate bnb config - bnb_quantization_config = BnbQuantizationConfig( - load_in_4bit=quantization_config.load_in_4bit, - load_in_8bit=quantization_config.load_in_8bit, - # with dummy_weights, we set this to 0 for reproducibility - llm_int8_threshold=0, - torch_dtype=self.config.torch_dtype, - keep_in_fp32_modules=self.pretrained_model.keep_in_fp32_modules - if hasattr(self.pretrained_model, "keep_in_fp32_modules") - else None, - ) - - LOGGER.info("\t+ Quantizing model while on cpu and dispatching to device") - self.pretrained_model = load_and_quantize_model( - model=self.pretrained_model, - bnb_quantization_config=bnb_quantization_config, - device_map=self.config.device_map - if self.config.device_map is not None - else self.device, - ) - - def prepare_for_profiling(self, input_names: List[str]) -> None: - LOGGER.info("Preparing model for profiling") - LOGGER.info("\t+ Symbolicly tracing model") - self.pretrained_model = symbolic_trace( - model=self.pretrained_model, - input_names=input_names, - ) - - LOGGER.info("\t+ Wrapping model with FXProfilingWrapper") - self.pretrained_model = FXProfilingWrapper(self.pretrained_model) - - def forward(self, input: Dict[str, Tensor], **kwargs) -> "ModelOutput": - with torch.autocast( - enabled=self.config.amp_autocast, - device_type=self.device.type, - dtype=self.config.amp_dtype, - ): - output = self.pretrained_model(**input, **kwargs) - - return output - - def generate(self, input: Dict[str, Tensor], **kwargs) -> "ModelOutput": - with torch.autocast( - enabled=self.config.amp_autocast, - device_type=self.device.type, - dtype=self.config.amp_dtype, - ): - output = self.pretrained_model.generate(**input, **kwargs) - - return output - - @record - def train( - self, - training_dataset: "Dataset", - training_arguments: Dict[str, Any], - training_callbacks: List["TrainerCallback"], - training_data_collator: Callable, - ) -> "TrainerState": - args = ( - self.config.use_ddp, - self.pretrained_model, - training_dataset, - training_arguments, - training_callbacks, - training_data_collator, - ) - - if self.config.use_ddp: - # For DDP, we log only the stats from the first rank as transformers does. - # It could make sense to log for all ranks. - results = elastic_launch( - config=self.config.ddp_config, - entrypoint=training_worker, - )(args)[0] - else: - # For DP, we can still use training_worker, - # simply not wrapped by the elastic_launch class. 
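The removed backend above wraps both forward and generate in torch.autocast, driven by the amp_autocast and amp_dtype options. A standalone sketch of that wrapping with a toy CPU module (the model, input, and dtype here are placeholders, not the benchmark's):

import torch

model = torch.nn.Linear(8, 8)
inputs = {"input": torch.randn(2, 8)}

# mirrors the device_type/dtype/enabled arguments the backend passes to torch.autocast
with torch.autocast(device_type="cpu", dtype=torch.bfloat16, enabled=True):
    output = model(inputs["input"])

print(output.dtype)  # typically bfloat16 inside the autocast region on CPU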
- results = training_worker(args) - - return results - - def clean(self) -> None: - super().clean() - - if self.device.type == "cuda": - torch.cuda.empty_cache() - gc.collect() - - -def training_worker(args) -> "TrainerState": - use_ddp = args[0] - pretrained_model = args[1] - training_dataset = args[2] - training_arguments = args[3] - training_callbacks = args[4] - training_data_collator = args[5] - - if use_ddp: - LOGGER_WORKER = get_worker_logger("pytorch-ddp-worker", log_all=False) - - env_variables = [ - "RANK", - "WORLD_SIZE", - "MASTER_ADDR", - "MASTER_PORT", - "TORCHELASTIC_MAX_RESTARTS", - ] - - LOGGER_WORKER.info("Initializing DDP worker") - for env_var in env_variables: - LOGGER_WORKER.info(f"{env_var}: {os.environ.get(env_var)}") - else: - LOGGER_WORKER = LOGGER - - LOGGER_WORKER.info("\t+ Setting dataset format to `torch`.") - training_dataset.set_format( - type="torch", columns=list(training_dataset.features.keys()) - ) - - LOGGER_WORKER.info( - "\t+ Wrapping training arguments with transformers.TrainingArguments" - ) - training_arguments = TrainingArguments(**training_arguments) - - LOGGER_WORKER.info("\t+ Wrapping model with transformers.Trainer") - trainer = Trainer( - model=pretrained_model, - args=training_arguments, - callbacks=training_callbacks, - train_dataset=training_dataset, - data_collator=training_data_collator, - ) - - LOGGER_WORKER.info("\t+ Starting training") - trainer.train() - LOGGER_WORKER.info("\t+ Training finished successfully") - trainer_state = trainer.state - - return trainer_state diff --git a/optimum_benchmark/backends/pytorch/__init__.py b/optimum_benchmark/backends/pytorch/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/optimum_benchmark/backends/pytorch/backned.py b/optimum_benchmark/backends/pytorch/backned.py new file mode 100644 index 000000000..a83482da0 --- /dev/null +++ b/optimum_benchmark/backends/pytorch/backned.py @@ -0,0 +1,265 @@ +import gc +import os +from logging import getLogger +from typing import TYPE_CHECKING, Any, Callable, Dict, List + +import torch +from accelerate import init_empty_weights +from accelerate.utils import BnbQuantizationConfig, load_and_quantize_model +from optimum.bettertransformer import BetterTransformer +from torch.distributed.elastic.multiprocessing.errors import record +from torch.distributed.launcher.api import LaunchConfig, elastic_launch +from transformers import BitsAndBytesConfig, GPTQConfig, Trainer, TrainingArguments +from transformers.utils.fx import symbolic_trace + +if TYPE_CHECKING: + from datasets import Dataset + from transformers import TrainerCallback, TrainerState + from transformers.utils import ModelOutput + +from ...profilers.fx_profiler import FXProfilingWrapper +from ..base import Backend +from .config import PyTorchConfig +from .utils import get_worker_logger, randomize_weights + +# bachend logger +LOGGER = getLogger("pytorch") + + +class PyTorchBackend(Backend[PyTorchConfig]): + NAME: str = "pytorch" + + def __init__(self, model: str, task: str, device: str, hub_kwargs: Dict[str, Any]): + super().__init__(model, task, device, hub_kwargs) + + automodel = self.automodel_class.__name__ + LOGGER.info(f"\t+ Infered AutoModel class {automodel} for task {self.task} and model_type {self.model_type}") + + def configure(self, config: PyTorchConfig) -> None: + super().configure(config) + + # Gradients options + if self.config.disable_grad: + LOGGER.info("\t+ Disabling gradients") + torch.set_grad_enabled(False) + + # Threading options + if 
self.config.inter_op_num_threads is not None: + LOGGER.info(f"\t+ Setting pytorch inter_op_num_threads({self.config.inter_op_num_threads}))") + torch.set_num_threads(self.config.inter_op_num_threads) + if self.config.intra_op_num_threads is not None: + LOGGER.info(f"\t+ Setting pytorch intra_op_num_threads({self.config.intra_op_num_threads}))") + torch.set_num_interop_threads(self.config.intra_op_num_threads) + + # Dtypes options + self.torch_dtype = getattr(torch, self.config.torch_dtype) if self.config.torch_dtype is not None else None + self.amp_dtype = getattr(torch, self.config.amp_dtype) if self.config.amp_dtype is not None else None + + # Load model + if self.config.no_weights: + self.load_model_from_config() + else: + self.load_model_from_pretrained() + + # Eval mode + if self.config.eval_mode: + if self.is_diffusion_pipeline(): + LOGGER.info("\t+ Diffusion pipeline are always in eval mode") + else: + LOGGER.info("\t+ Turning on model's eval mode") + self.pretrained_model.eval() + + # BetterTransformer + if self.config.bettertransformer: + LOGGER.info("\t+ Using optimum.bettertransformer") + self.pretrained_model = BetterTransformer.transform( + self.pretrained_model, + keep_original_model=False, + ) + + # Compile model + if self.config.torch_compile: + if self.is_diffusion_pipeline(): + LOGGER.info("\t+ Using torch.compile on unet forward pass") + # TODO: should we compile vae and/or clip as well ? + self.pretrained_model.unet.forward = torch.compile( + self.pretrained_model.unet.forward, + **self.config.torch_compile_kwargs, + ) + else: + LOGGER.info("\t+ Using torch.compile on forward pass") + self.pretrained_model.forward = torch.compile( + self.pretrained_model.forward, + **self.config.torch_compile_kwargs, + ) + + def load_model_from_pretrained(self) -> None: + if self.config.quantization_strategy == "gptq": + LOGGER.info("\t+ Processing GPTQ config") + quantization_config = GPTQConfig(**self.config.quantization_config) + elif self.config.quantization_strategy == "bnb": + LOGGER.info("\t+ Processing BnB config") + quantization_config = BitsAndBytesConfig(**self.config.quantization_config) + else: + quantization_config = None + + if self.is_diffusion_pipeline(): + LOGGER.info("\t+ Loading diffusion pipeline") + self.pretrained_model = self.automodel_class.from_pretrained( + self.model, + torch_dtype=self.torch_dtype, + device_map=self.config.device_map, + **self.hub_kwargs, + ) + if self.config.device_map is None: + LOGGER.info(f"\t+ Moving diffusion pipeline to device: {self.device}") + # Diffusers does not support loading with torch.device context manager + self.pretrained_model.to(self.device) + else: + if self.config.device_map is not None: + LOGGER.info(f"\t+ Loading model on visible cuda devices with device_map: {self.config.device_map}") + self.pretrained_model = self.automodel_class.from_pretrained( + self.model, + torch_dtype=self.torch_dtype, + device_map=self.config.device_map, + quantization_config=quantization_config, + **self.hub_kwargs, + ) + else: + LOGGER.info(f"\t+ Loading model on device: {self.device}") + with self.device: + self.pretrained_model = self.automodel_class.from_pretrained( + self.model, + torch_dtype=self.torch_dtype, + quantization_config=quantization_config, + **self.hub_kwargs, + ) + + def load_model_from_config(self) -> None: + # TODO: create no_weights tests + LOGGER.info("\t+ Initializing empty weights model on device: meta") + with init_empty_weights(): + self.pretrained_model = self.automodel_class.from_config( + 
config=self.pretrained_config, + torch_dtype=self.config.torch_dtype, + trust_remote_code=self.hub_kwargs.get("trust_remote_code", False), + ) + + if self.config.quantization_strategy is not None: + LOGGER.info("\t+ Materializing model on cpu for quantization to not OOM") + self.pretrained_model.to_empty(device="cpu") + LOGGER.info("\t+ Randomizing model weights") + randomize_weights(self.pretrained_model) + LOGGER.info("\t+ Processing BnB config") + bnb_quantization_config = BnbQuantizationConfig( + **self.config.quantization_config, + torch_dtype=self.config.torch_dtype, + keep_in_fp32_modules=self.pretrained_model.keep_in_fp32_modules + if hasattr(self.pretrained_model, "keep_in_fp32_modules") + else None, + ) + LOGGER.info("\t+ Quantizing model while on cpu and dispatching to device") + self.pretrained_model = load_and_quantize_model( + self.pretrained_model, bnb_quantization_config, device_map=self.config.device_map or self.device + ) + else: + LOGGER.info(f"\t+ Materializing model on device: {self.device}") + self.pretrained_model.to_empty(device=self.device) + LOGGER.info("\t+ Randomizing model weights") + randomize_weights(self.pretrained_model) + + LOGGER.info("\t+ Tying weights") + self.pretrained_model.tie_weights() + + def prepare_for_profiling(self, input_names: List[str]) -> None: + LOGGER.info("Preparing model for profiling") + LOGGER.info("\t+ Symbolicly tracing model") + self.pretrained_model = symbolic_trace(self.pretrained_model, input_names=input_names) + LOGGER.info("\t+ Wrapping model with FXProfilingWrapper") + self.pretrained_model = FXProfilingWrapper(self.pretrained_model) + + def forward(self, input: Dict[str, torch.Tensor], **kwargs) -> "ModelOutput": + if self.is_diffusion_pipeline(): + return super().forward(input, **kwargs) + else: + # TODO: autocast as whole can be managed by one config/kwargs + with torch.autocast(device_type=self.device.type, dtype=self.amp_dtype, enabled=self.config.amp_autocast): + return super().forward(input, **kwargs) + + def generate(self, input: Dict[str, torch.Tensor], **kwargs) -> "ModelOutput": + if self.is_diffusion_pipeline(): + return super().generate(input, **kwargs) + else: + # TODO: autocast as whole can be managed by one config/kwargs + with torch.autocast(device_type=self.device.type, dtype=self.amp_dtype, enabled=self.config.amp_autocast): + return super().generate(input, **kwargs) + + @record + def train( + self, + training_dataset: "Dataset", + training_arguments: Dict[str, Any], + training_callbacks: List["TrainerCallback"], + training_data_collator: Callable, + ) -> "TrainerState": + args = ( + self.config.use_ddp, + self.pretrained_model, + training_dataset, + training_arguments, + training_callbacks, + training_data_collator, + ) + + if self.config.use_ddp: + # For DDP, we log only the state of the first rank as transformers does. + # since the batch size used in measuring the throughput is the one of world size. + ddp_config = LaunchConfig(**self.config.ddp_config) + results = elastic_launch(config=ddp_config, entrypoint=training_worker)(args)[0] + else: + # For DP, we can still use training_worker, simply not wrapped by the elastic_launch class. 
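The train() method above hands the worker function to torchelastic when use_ddp is set. A minimal sketch of that dispatch with a toy entrypoint; the LaunchConfig values mirror the DDP_CONFIG defaults introduced later in this patch, and the worker itself is a stand-in:

import os
from torch.distributed.launcher.api import LaunchConfig, elastic_launch

def toy_worker(args):
    # a real worker builds a transformers.Trainer here; this one just echoes its rank
    return f"rank {os.environ['RANK']} got {args}"

if __name__ == "__main__":
    launch_config = LaunchConfig(
        min_nodes=1,
        max_nodes=1,
        nproc_per_node=2,
        run_id="none",
        rdzv_backend="static",
        rdzv_endpoint="127.0.0.1:29500",
        rdzv_configs={"rank": 0, "timeout": 900},
        max_restarts=0,
        start_method="spawn",
    )
    # elastic_launch returns a dict mapping local rank to the worker's return value;
    # like the backend, only rank 0's result is kept
    results = elastic_launch(config=launch_config, entrypoint=toy_worker)(("toy", "args"))
    print(results[0])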
+ results = training_worker(args) + + return results + + def clean(self) -> None: + super().clean() + + if self.device.type == "cuda": + torch.cuda.empty_cache() + gc.collect() + + +def training_worker(args) -> "TrainerState": + use_ddp = args[0] + pretrained_model = args[1] + training_dataset = args[2] + training_arguments = args[3] + training_callbacks = args[4] + training_data_collator = args[5] + + if use_ddp: + LOGGER_WORKER = get_worker_logger("pytorch-ddp-worker", log_all=False) + env_variables = ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "TORCHELASTIC_MAX_RESTARTS"] + LOGGER_WORKER.info("Initializing DDP worker") + for env_var in env_variables: + LOGGER_WORKER.info(f"{env_var}: {os.environ.get(env_var)}") + else: + LOGGER_WORKER = LOGGER + + LOGGER_WORKER.info("\t+ Setting dataset format to `torch`.") + training_dataset.set_format(type="torch", columns=list(training_dataset.features.keys())) + LOGGER_WORKER.info("\t+ Wrapping training arguments with transformers.TrainingArguments") + training_arguments = TrainingArguments(**training_arguments) + LOGGER_WORKER.info("\t+ Wrapping model with transformers.Trainer") + trainer = Trainer( + model=pretrained_model, + args=training_arguments, + callbacks=training_callbacks, + train_dataset=training_dataset, + data_collator=training_data_collator, + ) + LOGGER_WORKER.info("\t+ Starting training") + trainer.train() + LOGGER_WORKER.info("\t+ Training finished successfully") + return trainer.state diff --git a/optimum_benchmark/backends/pytorch/config.py b/optimum_benchmark/backends/pytorch/config.py new file mode 100644 index 000000000..ab2cc8fa9 --- /dev/null +++ b/optimum_benchmark/backends/pytorch/config.py @@ -0,0 +1,143 @@ +import importlib.metadata +import os +from dataclasses import dataclass, field +from typing import Any, Dict, Optional + +from omegaconf import OmegaConf + +from ..base import BackendConfig + +OmegaConf.register_new_resolver( + "device_count", + lambda: len(os.environ.get("CUDA_VISIBLE_DEVICES", "").split(",")), +) +OmegaConf.register_new_resolver( + "is_inference", + lambda benchmark_name: benchmark_name == "inference", +) +OmegaConf.register_new_resolver( + "pytorch_version", + lambda: importlib.metadata.version("torch"), +) + +DEVICE_MAPS = ["auto", "sequential"] +AMP_DTYPES = ["bfloat16", "float16"] +TORCH_DTYPES = ["bfloat16", "float16", "float32", "auto"] + +GPTQ_CONFIG = { + "bits": 4, +} +BNB_CONFIG = { + "load_in_8bit": False, + "load_in_4bit": False, + "llm_int8_threshold": 0.0, +} +QUANTIZATION_CONFIGS = { + "gptq": GPTQ_CONFIG, + "bnb": BNB_CONFIG, +} +COMPILE_CONFIG = { + "fullgraph": False, + "dynamic": False, + "backend": "inductor", + "mode": None, + "options": None, + "disable": False, +} +# from launchConfig in https://github.com/pytorch/pytorch/blob/v2.0.0/torch/distributed/launcher/api.py#L29 adjusted +# to defaults of torch.distributed.run in https://github.com/pytorch/pytorch/blob/v2.0.0/torch/distributed/run.py#L770 +DDP_CONFIG = { + "min_nodes": 1, + "max_nodes": 1, + "run_id": "none", + "nproc_per_node": "${device_count:}", + "role": "default", + "rdzv_endpoint": "127.0.0.1:29500", + "rdzv_backend": "static", + "rdzv_configs": { + "timeout": 900, + "rank": 0, + }, + "max_restarts": 0, + "monitor_interval": 5, + "start_method": "spawn", + "log_dir": None, + "metrics_cfg": {}, + "local_addr": None, +} + + +@dataclass +class PyTorchConfig(BackendConfig): + name: str = "pytorch" + version: str = "${pytorch_version:}" + _target_: str = 
"optimum_benchmark.backends.pytorch.backned.PyTorchBackend" + + # load options + no_weights: bool = False + device_map: Optional[str] = None + torch_dtype: Optional[str] = None + + # inference options + disable_grad: bool = "${is_inference:${benchmark.name}}" + eval_mode: bool = "${is_inference:${benchmark.name}}" + + # automatic mixed precision options + amp_autocast: bool = False + amp_dtype: Optional[str] = None + + # compilation options + torch_compile: bool = False + torch_compile_config: Dict[str, Any] = field(default_factory=dict) + + # optimization options + bettertransformer: bool = False + + # quantization options + quantization_strategy: Optional[str] = None + quantization_config: Dict[str, Any] = field(default_factory=dict) + + # training options + use_ddp: bool = False + ddp_config: Dict[str, Any] = field(default_factory=dict) + + def __post_init__(self): + CUDA_VISIBLE_DEVICES = os.environ.get("CUDA_VISIBLE_DEVICES", None) + + if self.torch_compile: + self.torch_compile_config = OmegaConf.to_container( + OmegaConf.merge(COMPILE_CONFIG, self.torch_compile_config) + ) + + if self.device_map is not None: + assert CUDA_VISIBLE_DEVICES is not None, "`device_map` can only be used when CUDA_VISIBLE_DEVICES is set." + + if self.device_map not in DEVICE_MAPS: + raise ValueError(f"`device_map` must be one of {DEVICE_MAPS}. Got {self.device_map} instead.") + + if self.torch_dtype is not None: + if self.torch_dtype not in TORCH_DTYPES: + raise ValueError(f"`torch_dtype` must be one of {TORCH_DTYPES}. Got {self.torch_dtype} instead.") + + if self.amp_dtype is not None: + if self.amp_dtype not in AMP_DTYPES: + raise ValueError(f"`amp_dtype` must be one of {AMP_DTYPES}. Got {self.amp_dtype} instead.") + + if self.quantization_strategy is not None: + if self.quantization_strategy not in QUANTIZATION_CONFIGS: + raise ValueError( + f"`quantization_strategy` must be one of {list(QUANTIZATION_CONFIGS.keys())}. Got {self.quantization_strategy} instead." 
+ ) + QUANTIZATION_CONFIG = QUANTIZATION_CONFIGS[self.quantization_strategy] + self.quantization_config = OmegaConf.to_container( + OmegaConf.merge(QUANTIZATION_CONFIG, self.quantization_config) + ) + + if self.use_ddp: + if CUDA_VISIBLE_DEVICES is None: + raise ValueError("`use_ddp` can only be used when CUDA_VISIBLE_DEVICES is set.") + + self.ddp_config = OmegaConf.to_container(OmegaConf.merge(DDP_CONFIG, self.ddp_config), resolve=True) + # TODO: check if it's not possible to use DDP with multiple nodes + if self.ddp_config["max_nodes"] > 1 or self.ddp_config["min_nodes"] > 1: + raise NotImplementedError("Currently, PyTorch DDP benchmark only supports training on a single node.") diff --git a/optimum_benchmark/backends/pytorch/utils.py b/optimum_benchmark/backends/pytorch/utils.py new file mode 100644 index 000000000..38cecdf5e --- /dev/null +++ b/optimum_benchmark/backends/pytorch/utils.py @@ -0,0 +1,35 @@ +import logging.config +import os +from logging import getLogger +from typing import Optional + +import torch +from omegaconf import OmegaConf + + +def randomize_weights(model): + for param in model.parameters(): + if torch.cuda.is_available() and param.device.type == "cpu": + # we take advantage of the fact that a cuda device + # is available to use cuda kernels for randomization + # this is slower than asynchronous randomization while + # model is fully on gpu (because of data transfer) but + # faster than randomization while model is on cpu + param.data.cuda().normal_(mean=0.0, std=0.2).cpu() + else: + param.data.normal_(mean=0.0, std=0.2) + + +def get_worker_logger( + name: Optional[str] = None, + log_all: bool = False, +) -> logging.Logger: + """PyTorch DDP subprocesses do not inherit from Hydra logger. + Thus, we need to reconfigure the logger for the workers. 
+ """ + if os.environ["RANK"] == "0" or log_all: + # TODO: also configure logging for other ranks + hydra_conf = OmegaConf.load(".hydra/hydra.yaml") + logging.config.dictConfig(OmegaConf.to_container(hydra_conf.hydra.job_logging, resolve=True)) + + return getLogger(name) diff --git a/optimum_benchmark/backends/utils.py b/optimum_benchmark/backends/utils.py new file mode 100644 index 000000000..38df49a93 --- /dev/null +++ b/optimum_benchmark/backends/utils.py @@ -0,0 +1,176 @@ +import os +import signal +import subprocess +import time +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union + +if TYPE_CHECKING: + from transformers import ( + FeatureExtractionMixin, + ImageProcessingMixin, + Pipeline, + PretrainedConfig, + PreTrainedTokenizer, + ProcessorMixin, + ) + + PreTrainedProcessor = Union[ + PreTrainedTokenizer, + ImageProcessingMixin, + FeatureExtractionMixin, + ProcessorMixin, + ] + + +def extract_shapes_from_diffusion_pipeline(pipeline: "Pipeline") -> Dict[str, Any]: + # this is the only way I found to extract a diffusion pipeline's "input" shapes + shapes = {} + if hasattr(pipeline, "vae_encoder") and hasattr(pipeline.vae_encoder, "config"): + shapes["num_channels"] = pipeline.vae_encoder.config["out_channels"] + shapes["height"] = pipeline.vae_encoder.config["sample_size"] + shapes["width"] = pipeline.vae_encoder.config["sample_size"] + elif hasattr(pipeline, "vae") and hasattr(pipeline.vae, "config"): + shapes["num_channels"] = pipeline.vae.config.out_channels + shapes["height"] = pipeline.vae.config.sample_size + shapes["width"] = pipeline.vae.config.sample_size + else: + shapes["num_channels"] = -1 + shapes["height"] = -1 + shapes["width"] = -1 + + return shapes + + +def extract_shapes_from_model_artifacts( + config: "PretrainedConfig", processor: Optional["PreTrainedProcessor"] = None +) -> Dict[str, Any]: + shapes = {} + artifacts_dict = {} + + config_dict = {k: v for k, v in config.to_dict().items() if v is not None} + artifacts_dict.update(config_dict) + + if processor is not None and hasattr(processor, "to_dict"): + processor_dict = {k: v for k, v in processor.to_dict().items() if v is not None} + artifacts_dict.update(processor_dict) + + # text input + shapes["vocab_size"] = artifacts_dict.get("vocab_size", 2) + shapes["type_vocab_size"] = artifacts_dict.get("type_vocab_size", 2) + + # image input + shapes["num_channels"] = artifacts_dict.get("num_channels", None) + + image_size = artifacts_dict.get("image_size", None) + if image_size is None: + # processors have different names for the image size + image_size = artifacts_dict.get("size", None) + + if isinstance(image_size, (int, float)): + shapes["height"] = image_size + shapes["width"] = image_size + elif isinstance(image_size, (list, tuple)): + shapes["height"] = image_size[0] + shapes["width"] = image_size[0] + elif isinstance(image_size, dict) and len(image_size) == 2: + shapes["height"] = list(image_size.values())[0] + shapes["width"] = list(image_size.values())[1] + elif isinstance(image_size, dict) and len(image_size) == 1: + shapes["height"] = list(image_size.values())[0] + shapes["width"] = list(image_size.values())[0] + else: + shapes["height"] = None + shapes["width"] = None + + # classification labels (default to 2) + shapes["num_labels"] = len(artifacts_dict.get("id2label", {"0": "LABEL_0", "1": "LABEL_1"})) + + # object detection labels (default to 2) + shapes["num_queries"] = artifacts_dict.get("num_queries", 2) + + return shapes + + +def check_no_process_is_running_on_cuda_device(device_ids: 
List[int]) -> None: + """Raises a RuntimeError if any process is running on the given cuda device.""" + for device_id in device_ids: + # get list of all PIDs running on nvidia devices + pids = [ + int(pid) + for pid in subprocess.check_output(["nvidia-smi", "--query-compute-apps=pid", "--format=csv,noheader"]) + .decode() + .strip() + .split("\n") + if pid != "" + ] + + # get list of PIDs running on cuda device_id + pids_on_device_id = { + pid + for pid in pids + if subprocess.check_output( + [ + "nvidia-smi", + "--query-compute-apps=pid,used_memory", + "--format=csv,noheader,nounits", + f"--id={device_id}", + ] + ) + .decode() + .startswith(f"{pid},") + } + + # TODO: It would be safer to run each run of a sweep in a subprocess. + # Although we can trust PyTorch to clear GPU memory when asked, + # it is not a safe assumption to make for all backends. + if len(pids_on_device_id) > 1 or (len(pids_on_device_id) == 1 and os.getpid() not in pids_on_device_id): + raise RuntimeError( + f"Expected no processes on device {device_id}, " + f"found {len(pids_on_device_id)} processes " + f"with PIDs {pids_on_device_id}." + ) + + +def check_only_this_process_is_running_on_cuda_device(device_ids: List[int], pid) -> None: + """Raises a RuntimeError if at any point in time, there is a process running + on the given cuda device that is not the current process. + """ + while True: + # get list of all PIDs running on nvidia devices + pids = [ + int(pid) + for pid in subprocess.check_output(["nvidia-smi", "--query-compute-apps=pid", "--format=csv,noheader"]) + .decode() + .strip() + .split("\n") + if pid != "" + ] + + for device_id in device_ids: + # get list of PIDs running on cuda device_id + pids_on_device_id = { + pid + for pid in pids + if subprocess.check_output( + [ + "nvidia-smi", + "--query-compute-apps=pid,used_memory", + "--format=csv,noheader,nounits", + f"--id={device_id}", + ] + ) + .decode() + .startswith(f"{pid},") + } + + # check if there is a process running on device_id that is not the current process + if len(pids_on_device_id) > 1: + os.kill(pid, signal.SIGTERM) + raise RuntimeError( + f"Expected only process {pid} on device {device_id}, " + f"found {len(pids_on_device_id)} processes " + f"with PIDs {pids_on_device_id}." 
+ ) + + # sleep for 1 second + time.sleep(1) diff --git a/optimum_benchmark/backends/utils/base_utils.py b/optimum_benchmark/backends/utils/base_utils.py deleted file mode 100644 index 7f357be9d..000000000 --- a/optimum_benchmark/backends/utils/base_utils.py +++ /dev/null @@ -1,92 +0,0 @@ -from typing import Any, Dict, Optional, Union - -from diffusers import DiffusionPipeline -from transformers import ( - ProcessorMixin, - PretrainedConfig, - PreTrainedTokenizer, - ImageProcessingMixin, - FeatureExtractionMixin, -) - - -PreTrainedProcessor = Union[ - PreTrainedTokenizer, - ImageProcessingMixin, - FeatureExtractionMixin, - ProcessorMixin, -] - - -def extract_shapes_from_diffusion_pipeline( - pipeline: DiffusionPipeline, -) -> Dict[str, Any]: - # this is the only way I found to extract a diffusion pipeline's "input" shapes - shapes = {} - if hasattr(pipeline, "vae_encoder") and hasattr(pipeline.vae_encoder, "config"): - shapes["num_channels"] = pipeline.vae_encoder.config["out_channels"] - shapes["height"] = pipeline.vae_encoder.config["sample_size"] - shapes["width"] = pipeline.vae_encoder.config["sample_size"] - elif hasattr(pipeline, "vae") and hasattr(pipeline.vae, "config"): - shapes["num_channels"] = pipeline.vae.config.out_channels - shapes["height"] = pipeline.vae.config.sample_size - shapes["width"] = pipeline.vae.config.sample_size - else: - shapes["num_channels"] = -1 - shapes["height"] = -1 - shapes["width"] = -1 - - return shapes - - -def extract_shapes_from_model_artifacts( - config: PretrainedConfig, - processor: Optional[PreTrainedProcessor] = None, -) -> Dict[str, Any]: - shapes = {} - artifacts_dict = {} - - config_dict = {k: v for k, v in config.to_dict().items() if v is not None} - artifacts_dict.update(config_dict) - - if processor is not None and hasattr(processor, "to_dict"): - processor_dict = {k: v for k, v in processor.to_dict().items() if v is not None} - artifacts_dict.update(processor_dict) - - # text input - shapes["vocab_size"] = artifacts_dict.get("vocab_size", 2) - shapes["type_vocab_size"] = artifacts_dict.get("type_vocab_size", 2) - - # image input - shapes["num_channels"] = artifacts_dict.get("num_channels", None) - - image_size = artifacts_dict.get("image_size", None) - if image_size is None: - # processors have different names for the image size - image_size = artifacts_dict.get("size", None) - - if isinstance(image_size, (int, float)): - shapes["height"] = image_size - shapes["width"] = image_size - elif isinstance(image_size, (list, tuple)): - shapes["height"] = image_size[0] - shapes["width"] = image_size[0] - elif isinstance(image_size, dict) and len(image_size) == 2: - shapes["height"] = list(image_size.values())[0] - shapes["width"] = list(image_size.values())[1] - elif isinstance(image_size, dict) and len(image_size) == 1: - shapes["height"] = list(image_size.values())[0] - shapes["width"] = list(image_size.values())[0] - else: - shapes["height"] = None - shapes["width"] = None - - # classification labels (default to 2) - shapes["num_labels"] = len( - artifacts_dict.get("id2label", {"0": "LABEL_0", "1": "LABEL_1"}) - ) - - # object detection labels (default to 2) - shapes["num_queries"] = artifacts_dict.get("num_queries", 2) - - return shapes diff --git a/optimum_benchmark/backends/utils/neural_compressor_utils.py b/optimum_benchmark/backends/utils/neural_compressor_utils.py deleted file mode 100644 index 96632df48..000000000 --- a/optimum_benchmark/backends/utils/neural_compressor_utils.py +++ /dev/null @@ -1,39 +0,0 @@ 
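The DEFAULT_* dictionaries deleted below are replaced by module-level defaults that each new config class merges with user overrides via OmegaConf in its __post_init__. A minimal sketch of that merge pattern (the keys and values are illustrative):

from omegaconf import OmegaConf

defaults = {"bits": 4, "approach": "static"}
user_overrides = {"approach": "dynamic"}

# user values win; the result is converted back to a plain dict, as in the new __post_init__ methods
merged = OmegaConf.to_container(OmegaConf.merge(defaults, user_overrides))
assert merged == {"bits": 4, "approach": "dynamic"}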
-DEFAULT_QUANTIZATION_CONFIG = { - "device": "cpu", - "backend": "default", - "domain": "auto", - "recipes": {}, - "quant_format": "default", - "inputs": [], - "outputs": [], - "approach": "static", - "calibration_sampling_size": [100], - "op_type_dict": None, - "op_name_dict": None, - "reduce_range": None, - "example_inputs": None, - "excluded_precisions": [], - "quant_level": "auto", - "accuracy_criterion": { - "higher_is_better": True, - "criterion": "relative", - "tolerable_loss": 0.01, - }, - "tuning_criterion": { - "strategy": "basic", - "strategy_kwargs": None, - "timeout": 0, - "max_trials": 100, - "objective": "performance", - }, - "diagnosis": False, -} - -DEFAULT_CALIBRATION_CONFIG = { - "dataset_name": "glue", - "num_samples": 300, - "dataset_config_name": "sst2", - "dataset_split": "train", - "preprocess_batch": True, - "preprocess_class": "optimum_benchmark.preprocessors.glue.GluePreprocessor", -} diff --git a/optimum_benchmark/backends/utils/onnxruntime_utils.py b/optimum_benchmark/backends/utils/onnxruntime_utils.py deleted file mode 100644 index 65568458a..000000000 --- a/optimum_benchmark/backends/utils/onnxruntime_utils.py +++ /dev/null @@ -1,94 +0,0 @@ -from typing import Any, Dict - - -DEFAULT_OPTIMIZATION_CONFIG = { - "optimization_level": 1, # 0, 1, 2, 99 - "optimize_for_gpu": "${is_gpu:${device}}", - "fp16": False, - "enable_transformers_specific_optimizations": True, - "enable_gelu_approximation": False, - "disable_gelu_fusion": False, - "disable_layer_norm_fusion": False, - "disable_attention_fusion": False, - "disable_skip_layer_norm_fusion": True, - "disable_bias_skip_layer_norm_fusion": False, - "disable_bias_gelu_fusion": False, - "use_mask_index": False, - "no_attention_mask": False, - "disable_embed_layer_norm_fusion": True, - "disable_shape_inference": False, - "use_multi_head_attention": False, - "enable_gemm_fast_gelu_fusion": False, - "use_raw_attention_mask": False, - "disable_group_norm_fusion": True, - "disable_packed_kv": True, -} - -DEFAULT_QUANTIZATION_CONFIG = { - "is_static": False, - "format": "QOperator", # QOperator, QDQ - "mode": "IntegerOps", # QLinearOps, IntegerOps - "activations_dtype": "QUInt8", # QInt8, QUInt8 - "activations_symmetric": False, - "weights_dtype": "QInt8", # QInt8, QUInt8 - "weights_symmetric": True, - "per_channel": False, - "reduce_range": False, - "operators_to_quantize": [ - "MatMul", - "Add", - ], -} - -DEFAULT_CALIBRATION_CONFIG = { - "dataset_name": "glue", - "num_samples": 300, - "dataset_config_name": "sst2", - "dataset_split": "train", - "preprocess_batch": True, - "preprocess_class": "optimum_benchmark.preprocessors.glue.GluePreprocessor", -} - - -def infer_device_id(device: str) -> int: - """ - Infer the device id from the given device string. - """ - - import torch - - if device == "cuda": - return torch.cuda.current_device() - elif torch.device(device).type == "cuda": - return torch.device(device).index - elif torch.device(device).type == "cpu": - return -1 - else: - raise ValueError(f"Unknown device '{device}'") - - -def format_ort_quantization_dict(quantization_dict: Dict[str, Any]) -> None: - """ - Format the quantization dictionary for onnxruntime. 
- """ - - from onnxruntime.quantization import QuantFormat, QuantizationMode, QuantType - - if quantization_dict.get("format", None) is not None: - quantization_dict["format"] = QuantFormat.from_string( - quantization_dict["format"] - ) - if quantization_dict.get("mode", None) is not None: - quantization_dict["mode"] = QuantizationMode.from_string( - quantization_dict["mode"] - ) - if quantization_dict.get("activations_dtype", None) is not None: - quantization_dict["activations_dtype"] = QuantType.from_string( - quantization_dict["activations_dtype"] - ) - if quantization_dict.get("weights_dtype", None) is not None: - quantization_dict["weights_dtype"] = QuantType.from_string( - quantization_dict["weights_dtype"] - ) - - return quantization_dict diff --git a/optimum_benchmark/backends/utils/openvino_utils.py b/optimum_benchmark/backends/utils/openvino_utils.py deleted file mode 100644 index 0f1037b77..000000000 --- a/optimum_benchmark/backends/utils/openvino_utils.py +++ /dev/null @@ -1,14 +0,0 @@ -DEFAULT_QUANTIZATION_CONFIG = { - "compression": None, - "input_info": None, - "save_onnx_model": False, -} - -DEFAULT_CALIBRATION_CONFIG = { - "dataset_name": "glue", - "num_samples": 300, - "dataset_config_name": "sst2", - "dataset_split": "train", - "preprocess_batch": True, - "preprocess_class": "optimum_benchmark.preprocessors.glue.GluePreprocessor", -} diff --git a/optimum_benchmark/backends/utils/pytorch_utils.py b/optimum_benchmark/backends/utils/pytorch_utils.py deleted file mode 100644 index 04a2dbecb..000000000 --- a/optimum_benchmark/backends/utils/pytorch_utils.py +++ /dev/null @@ -1,78 +0,0 @@ -from logging import getLogger -from typing import Optional -import logging.config -import os - -import torch -from omegaconf import OmegaConf -from torch.distributed.elastic.multiprocessing import Std - -OmegaConf.register_new_resolver("device_count", lambda: torch.cuda.device_count()) - - -DEFAULT_COMPILE_CONFIG = { - "fullgraph": False, - "dynamic": False, - "backend": "inductor", - "mode": None, - "options": None, - "disable": False, -} - -# from https://github.com/pytorch/pytorch/blob/v2.0.0/torch/distributed/launcher/api.py#L29 -# adjusted to the defaults of torch.distributed.run -# defined in https://github.com/pytorch/pytorch/blob/v2.0.0/torch/distributed/run.py#L770 -# TODO: decide wrther to use torch.distributed.run arguments or the ones from -# torch.distributed.launcher.api -DEFAULT_DDP_CONFIG = { - "min_nodes": 1, - "max_nodes": 1, - "run_id": "none", - "nproc_per_node": "${device_count:}", - "role": "default", - "rdzv_endpoint": "127.0.0.1:29500", - "rdzv_backend": "static", - "rdzv_configs": { - "timeout": 900, - "rank": 0, - }, - "max_restarts": 0, - "monitor_interval": 5, - "start_method": "spawn", - "log_dir": None, - "metrics_cfg": {}, - "local_addr": None, - "redirects": Std.NONE, - "tee": Std.NONE, -} - - -def randomize_weights(model): - for param in model.parameters(): - if torch.cuda.is_available() and param.device.type == "cpu": - # we take advantage of the fact that a cuda device - # is available to use cuda kernels for randomization - # this is slower than asynchronous randomization while - # model is fully on gpu (because of data transfer) but - # faster than randomization while model is on cpu - param.data.cuda().normal_(mean=0.0, std=0.2).cpu() - else: - param.data.normal_(mean=0.0, std=0.2) - - -def get_worker_logger( - name: Optional[str] = None, - log_all: bool = False, -) -> logging.Logger: - """ - PyTorch DDP subprocesses do not inherit from Hydra logger. 
- Thus, we need to reconfigure the logger for the workers. - """ - if os.environ["RANK"] == "0" or log_all: - # TODO: also configure logging for other ranks - hydra_conf = OmegaConf.load(".hydra/hydra.yaml") - logging.config.dictConfig( - OmegaConf.to_container(hydra_conf.hydra.job_logging, resolve=True) - ) - - return getLogger(name) diff --git a/optimum_benchmark/benchmarks/base.py b/optimum_benchmark/benchmarks/base.py index da2721e5d..24cc27961 100644 --- a/optimum_benchmark/benchmarks/base.py +++ b/optimum_benchmark/benchmarks/base.py @@ -1,10 +1,10 @@ +from abc import ABC from dataclasses import dataclass from logging import getLogger -from abc import ABC +from typing import ClassVar, Generic, TypeVar from optimum_benchmark.backends.base import Backend - LOGGER = getLogger("benchmark") @@ -14,15 +14,19 @@ class BenchmarkConfig(ABC): _target_: str -class Benchmark(ABC): - name: str - config: BenchmarkConfig +BenchmarkConfigT = TypeVar("BenchmarkConfigT", bound=BenchmarkConfig) + + +class Benchmark(Generic[BenchmarkConfigT], ABC): + NAME: ClassVar[str] + + config: BenchmarkConfigT def __init__(self) -> None: pass - def configure(self, config: BenchmarkConfig) -> None: - LOGGER.info(f"Configuring {self.name} benchmark") + def configure(self, config: BenchmarkConfigT) -> None: + LOGGER.info(f"Configuring {self.NAME} benchmark") self.config = config def run(self, backend: Backend) -> None: diff --git a/optimum_benchmark/benchmarks/inference.py b/optimum_benchmark/benchmarks/inference.py index afded80c4..eadbc61c5 100644 --- a/optimum_benchmark/benchmarks/inference.py +++ b/optimum_benchmark/benchmarks/inference.py @@ -1,26 +1,18 @@ +import statistics from dataclasses import dataclass, field -from typing import List, Dict, Optional from logging import getLogger -from omegaconf import OmegaConf - +from typing import Any, Dict, List, Optional +from omegaconf import OmegaConf from pandas import DataFrame -import statistics - from ..backends.base import Backend -from .base import Benchmark, BenchmarkConfig from ..generators.input_generator import InputGenerator -from ..utils import TEXT_GENERATION_TASKS, DIFFUSION_TASKS -from ..trackers.memory import memory_tracker_class_for_backend +from ..task_utils import DIFFUSION_TASKS, TEXT_GENERATION_TASKS from ..trackers.latency import latency_tracker_class_for_backend -from .inference_utils import ( - three_sig_figs, - DEFAULT_INPUT_SHAPES, - DEFAULT_GENERATE_KWARGS, - DEFAULT_DIFUSION_KWARGS, -) - +from ..trackers.memory import memory_tracker_class_for_backend +from .base import Benchmark, BenchmarkConfig +from .utils import three_significant_digits_wrapper LOGGER = getLogger("inference") @@ -33,6 +25,19 @@ lambda task: task in DIFFUSION_TASKS, ) +GENERATE_CONFIG = { + "max_new_tokens": 100, + "min_new_tokens": 100, + "do_sample": False, + "use_cache": True, + "pad_token_id": 0, + "num_beams": 1, +} + +DIFUSION_CONFIG = { + "num_images_per_prompt": 1, +} + @dataclass class InferenceConfig(BenchmarkConfig): @@ -41,14 +46,25 @@ class InferenceConfig(BenchmarkConfig): # benchmark options memory: bool = False - warmup_runs: int = 10 duration: int = 10 - # TODO: deprecate this and use `benchmark.duration` + warmup_runs: int = 10 benchmark_duration: Optional[int] = None # input options input_shapes: Dict = field( - default_factory=lambda: DEFAULT_INPUT_SHAPES, + default_factory=lambda: { + # used with all tasks + "batch_size": 2, + # used with text input tasks + "sequence_length": 16, + # used with multiple choice tasks where input + # is of shape 
(batch_size, num_choices, sequence_length) + "num_choices": 1, + # used with audio input tasks + "feature_size": 80, + "nb_max_frames": 3000, + "audio_sequence_length": 16000, + }, ) # TODO: deprecate this and use `benchamrk.generate_kwargs` @@ -56,54 +72,40 @@ class InferenceConfig(BenchmarkConfig): # forward options can_diffuse: bool = "${can_diffuse:${task}}" - forward_kwargs: Optional[Dict] = None + forward_kwargs: Dict[str, Any] = field(default_factory=dict) # generation options can_generate: bool = "${can_generate:${task}}" - generate_kwargs: Optional[Dict] = None + generate_kwargs: Dict[str, Any] = field(default_factory=dict) def __post_init__(self): + if self.can_diffuse: + self.forward_kwargs = OmegaConf.to_container(OmegaConf.merge(self.forward_kwargs, DIFUSION_CONFIG)) + if self.can_generate: - self.generate_kwargs = OmegaConf.merge( - self.generate_kwargs or {}, - DEFAULT_GENERATE_KWARGS, - ) + self.generate_kwargs = OmegaConf.to_container(OmegaConf.merge(self.generate_kwargs, GENERATE_CONFIG)) - if self.can_diffuse: - self.forward_kwargs = OmegaConf.merge( - self.forward_kwargs or {}, - DEFAULT_DIFUSION_KWARGS, - ) + if self.generate_kwargs["max_new_tokens"] != self.generate_kwargs["min_new_tokens"]: + raise ValueError("`max_new_tokens` and `min_new_tokens` must be equal for fixed length output.") if self.new_tokens is not None: LOGGER.warning( - "The `new_tokens` option is deprecated, please use `generate_kwargs` " - "instead. `max_new_tokens` and `min_new_tokens` will be set to the " - "value of `new_tokens`." + "The `new_tokens` option is deprecated, please use `generate_kwargs` instead. " + "`generate_kwargs.max_new_tokens` and `generate_kwargs.min_new_tokens` will be set to the value of `new_tokens`." ) self.generate_kwargs["max_new_tokens"] = self.new_tokens self.generate_kwargs["min_new_tokens"] = self.new_tokens - if self.generate_kwargs is not None: - assert ( - self.generate_kwargs["max_new_tokens"] - == self.generate_kwargs["min_new_tokens"] - ), ( - "`max_new_tokens` and `min_new_tokens` " - "must be equal for fixed length output" - ) - - if self.benchmark_duration is not None: + if self.benchmark_duration: LOGGER.warning( - "The `benchmark_duration` option is deprecated, please use `duration` " - "instead. `duration` will be set to the value of `benchmark_duration`." + "The `benchmark_duration` option is deprecated, please use `duration` instead. " + "`duration` will be set to the value of `benchmark_duration`." 
) self.duration = self.benchmark_duration -class InferenceBenchmark(Benchmark): - name: str = "inference" - config: InferenceConfig +class InferenceBenchmark(Benchmark[InferenceConfig]): + NAME = "inference" def __init__(self): # initialize inference results @@ -114,12 +116,6 @@ def __init__(self): def configure(self, config: InferenceConfig): super().configure(config) - if self.config.forward_kwargs is None: - self.config.forward_kwargs = {} - - if self.config.generate_kwargs is None: - self.config.generate_kwargs = {} - def run(self, backend: Backend) -> None: LOGGER.info("Running inference benchmark") self.config.input_shapes.update(backend.model_shapes) @@ -130,10 +126,6 @@ def run(self, backend: Backend) -> None: pretrained_config=backend.pretrained_config, ) - if self.config.memory: - # if requested, run memory tracking - self.run_memory_tracking(backend) - # run forward pass tracking self.run_forward_tracking(backend) @@ -141,32 +133,12 @@ def run(self, backend: Backend) -> None: # if possible, run generation pass tracking self.run_generate_tracking(backend) - def run_memory_tracking(self, backend: Backend) -> None: - memory_input = self.input_generator.generate( - mode="forward", - ) - - for key, value in memory_input.items(): - if key == "prompt": - continue - memory_input[key] = value.to(backend.device) - - # for backends that require compilation with static shapes - backend.prepare_for_inference(input_shapes=self.config.input_shapes) - - LOGGER.info("\t+ Tracking forward pass peak memory") - memory_tracker = memory_tracker_class_for_backend[backend.config.name](backend) - with memory_tracker.track(interval=self.config.duration // 100): - _ = backend.forward(memory_input) - - self.forward_peak_memory = memory_tracker.get_peak_memory() - LOGGER.info(f"\t+ Forward pass peak memory: {self.forward_peak_memory} (MB)") - def run_forward_tracking(self, backend: Backend) -> None: forward_input = self.input_generator.generate( mode="forward", ) + # TODO: can be handled by the backend later for key, value in forward_input.items(): if key == "prompt": continue @@ -180,24 +152,30 @@ def run_forward_tracking(self, backend: Backend) -> None: _ = backend.forward(forward_input, **self.config.forward_kwargs) LOGGER.info("\t+ Tracking forward pass latency and throughput") - latency_tracker = latency_tracker_class_for_backend[backend.config.name]( - backend - ) + latency_tracker = latency_tracker_class_for_backend[backend.config.name](backend) while sum(self.forward_latencies) < self.config.duration: with latency_tracker.track(): _ = backend.forward(forward_input, **self.config.forward_kwargs) self.forward_latencies = latency_tracker.get_latencies() LOGGER.info(f"\t+ Forward pass latency: {self.forward_latency:.2e} (s)") - LOGGER.info( - f"\t+ Forward pass throughput: {self.forward_throughput:.2f} (samples/s)" - ) + LOGGER.info(f"\t+ Forward pass throughput: {self.forward_throughput:.2f} (samples/s)") + + if self.config.memory: + LOGGER.info("\t+ Tracking forward pass peak memory") + memory_tracker = memory_tracker_class_for_backend[backend.config.name](backend) + with memory_tracker.track(interval=self.config.duration // 100): + _ = backend.forward(forward_input) + + self.forward_peak_memory = memory_tracker.get_peak_memory() + LOGGER.info(f"\t+ Forward pass peak memory: {self.forward_peak_memory} (MB)") def run_generate_tracking(self, backend: Backend) -> None: generate_input = self.input_generator.generate( - mode="forward", + mode="generate", ) + # TODO: can be handled by the backend later for 
key, value in generate_input.items(): if key == "prompt": continue @@ -210,9 +188,7 @@ def run_generate_tracking(self, backend: Backend) -> None: ) LOGGER.info("\t+ Tracking generation latency and throughput") - latency_tracker = latency_tracker_class_for_backend[backend.config.name]( - backend - ) + latency_tracker = latency_tracker_class_for_backend[backend.config.name](backend) while sum(self.generate_latencies) < self.config.duration: with latency_tracker.track(): _ = backend.generate( @@ -222,35 +198,33 @@ def run_generate_tracking(self, backend: Backend) -> None: self.generate_latencies = latency_tracker.get_latencies() LOGGER.info(f"\t+ Generation pass latency: {self.generate_latency:.2e} (s)") - - LOGGER.info( - f"\t+ Generation pass throughput: {self.generate_throughput:.2f} (tokens/s)" - ) + LOGGER.info(f"\t+ Generation pass throughput: {self.generate_throughput:.2f} (tokens/s)") # Metrics @property - @three_sig_figs + @three_significant_digits_wrapper def forward_latency(self) -> float: return statistics.mean(self.forward_latencies) @property - @three_sig_figs + @three_significant_digits_wrapper def forward_throughput(self) -> float: - return ( - self.config.input_shapes["batch_size"] - * self.config.forward_kwargs["num_images_per_prompt"] - / self.forward_latency - if self.config.can_diffuse - else self.config.input_shapes["batch_size"] / self.forward_latency - ) + if self.config.can_diffuse: + return ( + self.config.input_shapes["batch_size"] + * self.config.forward_kwargs["num_images_per_prompt"] + / self.forward_latency + ) + else: + return self.config.input_shapes["batch_size"] / self.forward_latency @property - @three_sig_figs + @three_significant_digits_wrapper def generate_latency(self) -> float: return statistics.mean(self.generate_latencies) @property - @three_sig_figs + @three_significant_digits_wrapper def generate_throughput(self) -> float: return ( self.config.generate_kwargs["min_new_tokens"] @@ -259,14 +233,18 @@ def generate_throughput(self) -> float: ) def get_results_df(self) -> DataFrame: - results_dict = dict() + results_dict = {} + + results_dict["forward.latency(s)"] = self.forward_latency + + if self.config.can_diffuse: + results_dict["forward.throughput(images/s)"] = self.forward_throughput + else: + results_dict["forward.throughput(samples/s)"] = self.forward_throughput if self.config.memory: results_dict["forward.peak_memory(MB)"] = self.forward_peak_memory - results_dict["forward.latency(s)"] = self.forward_latency - results_dict["forward.throughput(samples/s)"] = self.forward_throughput - if self.config.can_generate: results_dict["generate.latency(s)"] = self.generate_latency results_dict["generate.throughput(tokens/s)"] = self.generate_throughput diff --git a/optimum_benchmark/benchmarks/inference_utils.py b/optimum_benchmark/benchmarks/inference_utils.py deleted file mode 100644 index b2280cdc3..000000000 --- a/optimum_benchmark/benchmarks/inference_utils.py +++ /dev/null @@ -1,37 +0,0 @@ -DEFAULT_GENERATE_KWARGS = { - "max_new_tokens": 100, - "min_new_tokens": 100, - "do_sample": False, - "use_cache": True, - "pad_token_id": 0, - "num_beams": 1, -} - -DEFAULT_DIFUSION_KWARGS = { - "num_images_per_prompt": 1, -} - -DEFAULT_INPUT_SHAPES = { - # used with all tasks - "batch_size": 2, - # used with text input tasks - "sequence_length": 16, - # used with multiple choice tasks where input - # is of shape (batch_size, num_choices, sequence_length) - "num_choices": 1, - # used with audio input tasks - "feature_size": 80, - "nb_max_frames": 3000, - 
"audio_sequence_length": 16000, -} - - -def format_float(x: float) -> float: - return float(f"{x:.3g}") - - -def three_sig_figs(func): - def wrapper(*args, **kwargs): - return format_float(func(*args, **kwargs)) - - return wrapper diff --git a/optimum_benchmark/benchmarks/training.py b/optimum_benchmark/benchmarks/training.py index 6ba1ab20b..84b0be949 100644 --- a/optimum_benchmark/benchmarks/training.py +++ b/optimum_benchmark/benchmarks/training.py @@ -1,15 +1,14 @@ -from typing import Any, Dict from dataclasses import dataclass, field from logging import getLogger +from typing import Any, Dict from omegaconf import OmegaConf from pandas import DataFrame from ..backends.base import Backend -from .base import Benchmark, BenchmarkConfig from ..generators.dataset_generator import DatasetGenerator -from .training_utils import MeasurementCallback, get_data_collator - +from .base import Benchmark, BenchmarkConfig +from .utils import MeasurementCallback, get_data_collator LOGGER = getLogger("training") @@ -23,7 +22,7 @@ class TrainingConfig(BenchmarkConfig): _target_: str = "optimum_benchmark.benchmarks.training.TrainingBenchmark" # training options - warmup_steps: int = 2 + warmup_steps: int = 10 # dataset options dataset_shapes: Dict = field( @@ -47,7 +46,8 @@ class TrainingConfig(BenchmarkConfig): default_factory=lambda: { # these are arguments that we set by default # but can be overwritten by the user - "skip_memory_metrics": False, + "skip_memory_metrics": True, + # memory metrics are wrong when using multiple processes "output_dir": "./trainer_output", "use_cpu": "${is_cpu:${device}}", "ddp_find_unused_parameters": False, @@ -58,9 +58,8 @@ class TrainingConfig(BenchmarkConfig): ) -class TrainingBenchmark(Benchmark): - name: str = "training" - config: TrainingConfig +class TrainingBenchmark(Benchmark[TrainingConfig]): + NAME = "training" def __init__(self): # initialize training results @@ -88,14 +87,14 @@ def run(self, backend: "Backend") -> None: self.training_metrics = { # warmup metrics - "warmup_runtime": trainer_state.warmup_runtime, - "warmup_throughput()": trainer_state.warmup_samples_per_second, + "warmup.runtime(s)": trainer_state.warmup_runtime, + "warmup.throughput(samples/s)": trainer_state.warmup_samples_per_second, # training metrics - "train_runtime": trainer_state.train_runtime, - "training_throughput": trainer_state.train_samples_per_second, + "training.runtime(s)": trainer_state.training_runtime, + "training.throughput(samples/s)": trainer_state.training_samples_per_second, # overall training metrics - "overall_train_runtime": trainer_state.overall_train_runtime, - "overall_training_throughput": trainer_state.overall_train_samples_per_second, + "overall_training.runtime(s)": trainer_state.overall_training_runtime, + "overall_training.throughput(samles/s)": (trainer_state.overall_training_samples_per_second), } def get_results_df(self) -> DataFrame: diff --git a/optimum_benchmark/benchmarks/training_utils.py b/optimum_benchmark/benchmarks/training_utils.py deleted file mode 100644 index 097e06c22..000000000 --- a/optimum_benchmark/benchmarks/training_utils.py +++ /dev/null @@ -1,103 +0,0 @@ -from typing import Any, Dict, TYPE_CHECKING -from dataclasses import dataclass -import time - -from transformers import default_data_collator -from transformers import TrainerCallback - -if TYPE_CHECKING: - from transformers import TrainerState, TrainingArguments, TrainerControl - - -@dataclass -class MeasurementCallback(TrainerCallback): - warmup_steps: int - - def on_train_begin( 
- self, - args: "TrainingArguments", - state: "TrainerState", - control: "TrainerControl", - **kwargs, - ): - if state.max_steps <= self.warmup_steps: - # This check is here because max_steps is set only once the training - # is launched, thus we can not check before calling trainer.train(). - raise ValueError( - f"Total training steps {state.max_steps} is smaller " - "than the number of warmup steps {self.warmup_steps}. " - "Please increase the total number of steps (for example by " - "increasing the dataset size)." - ) - - state.warmup_start = time.time_ns() * 1e-9 - state.overall_train_start = time.time_ns() * 1e-9 - - def on_step_begin( - self, - args: "TrainingArguments", - state: "TrainerState", - control: "TrainerControl", - **kwargs, - ): - if state.global_step == self.warmup_steps: - state.warmup_end = time.time_ns() * 1e-9 - state.training_start = time.time_ns() * 1e-9 - elif state.global_step > state.max_steps - 1: - raise ValueError("global_step > state.max_steps - 1") - - def on_train_end( - self, - args: "TrainingArguments", - state: "TrainerState", - control: "TrainerControl", - **kwargs, - ): - state.training_end = time.time_ns() * 1e-9 - state.overall_train_end = time.time_ns() * 1e-9 - - state.total_train_batch_size = ( - args.train_batch_size * args.gradient_accumulation_steps * args.world_size - ) - - # warmup metrics - state.warmup_runtime = state.warmup_end - state.warmup_start - state.num_warmup_samples = self.warmup_steps * state.total_train_batch_size - state.warmup_samples_per_second = ( - state.num_warmup_samples / state.warmup_runtime - ) - # state.warmup_steps_per_second = self.warmup_steps / state.warmup_runtime - - # training metrics - state.train_runtime = state.training_end - state.training_start - state.num_train_steps = state.max_steps - self.warmup_steps - state.num_train_samples = state.num_train_steps * state.total_train_batch_size - state.train_samples_per_second = state.num_train_samples / state.train_runtime - # state.train_steps_per_second = state.num_train_steps / state.train_runtime - - # overall training metrics - state.overall_train_runtime = state.training_end - state.warmup_start - state.overall_train_samples_per_second = ( - state.num_train_samples / state.overall_train_runtime - ) - # state.overall_train_steps_per_second = ( - # state.num_train_steps / state.overall_train_runtime - # ) - - -def get_data_collator(task: str) -> callable: - if task == "object-detection": - return object_detection_data_collator - else: - return default_data_collator - - -def object_detection_data_collator(batch) -> Dict[str, Any]: - import torch - - pixel_values = torch.stack([example["pixel_values"] for example in batch]) - labels = [example["labels"] for example in batch] - return { - "pixel_values": pixel_values, - "labels": labels, - } diff --git a/optimum_benchmark/benchmarks/utils.py b/optimum_benchmark/benchmarks/utils.py new file mode 100644 index 000000000..973274303 --- /dev/null +++ b/optimum_benchmark/benchmarks/utils.py @@ -0,0 +1,87 @@ +import time +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Callable, Dict + +from transformers import TrainerCallback, default_data_collator + +if TYPE_CHECKING: + from transformers import TrainerControl, TrainerState, TrainingArguments + + +def extract_three_significant_digits(x: float) -> float: + return float(f"{x:.3g}") + + +def three_significant_digits_wrapper(func: Callable[..., float]) -> Callable[..., float]: + def wrapper(*args, **kwargs): + return 
extract_three_significant_digits(func(*args, **kwargs)) + + return wrapper + + +@dataclass +class MeasurementCallback(TrainerCallback): + warmup_steps: int + + def on_train_begin(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): + if state.max_steps <= self.warmup_steps: + # This check is here because max_steps is set only once the training + # is launched, thus we can not check before calling trainer.train(). + raise ValueError( + f"Total training steps {state.max_steps} is smaller " + f"than the number of warmup steps {self.warmup_steps}. " + "Please increase the total number of steps (for example by " + "increasing the dataset size)." + ) + + state.warmup_start = time.time_ns() * 1e-9 + state.overall_training_start = time.time_ns() * 1e-9 + + def on_step_begin(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): + if state.global_step == self.warmup_steps: + state.warmup_end = time.time_ns() * 1e-9 + state.training_start = time.time_ns() * 1e-9 + elif state.global_step > state.max_steps - 1: + raise ValueError("global_step > state.max_steps - 1") + + def on_train_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): + state.training_end = time.time_ns() * 1e-9 + state.overall_training_end = time.time_ns() * 1e-9 + + state.total_training_batch_size = args.train_batch_size * args.gradient_accumulation_steps * args.world_size + + # warmup metrics + state.warmup_runtime = state.warmup_end - state.warmup_start + state.num_warmup_samples = self.warmup_steps * state.total_training_batch_size + state.warmup_samples_per_second = state.num_warmup_samples / state.warmup_runtime + state.warmup_steps_per_second = self.warmup_steps / state.warmup_runtime + + # training metrics + state.training_runtime = state.training_end - state.training_start + state.num_training_steps = state.max_steps - self.warmup_steps + state.num_training_samples = state.num_training_steps * state.total_training_batch_size + state.training_samples_per_second = state.num_training_samples / state.training_runtime + state.training_steps_per_second = state.num_training_steps / state.training_runtime + + # overall training metrics + state.overall_training_runtime = state.training_end - state.warmup_start + state.overall_training_samples_per_second = state.num_training_samples / state.overall_training_runtime + state.overall_training_steps_per_second = state.num_training_steps / state.overall_training_runtime + + +def get_data_collator(task: str) -> callable: + if task == "object-detection": + return object_detection_data_collator + else: + return default_data_collator + + +def object_detection_data_collator(batch) -> Dict[str, Any]: + import torch + + pixel_values = torch.stack([example["pixel_values"] for example in batch]) + labels = [example["labels"] for example in batch] + return { + "pixel_values": pixel_values, + "labels": labels, + } diff --git a/optimum_benchmark/env_utils.py b/optimum_benchmark/env_utils.py new file mode 100644 index 000000000..dd496fb49 --- /dev/null +++ b/optimum_benchmark/env_utils.py @@ -0,0 +1,38 @@ +import platform +import re +import subprocess +from logging import getLogger +from typing import Optional + +import psutil + +LOGGER = getLogger("utils") + + +def bytes_to_mega_bytes(bytes: int) -> int: + # Reference: https://en.wikipedia.org/wiki/Byte#Multiple-byte_units + return int(bytes * 1e-6) + + +def get_cpu() -> Optional[str]: + if platform.system() == "Windows": +
return platform.processor() + + elif platform.system() == "Darwin": + command = "sysctl -n machdep.cpu.brand_string" + return str(subprocess.check_output(command, shell=True).strip()) + + elif platform.system() == "Linux": + command = "cat /proc/cpuinfo" + all_info = subprocess.check_output(command, shell=True).decode().strip() + for line in all_info.split("\n"): + if "model name" in line: + return re.sub(".*model name.*:", "", line, 1) + return "Could not find device name" + + else: + raise ValueError(f"Unknown system '{platform.system()}'") + + +def get_cpu_ram_mb(): + return bytes_to_mega_bytes(psutil.virtual_memory().total) diff --git a/optimum_benchmark/experiment.py b/optimum_benchmark/experiment.py index a33f1026b..85253f80d 100644 --- a/optimum_benchmark/experiment.py +++ b/optimum_benchmark/experiment.py @@ -1,31 +1,28 @@ import os import platform -from typing import Any, Type, Dict +from dataclasses import dataclass, field from logging import getLogger -from dataclasses import dataclass, MISSING, field +from typing import Any, Dict, Type import hydra +from accelerate import __version__ as accelerate_version +from diffusers import __version__ as diffusers_version +from hydra.core.config_store import ConfigStore from hydra.utils import get_class +from omegaconf import DictConfig, OmegaConf, SCMode from optimum.exporters import TasksManager -from omegaconf import DictConfig, OmegaConf -from hydra.core.config_store import ConfigStore -from diffusers import __version__ as diffusers_version -from accelerate import __version__ as accelerate_version from optimum.version import __version__ as optimum_version from transformers import __version__ as transformers_version -from .import_utils import ( - is_torch_available, - is_onnxruntime_available, - is_openvino_available, - is_neural_compressor_available, -) from .backends.base import Backend +from .backends.neural_compressor.config import INCConfig +from .backends.onnxruntime.config import ORTConfig +from .backends.openvino.config import OVConfig +from .backends.pytorch.config import PyTorchConfig from .benchmarks.base import Benchmark -from .utils import get_cpu, get_cpu_ram_mb -from .benchmarks.training import TrainingConfig from .benchmarks.inference import InferenceConfig - +from .benchmarks.training import TrainingConfig +from .env_utils import get_cpu, get_cpu_ram_mb LOGGER = getLogger("experiment") @@ -49,13 +46,13 @@ class ExperimentConfig: benchmark: Any # https://github.com/facebookresearch/hydra/issues/1722#issuecomment-883568386 # EXPERIMENT CONFIGURATION - experiment_name: str = MISSING + experiment_name: str # Model name or path (bert-base-uncased, google/vit-base-patch16-224, ...) - model: str = MISSING + model: str # Device name or path (cpu, cuda, cuda:0, ...) - device: str = MISSING + device: str # Task name (text-classification, image-classification, ...)
- task: str = "${infer_task:${model}, ${hub_kwargs.revision}}" + task: str = "${infer_task:${model},${hub_kwargs.revision}}" # ADDITIONAL MODEL CONFIGURATION: Model revision, use_auth_token, trust_remote_code hub_kwargs: Dict = field( @@ -68,6 +65,7 @@ class ExperimentConfig: ) # ENVIRONMENT CONFIGURATION + # TODO: add gpu info when available environment: Dict = field( default_factory=lambda: { "optimum_version": optimum_version, @@ -86,38 +84,17 @@ class ExperimentConfig: # Register configurations cs = ConfigStore.instance() cs.store(name="experiment", node=ExperimentConfig) - -if is_torch_available(): - from optimum_benchmark.backends.pytorch import PyTorchConfig - - cs.store(group="backend", name="pytorch", node=PyTorchConfig) - -if is_onnxruntime_available(): - from optimum_benchmark.backends.onnxruntime import ORTConfig - - cs.store(group="backend", name="onnxruntime", node=ORTConfig) - -if is_openvino_available(): - from optimum_benchmark.backends.openvino import OVConfig - - cs.store(group="backend", name="openvino", node=OVConfig) - -if is_neural_compressor_available(): - from optimum_benchmark.backends.neural_compressor import INCConfig - - cs.store(group="backend", name="neural_compressor", node=INCConfig) - +cs.store(group="backend", name="pytorch", node=PyTorchConfig) +cs.store(group="backend", name="onnxruntime", node=ORTConfig) +cs.store(group="backend", name="openvino", node=OVConfig) +cs.store(group="backend", name="neural_compressor", node=INCConfig) cs.store(group="benchmark", name="inference", node=InferenceConfig) cs.store(group="benchmark", name="training", node=TrainingConfig) @hydra.main(version_base=None) def run_experiment(experiment: DictConfig) -> None: - from omegaconf import SCMode - - experiment = OmegaConf.to_container( - experiment, structured_config_mode=SCMode.INSTANTIATE - ) + experiment = OmegaConf.to_container(experiment, structured_config_mode=SCMode.INSTANTIATE, resolve=True) # Save the config OmegaConf.save(experiment, "hydra_config.yaml", resolve=True) @@ -130,21 +107,23 @@ def run_experiment(experiment: DictConfig) -> None: # Allocate requested backend backend_factory: Type[Backend] = get_class(experiment.backend._target_) backend: Backend = backend_factory( - experiment.model, - experiment.task, - experiment.device, - experiment.hub_kwargs, + task=experiment.task, + model=experiment.model, + device=experiment.device, + hub_kwargs=experiment.hub_kwargs, ) try: + # Configure the backend backend.configure(experiment.backend) - + # Run the benchmark benchmark.run(backend) # Save the benchmark results benchmark.save() - + # Clean up the backend backend.clean() + except Exception as e: - LOGGER.error("Error during benchmarking: %s", e) + LOGGER.error("Error during experiment: %s", e) backend.clean() raise e diff --git a/optimum_benchmark/generators/dataset_generator.py b/optimum_benchmark/generators/dataset_generator.py index 0d5f00e68..e6a9df36f 100644 --- a/optimum_benchmark/generators/dataset_generator.py +++ b/optimum_benchmark/generators/dataset_generator.py @@ -8,18 +8,13 @@ TaskGenerator, ) - LOGGER = getLogger("dataset_generator") class DatasetGenerator: task_generator: TaskGenerator - def __init__( - self, - task: str, - dataset_shapes: Dict[str, int], - ): + def __init__(self, task: str, dataset_shapes: Dict[str, int]): dataset_shapes["batch_size"] = dataset_shapes.pop("dataset_size") if task in TASKS_TO_GENERATORS: @@ -40,7 +35,7 @@ def generate(self) -> Dataset: task_dataset = self.task_generator.generate() task_dataset = 
Dataset.from_dict(task_dataset) task_dataset.set_format( - type="numpy", + type="torch", # for now we're using pytorch tensors columns=list(task_dataset.features.keys()), ) diff --git a/optimum_benchmark/generators/input_generator.py b/optimum_benchmark/generators/input_generator.py index f384abb23..f9858dac3 100644 --- a/optimum_benchmark/generators/input_generator.py +++ b/optimum_benchmark/generators/input_generator.py @@ -1,9 +1,9 @@ -from typing import Dict, List, Union, Optional, TYPE_CHECKING from logging import getLogger +from typing import TYPE_CHECKING, Dict, List, Optional, Union if TYPE_CHECKING: - from transformers import PretrainedConfig import torch + from transformers import PretrainedConfig from optimum_benchmark.generators.model_type_generator import ( SUPPURTED_MODEL_TYPES, @@ -14,32 +14,28 @@ TaskGenerator, ) - LOGGER = getLogger("input_generator") class InputGenerator: - model_type_generator: Optional[ModelTypeGenerator] = None - task_generator: Optional[TaskGenerator] = None + model_type_generator: Optional[ModelTypeGenerator] + task_generator: Optional[TaskGenerator] def __init__( - self, - task: str, - input_shapes: Dict[str, int], - # for model_type_generator - pretrained_config: Optional["PretrainedConfig"] = None, + self, task: str, input_shapes: Dict[str, int], pretrained_config: Optional["PretrainedConfig"] = None ): - if pretrained_config is not None: + if pretrained_config is not None and pretrained_config.model_type in SUPPURTED_MODEL_TYPES: + self.used_generator = "model_type" model_type = pretrained_config.model_type - if ModelTypeGenerator.check_model_type_support(model_type): - LOGGER.info(f"Using {model_type} model type generator") - self.model_type_generator = ModelTypeGenerator( - task=task, - model_type=model_type, - shapes=input_shapes, - pretrained_config=pretrained_config, - ) + LOGGER.info(f"Using {model_type} model type generator") + self.model_type_generator = ModelTypeGenerator( + task=task, + model_type=model_type, + shapes=input_shapes, + pretrained_config=pretrained_config, + ) elif task in TASKS_TO_GENERATORS: + self.used_generator = "task" LOGGER.info(f"Using {task} task generator") self.task_generator = TASKS_TO_GENERATORS[task]( shapes=input_shapes, @@ -59,18 +55,13 @@ def __init__( # TODO: we can drop the torch dependency here by returning a dict of numpy arrays # and then converting them to torch tensors in backend.prepare_for_inference def generate(self, mode: str) -> Dict[str, Union["torch.Tensor", List[str]]]: - if self.model_type_generator is not None: + if self.used_generator == "model_type": dummy_input = self.model_type_generator.generate() - elif self.task_generator is not None: + elif self.used_generator == "task": dummy_input = self.task_generator.generate() if mode == "generate": - if "input_ids" in dummy_input: - # text input - dummy_input = { - "input_ids": dummy_input["input_ids"], - } - elif "pixel_values" in dummy_input: + if "pixel_values" in dummy_input: # image input dummy_input = { "pixel_values": dummy_input["pixel_values"], @@ -85,5 +76,10 @@ def generate(self, mode: str) -> Dict[str, Union["torch.Tensor", List[str]]]: dummy_input = { "input_features": dummy_input["input_features"], } + elif "input_ids" in dummy_input: + # text input + dummy_input = { + "input_ids": dummy_input["input_ids"], + } return dummy_input diff --git a/optimum_benchmark/generators/model_type_generator.py b/optimum_benchmark/generators/model_type_generator.py index 8ca800ac9..d06b512d7 100644 --- 
a/optimum_benchmark/generators/model_type_generator.py +++ b/optimum_benchmark/generators/model_type_generator.py @@ -1,9 +1,8 @@ -from typing import Dict, List from logging import getLogger +from typing import Dict, List -from transformers import PretrainedConfig from optimum.exporters.tasks import TasksManager - +from transformers import PretrainedConfig LOGGER = getLogger("model_type_generator") @@ -11,8 +10,8 @@ class ModelTypeGenerator: - """ - A wrapper around optimum's TasksManager to generate dummy inputs for a given model type. + """A wrapper around optimum's TasksManager to generate dummy inputs + for a given model type. """ def __init__( @@ -30,29 +29,5 @@ def __init__( model_type=model_type, )(pretrained_config) - @staticmethod - def check_model_type_support(model_type: str) -> bool: - return model_type in SUPPURTED_MODEL_TYPES - def generate(self) -> Dict[str, int]: return self.onnx_config.generate_dummy_inputs(framework="pt", **self.shapes) - - -if __name__ == "__main__": - from transformers import AutoConfig - - pretrained_config = AutoConfig.from_pretrained("gpt2") - - assert ModelTypeGenerator.check_model_type_support("gpt2") - - model_input_generator = ModelTypeGenerator( - task="text-generation", - model_type="gpt2", - shapes={ - "batch_size": 1, - "sequence_length": 100, - }, - pretrained_config=pretrained_config, - ) - - print(model_input_generator.generate()) diff --git a/optimum_benchmark/generators/task_generator.py b/optimum_benchmark/generators/task_generator.py index ec6ad9f62..c63ab565f 100644 --- a/optimum_benchmark/generators/task_generator.py +++ b/optimum_benchmark/generators/task_generator.py @@ -1,10 +1,9 @@ +from abc import ABC from logging import getLogger from typing import Tuple -from abc import ABC import torch - LOGGER = getLogger("task_generator") @@ -377,27 +376,3 @@ def generate(self): "stable-diffusion": PromptGenerator, "stable-diffusion-xl": PromptGenerator, } - - -if __name__ == "__main__": - all_shapes = { - "batch_size": 1, - "sequence_length": 16, - "num_choices": 2, - "feature_size": 80, - "nb_max_frames": 3000, - "audio_sequence_length": 16000, - "height": 224, - "width": 224, - "num_labels": 2, - "num_queries": 2, - "vocab_size": 100, - "type_vocab_size": 2, - "num_channels": 3, - } - - for task in TASKS_TO_GENERATORS: - task_input_generator = TASKS_TO_GENERATORS[task]( - shapes=all_shapes, with_labels=True - ) - print(task_input_generator.generate()) diff --git a/optimum_benchmark/import_utils.py b/optimum_benchmark/import_utils.py index cc97c5397..13dee6171 100644 --- a/optimum_benchmark/import_utils.py +++ b/optimum_benchmark/import_utils.py @@ -3,9 +3,7 @@ _torch_available = importlib.util.find_spec("torch") is not None _onnxruntime_available = importlib.util.find_spec("onnxruntime") is not None _is_openvino_available = importlib.util.find_spec("openvino") is not None -_is_neural_compressor_available = ( - importlib.util.find_spec("neural_compressor") is not None -) +_is_neural_compressor_available = importlib.util.find_spec("neural_compressor") is not None def is_torch_available(): diff --git a/optimum_benchmark/profilers/fx_profiler.py b/optimum_benchmark/profilers/fx_profiler.py index 4d972abd4..8a72f2eba 100644 --- a/optimum_benchmark/profilers/fx_profiler.py +++ b/optimum_benchmark/profilers/fx_profiler.py @@ -1,14 +1,12 @@ -from typing import Any, List, Tuple -from logging import getLogger -import torch import time +from logging import getLogger +from typing import Any, List, Tuple - -from torch.fx.graph_module import 
GraphModule +import torch from torch.fx import Interpreter +from torch.fx.graph_module import GraphModule from torch.fx.node import Node - LOGGER = getLogger("fx_profiler") @@ -18,8 +16,7 @@ def __init__(self, module: GraphModule): self.profiling_records: List[Tuple[str, str, float]] = [] def run(self, *args) -> Any: - return_val = super().run(*args) - return return_val + return super().run(*args) def run_node(self, node: Node) -> Any: if self.module.device.type == "cuda": diff --git a/optimum_benchmark/profilers/ort_profiler.py b/optimum_benchmark/profilers/ort_profiler.py index d7f555e26..030dda323 100644 --- a/optimum_benchmark/profilers/ort_profiler.py +++ b/optimum_benchmark/profilers/ort_profiler.py @@ -1,12 +1,10 @@ -from typing import List, Tuple -from logging import getLogger -import pandas as pd import json +from logging import getLogger +from typing import List, Tuple - +import pandas as pd from optimum.onnxruntime import ORTModel - LOGGER = getLogger("ort_profiler") @@ -26,9 +24,7 @@ def get_profiling_records(self) -> List[Tuple[str, str, float]]: profiling_data = profiling_data["traceEvents"] profiling_records = extract_last_run_records(profiling_data) - profiling_records = normalize_records(profiling_records) - - return profiling_records + return normalize_records(profiling_records) def normalize_records(data) -> List[Tuple[str, str, float]]: diff --git a/optimum_benchmark/report.py b/optimum_benchmark/report.py index 9e12d299e..20a2ac286 100644 --- a/optimum_benchmark/report.py +++ b/optimum_benchmark/report.py @@ -1,30 +1,27 @@ -import pandas as pd -import seaborn as sns +from argparse import ArgumentParser from pathlib import Path -from pandas import DataFrame + import matplotlib.pyplot as plt -from omegaconf import OmegaConf +import pandas as pd +import seaborn as sns from flatten_dict import flatten -from argparse import ArgumentParser - -from rich.table import Table +from omegaconf import OmegaConf +from pandas import DataFrame from rich.console import Console +from rich.table import Table from rich.terminal_theme import MONOKAI def gather_inference_report(root_folder: Path) -> DataFrame: # key is path to inference file as string, value is dataframe inference_dfs = { - f.parent.absolute().as_posix(): pd.read_csv(f) - for f in root_folder.glob("**/inference_results.csv") + f.parent.absolute().as_posix(): pd.read_csv(f) for f in root_folder.glob("**/inference_results.csv") } # key is path to config file as string, value is flattened dict config_dfs = { f.parent.absolute() - .as_posix(): pd.DataFrame.from_dict( - flatten(OmegaConf.load(f), reducer="dot"), orient="index" - ) + .as_posix(): pd.DataFrame.from_dict(flatten(OmegaConf.load(f), reducer="dot"), orient="index") .T for f in root_folder.glob("**/hydra_config.yaml") if f.parent.absolute().as_posix() in inference_dfs.keys() @@ -35,8 +32,7 @@ def gather_inference_report(root_folder: Path) -> DataFrame: # Merge inference and config dataframes inference_reports = [ - config_dfs[name].merge(inference_dfs[name], left_index=True, right_index=True) - for name in inference_dfs.keys() + config_dfs[name].merge(inference_dfs[name], left_index=True, right_index=True) for name in inference_dfs.keys() ] # Concatenate all reports @@ -82,9 +78,7 @@ def format_row(row, style=""): return formated_row -def get_inference_rich_table( - inference_report, with_baseline=False, with_generate=False, title="" -): +def get_inference_rich_table(inference_report, with_baseline=False, with_generate=False, title=""): perf_columns = [ 
"forward.latency(s)", "forward.throughput(samples/s)", @@ -107,17 +101,12 @@ def get_inference_rich_table( additional_columns = [ col for col in inference_report.columns - if inference_report[col].nunique() > 1 - and "backend" in col - and "_target_" not in col - and "version" not in col + if inference_report[col].nunique() > 1 and "backend" in col and "_target_" not in col and "version" not in col ] # display interesting columns in multilevel hierarchy display_report = inference_report[additional_columns + perf_columns] - display_report.columns = pd.MultiIndex.from_tuples( - [tuple(col.split(".")) for col in display_report.columns] - ) + display_report.columns = pd.MultiIndex.from_tuples([tuple(col.split(".")) for col in display_report.columns]) # create rich table rich_table = Table(show_header=True, title=title, show_lines=True) @@ -177,9 +166,7 @@ def get_inference_plots(report, with_baseline=False, with_generate=False, subtit ax=ax2, width=0.5, ) - ax2.set_xticklabels( - ax2.get_xticklabels(), rotation=45, horizontalalignment="right" - ) + ax2.set_xticklabels(ax2.get_xticklabels(), rotation=45, horizontalalignment="right") ax2.set_xlabel("Experiment") ax2.set_ylabel("Generate Throughput (tokens/s)") ax2.set_title("Generate Throughput by Experiment" + "\n" + subtitle) @@ -199,9 +186,7 @@ def get_inference_plots(report, with_baseline=False, with_generate=False, subtit if with_generate: # add speedup text on top of each bar - baseline_generate_throughput = report["generate.throughput(tokens/s)"].iloc[ - -1 - ] + baseline_generate_throughput = report["generate.throughput(tokens/s)"].iloc[-1] for p in ax2.patches: speedup = (p.get_height() / baseline_generate_throughput - 1) * 100 ax2.annotate( @@ -210,9 +195,7 @@ def get_inference_plots(report, with_baseline=False, with_generate=False, subtit ha="center", va="center", ) - ax2.set_title( - "Generate Throughput and Speedup by Experiment" + "\n" + subtitle - ) + ax2.set_title("Generate Throughput and Speedup by Experiment" + "\n" + subtitle) return fig1, fig2 @@ -220,16 +203,12 @@ def get_inference_plots(report, with_baseline=False, with_generate=False, subtit def compute_speedup(report, with_generate=False): # compute speedup for each experiment compared to baseline report["forward.speedup(%)"] = ( - report["forward.throughput(samples/s)"] - / report["forward.throughput(samples/s)"].iloc[-1] - - 1 + report["forward.throughput(samples/s)"] / report["forward.throughput(samples/s)"].iloc[-1] - 1 ) * 100 if with_generate: report["generate.speedup(%)"] = ( - report["generate.throughput(tokens/s)"] - / report["generate.throughput(tokens/s)"].iloc[-1] - - 1 + report["generate.throughput(tokens/s)"] / report["generate.throughput(tokens/s)"].iloc[-1] - 1 ) * 100 return report @@ -267,15 +246,11 @@ def generate_report(): report_name = args.report_name # gather experiments reports - inference_experiments = [ - gather_inference_report(experiment) for experiment in experiments_folders - ] + inference_experiments = [gather_inference_report(experiment) for experiment in experiments_folders] inference_report = pd.concat(inference_experiments, axis=0) # sort by forward throughput - inference_report.sort_values( - by="forward.throughput(samples/s)", ascending=False, inplace=True - ) + inference_report.sort_values(by="forward.throughput(samples/s)", ascending=False, inplace=True) # some flags with_baseline = baseline_folder is not None @@ -284,9 +259,7 @@ def generate_report(): if with_baseline: # gather baseline report inference_baseline = 
gather_inference_report(baseline_folder) - assert ( - inference_baseline.shape[0] == 1 - ), "baseline folder should contain only one experiment" + assert inference_baseline.shape[0] == 1, "baseline folder should contain only one experiment" # add baseline to experiment inference_report = pd.concat([inference_report, inference_baseline], axis=0) # compute speedup compared to baseline @@ -302,17 +275,13 @@ def generate_report(): Path(reporting_directory).mkdir(exist_ok=True, parents=True) # rich table - rich_table = get_inference_rich_table( - inference_report, with_baseline, with_generate, report_name - ) + rich_table = get_inference_rich_table(inference_report, with_baseline, with_generate, report_name) console = Console(record=True) console.print(rich_table, justify="left", no_wrap=True) console.save_svg(f"{reporting_directory}/rich_table.svg", theme=MONOKAI) # plots - forward_fig, generate_fig = get_inference_plots( - inference_report, with_baseline, with_generate, report_name - ) + forward_fig, generate_fig = get_inference_plots(inference_report, with_baseline, with_generate, report_name) forward_fig.tight_layout() forward_fig.savefig(f"{reporting_directory}/forward_throughput.png") diff --git a/optimum_benchmark/task_utils.py b/optimum_benchmark/task_utils.py new file mode 100644 index 000000000..de2c95f45 --- /dev/null +++ b/optimum_benchmark/task_utils.py @@ -0,0 +1,39 @@ +DIFFUSION_TASKS = [ + "stable-diffusion", + "stable-diffusion-xl", +] + +TEXT_GENERATION_TASKS = [ + "image-to-text", + "text-generation", + "text2text-generation", + "automatic-speech-recognition", +] + +# let's leave this here for now, it's a good list of tasks supported by transformers +ALL_TASKS = [ + "conversational", + "feature-extraction", + "fill-mask", + "text-generation", + "text2text-generation", + "text-classification", + "token-classification", + "multiple-choice", + "object-detection", + "question-answering", + "image-classification", + "image-segmentation", + "mask-generation", + "masked-im", + "semantic-segmentation", + "automatic-speech-recognition", + "audio-classification", + "audio-frame-classification", + "audio-xvector", + "image-to-text", + "stable-diffusion", + "stable-diffusion-xl", + "zero-shot-image-classification", + "zero-shot-object-detection", +] diff --git a/optimum_benchmark/trackers/latency.py b/optimum_benchmark/trackers/latency.py index 98dc93067..d742a98e1 100644 --- a/optimum_benchmark/trackers/latency.py +++ b/optimum_benchmark/trackers/latency.py @@ -1,9 +1,9 @@ +import time from contextlib import contextmanager from logging import getLogger from typing import List -import torch -import time +import torch LOGGER = getLogger("latency_tracker") @@ -59,9 +59,7 @@ def __init__(self, backend): self.hf_device_map = None self.end_device = self.device if self.device.type == "cuda": - self.device_indexes = { - self.device.index if self.device.index is not None else 0 - } + self.device_indexes = {self.device.index if self.device.index is not None else 0} def _cuda_latency(self): start_event = torch.cuda.Event(enable_timing=True) diff --git a/optimum_benchmark/trackers/memory.py b/optimum_benchmark/trackers/memory.py index c126321a6..b8aa1a199 100644 --- a/optimum_benchmark/trackers/memory.py +++ b/optimum_benchmark/trackers/memory.py @@ -1,13 +1,13 @@ -from multiprocessing.connection import Connection -from multiprocessing import Pipe, Process +import os from contextlib import contextmanager from logging import getLogger +from multiprocessing import Pipe, Process +from 
multiprocessing.connection import Connection + import psutil import torch -import os - -from optimum_benchmark.utils import bytes_to_mega_bytes +from ..env_utils import bytes_to_mega_bytes LOGGER = getLogger("memory_tracker") @@ -32,15 +32,13 @@ def _track_cuda_peak_memory(self): nvml.nvmlInit() handle = nvml.nvmlDeviceGetHandleByIndex( - self.device.index - if self.device.index is not None - else torch.cuda.current_device() + self.device.index if self.device.index is not None else torch.cuda.current_device() ) yield meminfo = nvml.nvmlDeviceGetMemoryInfo(handle) nvml.nvmlShutdown() - # At least for PyTorch, relying on meminfo.used is fine + # At least for PyTorch, relying on meminfo.used is fine # here as PyTorch does not deallocate its cache after running forward. self.peak_memory = max(self.peak_memory, meminfo.used) LOGGER.debug(f"Peak memory usage: {self.get_peak_memory()} MB") @@ -48,9 +46,7 @@ def _track_cuda_peak_memory(self): def _track_cpu_peak_memory(self, interval: float): child_connection, parent_connection = Pipe() # instantiate process - mem_process: Process = PeakMemoryMeasureProcess( - os.getpid(), child_connection, interval - ) + mem_process: Process = PeakMemoryMeasureProcess(os.getpid(), child_connection, interval) mem_process.start() # wait until we get memory parent_connection.recv() @@ -76,9 +72,7 @@ def run(self): while True: process = psutil.Process(self.process_id) - meminfo_attr = ( - "memory_info" if hasattr(process, "memory_info") else "get_memory_info" - ) + meminfo_attr = "memory_info" if hasattr(process, "memory_info") else "get_memory_info" memory = getattr(process, meminfo_attr)()[0] self.mem_usage = max(self.mem_usage, memory) @@ -99,9 +93,7 @@ def __init__(self, backend): self.hf_device_map = backend.pretrained_model.hf_device_map self.device_indexes = set(self.hf_device_map.values()) else: - self.device_indexes = { - self.device.index if self.device.index is not None else 0 - } + self.device_indexes = {self.device.index if self.device.index is not None else 0} # This variable is used only when CUDA device is used. 
self.peak_per_device = [0 for _ in range(len(self.device_indexes))] diff --git a/optimum_benchmark/utils.py b/optimum_benchmark/utils.py deleted file mode 100644 index 001c2f38f..000000000 --- a/optimum_benchmark/utils.py +++ /dev/null @@ -1,195 +0,0 @@ -from typing import Optional, List -from logging import getLogger -import subprocess -import platform -import random -import signal -import time -import re -import os - -import numpy as np -import psutil - -LOGGER = getLogger("utils") - - -def set_seed(seed: int) -> None: - random.seed(seed) - np.random.seed(seed) - os.environ["PYTHONHASHSEED"] = str(seed) - - -def bytes_to_mega_bytes(bytes: int) -> int: - # Reference: https://en.wikipedia.org/wiki/Byte#Multiple-byte_units - return int(bytes * 1e-6) - - -def get_cpu() -> Optional[str]: - if platform.system() == "Windows": - return platform.processor() - - elif platform.system() == "Darwin": - os.environ["PATH"] = os.environ["PATH"] + os.pathsep + "/usr/sbin" - command = "sysctl -n machdep.cpu.brand_string" - return str(subprocess.check_output(command).strip()) - - elif platform.system() == "Linux": - command = "cat /proc/cpuinfo" - all_info = subprocess.check_output(command, shell=True).decode().strip() - for line in all_info.split("\n"): - if "model name" in line: - return re.sub(".*model name.*:", "", line, 1) - return "Could not find device name" - - else: - raise ValueError(f"Unknown system '{platform.system()}'") - - -def get_cpu_ram_mb(): - return bytes_to_mega_bytes(psutil.virtual_memory().total) - - -def check_no_process_is_running_on_cuda_device(device_ids: List[int]) -> None: - """ - Raises a RuntimeError if any process is running on the given cuda device. - """ - - for device_id in device_ids: - # get list of all PIDs running on nvidia devices - pids = [ - int(pid) - for pid in subprocess.check_output( - ["nvidia-smi", "--query-compute-apps=pid", "--format=csv,noheader"] - ) - .decode() - .strip() - .split("\n") - if pid != "" - ] - - # get list of PIDs running on cuda device_id - pids_on_device_id = set( - [ - pid - for pid in pids - if subprocess.check_output( - [ - "nvidia-smi", - "--query-compute-apps=pid,used_memory", - "--format=csv,noheader,nounits", - f"--id={device_id}", - ] - ) - .decode() - .startswith(f"{pid},") - ] - ) - - # TODO: It would be safer to run each run of a sweep in a subprocess. - # Although we can trust PyTorch to clear GPU memory when asked, - # it is not a safe assumption to make for all backends. - if len(pids_on_device_id) > 1 or ( - len(pids_on_device_id) == 1 and os.getpid() not in pids_on_device_id - ): - raise RuntimeError( - f"Expected no processes on device {device_id}, " - f"found {len(pids_on_device_id)} processes " - f"with PIDs {pids_on_device_id}." - ) - - -def check_only_this_process_is_running_on_cuda_device( - device_ids: List[int], pid -) -> None: - """ - Raises a RuntimeError if at any point in time, there is a process running - on the given cuda device that is not the current process. 
- """ - - while True: - # get list of all PIDs running on nvidia devices - pids = [ - int(pid) - for pid in subprocess.check_output( - ["nvidia-smi", "--query-compute-apps=pid", "--format=csv,noheader"] - ) - .decode() - .strip() - .split("\n") - if pid != "" - ] - - for device_id in device_ids: - # get list of PIDs running on cuda device_id - pids_on_device_id = set( - [ - pid - for pid in pids - if subprocess.check_output( - [ - "nvidia-smi", - "--query-compute-apps=pid,used_memory", - "--format=csv,noheader,nounits", - f"--id={device_id}", - ] - ) - .decode() - .startswith(f"{pid},") - ] - ) - - # check if there is a process running on - # device_id that is not the current process - if len(pids_on_device_id) > 1: - os.kill(pid, signal.SIGTERM) - raise RuntimeError( - f"Expected only process {pid} on device {device_id}, " - f"found {len(pids_on_device_id)} processes " - f"with PIDs {pids_on_device_id}." - ) - - # sleep for 1 second - time.sleep(1) - - -DIFFUSION_TASKS = [ - "stable-diffusion", - "stable-diffusion-xl", -] - - -TEXT_GENERATION_TASKS = [ - "text-generation", - "text2text-generation", - "image-to-text", - "automatic-speech-recognition", -] - -# let's leave this here for now, it's a good list of tasks supported by transformers -ALL_TASKS = [ - "conversational", - "feature-extraction", - "fill-mask", - "text-generation", - "text2text-generation", - "text-classification", - "token-classification", - "multiple-choice", - "object-detection", - "question-answering", - "image-classification", - "image-segmentation", - "mask-generation", - "masked-im", - "semantic-segmentation", - "automatic-speech-recognition", - "audio-classification", - "audio-frame-classification", - "audio-xvector", - "image-to-text", - "stable-diffusion", - "stable-diffusion-xl", - "zero-shot-image-classification", - "zero-shot-object-detection", -] diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000..5f0be540b --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,22 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +[tool.black] +line-length = 119 +target-version = ['py37'] + +[tool.ruff] +# Never enforce `E501` and `C901` as they are too strict +ignore = ["E501", "C901"] +select = ["C", "E", "F", "I", "W"] diff --git a/requirements.txt b/requirements.txt index 60f2cf9f2..84e842f5b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,6 +11,7 @@ git+https://github.com/huggingface/diffusers.git omegaconf==2.3.0 hydra-core==1.3.2 hydra_colorlog==1.2.0 +hydra-joblib-launcher==1.2.0 # system py3nvml diff --git a/setup.py b/setup.py index ac2095688..cd83b6f79 100644 --- a/setup.py +++ b/setup.py @@ -1,11 +1,9 @@ -from setuptools import setup, find_packages - +from setuptools import find_packages, setup setup( name="optimum-benchmark", version="0.0.1", packages=find_packages(), - # add pytest as for optimum-benchmark[test] extras_require={ "test": ["pytest"], }, diff --git a/tests/configs/base_config.yaml b/tests/configs/base_config.yaml index 517f15c8b..c9806202f 100644 --- a/tests/configs/base_config.yaml +++ b/tests/configs/base_config.yaml @@ -1,4 +1,4 @@ -# This is a base config file that can potentially be used for all experiments +# This is a base config file that can potentially be used for all tests defaults: - backend: pytorch # default backend - benchmark: inference # default benchmark @@ -6,18 +6,29 @@ defaults: - _self_ # for hydra 1.1 compatibility - override hydra/job_logging: colorlog # colorful logging - override hydra/hydra_logging: colorlog # colorful logging + - override hydra/launcher: joblib # joblib launcher # hydra behavior configuration hydra: run: - dir: runs/${experiment_name} # where to save a run's output + # TODO: put the results somewhere after the workflow is done + dir: tests/runs/${experiment_name} # where to save a run's output sweep: - dir: sweeps/${experiment_name} # where to save a sweep's output + dir: tests/sweeps/${experiment_name} # where to save a sweep's output job: # we change the working directory during the run/sweep directory # this is useful for saving outputs in a separate directory chdir: true + launcher: + # we set the number of jobs to 2 since when using 1, joblib reuses the same process + n_jobs: 2 + prefer: processes + backend: multiprocessing + sweeper: + # now we force the sweeper to run one job at a time, achieving sequential isolation + max_batch_size: 1 backend: + # we turn off isolation checks because tests run on shared resources initial_isolation_check: false continous_isolation_check: false diff --git a/tests/configs/distributed_cuda_pytorch_inference_gpt2.yaml b/tests/configs/distributed_cuda_pytorch_inference_gpt2.yaml index c79b910a6..9524d0aec 100644 --- a/tests/configs/distributed_cuda_pytorch_inference_gpt2.yaml +++ b/tests/configs/distributed_cuda_pytorch_inference_gpt2.yaml @@ -4,7 +4,9 @@ defaults: experiment_name: distributed_cuda_pytorch_inference_gpt2 +# tiny-gpt2 fails probably because it's just too small to distribute model: gpt2 +task: text-generation device: cuda backend: @@ -14,6 +16,3 @@ hydra: job: env_set: CUDA_VISIBLE_DEVICES: 0,1 - sweeper: - params: - benchmark.input_shapes.batch_size: 1,2,4 diff --git a/tests/configs/distributed_cuda_pytorch_training_bert_ddp.yaml b/tests/configs/distributed_cuda_pytorch_training_bert_ddp.yaml index 2db9b8661..c2ea41614 100644 --- a/tests/configs/distributed_cuda_pytorch_training_bert_ddp.yaml +++ b/tests/configs/distributed_cuda_pytorch_training_bert_ddp.yaml @@ -4,16 +4,20 @@ defaults: - override benchmark: training experiment_name: distributed_cuda_pytorch_training_bert_ddp + 
+model: hf-internal-testing/tiny-random-bert task: text-classification -model: bert-base-uncased device: cuda backend: use_ddp: true + ddp_config: + # let's not use the default port to avoid network conflicts + rdzv_endpoint: 127.0.0.1:29509 benchmark: dataset_shapes: - dataset_size: 120 + dataset_size: 1200 sequence_length: 256 training_arguments: per_device_train_batch_size: 32 diff --git a/tests/configs/distributed_cuda_pytorch_training_bert_dp.yaml b/tests/configs/distributed_cuda_pytorch_training_bert_dp.yaml index 71bd5b7e9..a1996f235 100644 --- a/tests/configs/distributed_cuda_pytorch_training_bert_dp.yaml +++ b/tests/configs/distributed_cuda_pytorch_training_bert_dp.yaml @@ -5,7 +5,7 @@ defaults: experiment_name: distributed_cuda_pytorch_training_bert_dp -model: bert-base-uncased +model: hf-internal-testing/tiny-random-bert task: text-classification device: cuda diff --git a/tests/test_cli.py b/tests/test_cli.py index f9d4d39f7..c0a288ad0 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,28 +1,15 @@ import os -import pytest import subprocess -from omegaconf import OmegaConf +import pytest SINGLE_DEVICE_RUNS = [ - config - for config in os.listdir("tests/configs") - if config.endswith(".yaml") - and config != "base_config.yaml" - and "distributed" not in config -] - -DISTRIBUTED_RUNS = [ - config - for config in os.listdir("tests/configs") - if config.endswith(".yaml") - and config != "base_config.yaml" - and "distributed" in config + config for config in os.listdir("tests/configs") if config.endswith(".yaml") and config != "base_config.yaml" ] @pytest.mark.parametrize("config_file", SINGLE_DEVICE_RUNS) -def test_single_device_runs(config_file): +def test_configs(config_file): config_name = config_file.split(".")[0] result = subprocess.run( @@ -32,31 +19,10 @@ def test_single_device_runs(config_file): "tests/configs", "--config-name", config_name, + # "--multirun", + # TODO: might be worth removing names from yaml configs and have a list of test models here ], capture_output=True, ) assert result.returncode == 0, result.stderr.decode("utf-8") - - -@pytest.mark.parametrize("config_file", DISTRIBUTED_RUNS) -def test_distributed_runs(config_file): - config_name = config_file.split(".")[0] - - env_set = OmegaConf.load(f"tests/configs/{config_file}")["hydra"]["job"]["env_set"] - my_env = os.environ.copy() - my_env.update(env_set) - - result = subprocess.run( - [ - "optimum-benchmark", - "--config-dir", - "tests/configs", - "--config-name", - config_name, - ], - capture_output=True, - env=my_env, - ) - - assert result.returncode == 0, result.stderr.decode("utf-8") From 0b80878ed3053cfd2ce821538def0cb5cd9b6135 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 28 Aug 2023 05:50:57 +0200 Subject: [PATCH 5/8] use to_object for safety and update main_export function --- .../backends/neural_compressor/config.py | 6 +- .../backends/onnxruntime/backend.py | 8 +- .../backends/onnxruntime/config.py | 16 +-- optimum_benchmark/backends/openvino/config.py | 4 +- optimum_benchmark/backends/optimum_utils.py | 105 +++++++++++++++++- optimum_benchmark/backends/pytorch/config.py | 8 +- optimum_benchmark/benchmarks/inference.py | 4 +- optimum_benchmark/benchmarks/training.py | 2 +- optimum_benchmark/experiment.py | 28 +++-- 9 files changed, 139 insertions(+), 42 deletions(-) diff --git a/optimum_benchmark/backends/neural_compressor/config.py b/optimum_benchmark/backends/neural_compressor/config.py index 1a1fcb845..1108a000c 100644 --- 
a/optimum_benchmark/backends/neural_compressor/config.py +++ b/optimum_benchmark/backends/neural_compressor/config.py @@ -76,13 +76,11 @@ class INCConfig(BackendConfig): def __post_init__(self): if self.ptq_quantization: - self.ptq_quantization_config = OmegaConf.to_container( + self.ptq_quantization_config = OmegaConf.to_object( OmegaConf.merge(PTQ_QUANTIZATION_CONFIG, self.ptq_quantization_config) ) if self.ptq_quantization_config["approach"] == "static" and not self.calibration: raise ValueError("Calibration must be enabled when using static quantization.") if self.calibration: - self.calibration_config = OmegaConf.to_container( - OmegaConf.merge(CALIBRATION_CONFIG, self.calibration_config) - ) + self.calibration_config = OmegaConf.to_object(OmegaConf.merge(CALIBRATION_CONFIG, self.calibration_config)) diff --git a/optimum_benchmark/backends/onnxruntime/backend.py b/optimum_benchmark/backends/onnxruntime/backend.py index a77ad8ad9..79ba0d9fb 100644 --- a/optimum_benchmark/backends/onnxruntime/backend.py +++ b/optimum_benchmark/backends/onnxruntime/backend.py @@ -198,10 +198,14 @@ def optimize_onnx_files(self) -> None: LOGGER.info("\t+ Processing optimization config") if self.config.auto_optimization is not None: optimization_config = AutoOptimizationConfig.with_optimization_level( - optimization_level=self.config.auto_optimization, **self.config.auto_optimization_config + optimization_level=self.config.auto_optimization, + for_gpu=self.device.type == "cuda", + **self.config.auto_optimization_config, ) elif self.config.optimization: - optimization_config = OptimizationConfig(**self.config.optimization_config) + optimization_config = OptimizationConfig( + optimize_for_gpu=self.device.type == "cuda", **self.config.optimization_config + ) LOGGER.info("\t+ Creating optimizer") optimizer = ORTOptimizer.from_pretrained(self.model, file_names=self.onnx_files_names) LOGGER.info("\t+ Optimizing ORTModel") diff --git a/optimum_benchmark/backends/onnxruntime/config.py b/optimum_benchmark/backends/onnxruntime/config.py index decf5d482..3564a0cbf 100644 --- a/optimum_benchmark/backends/onnxruntime/config.py +++ b/optimum_benchmark/backends/onnxruntime/config.py @@ -42,7 +42,6 @@ def onnxruntime_version(): OPTIMIZATION_CONFIG = { "optimization_level": 1, # 0, 1, 2, 99 - "optimize_for_gpu": "${is_gpu:${device}}", "fp16": False, "enable_transformers_specific_optimizations": True, "enable_gelu_approximation": False, @@ -64,8 +63,7 @@ def onnxruntime_version(): } AUTO_OPTIMIZATION_CONFIG = { - "for_gpu": "${is_gpu:${device}}", - # full auto optimization config depends on the level so we keep it minimal + # auto optimization config depends on the level so we keep it minimal } QUANTIZATION_CONFIG = { @@ -153,11 +151,11 @@ def __post_init__(self): raise NotImplementedError("Can't convert an exported model's weights to a different dtype.") if self.optimization: - self.optimization_config = OmegaConf.to_container( + self.optimization_config = OmegaConf.to_object( OmegaConf.merge(OPTIMIZATION_CONFIG, self.optimization_config) ) if self.quantization: - self.quantization_config = OmegaConf.to_container( + self.quantization_config = OmegaConf.to_object( OmegaConf.merge(QUANTIZATION_CONFIG, self.quantization_config) ) # raise ValueError if the quantization is static but calibration is not enabled @@ -167,11 +165,11 @@ def __post_init__(self): ) if self.auto_optimization is not None: - self.auto_optimization_config = OmegaConf.to_container( + self.auto_optimization_config = OmegaConf.to_object( 
OmegaConf.merge(AUTO_OPTIMIZATION_CONFIG, self.auto_optimization_config) ) if self.auto_quantization is not None: - self.auto_quantization_config = OmegaConf.to_container( + self.auto_quantization_config = OmegaConf.to_object( OmegaConf.merge(AUTO_QUANTIZATION_CONFIG, self.auto_quantization_config) ) if self.auto_quantization_config["is_static"] and not self.calibration: @@ -180,6 +178,4 @@ def __post_init__(self): ) if self.calibration: - self.calibration_config = OmegaConf.to_container( - OmegaConf.merge(CALIBRATION_CONFIG, self.calibration_config) - ) + self.calibration_config = OmegaConf.to_object(OmegaConf.merge(CALIBRATION_CONFIG, self.calibration_config)) diff --git a/optimum_benchmark/backends/openvino/config.py b/optimum_benchmark/backends/openvino/config.py index e54c2aefd..1f6a49aea 100644 --- a/optimum_benchmark/backends/openvino/config.py +++ b/optimum_benchmark/backends/openvino/config.py @@ -53,12 +53,12 @@ class OVConfig(BackendConfig): def __post_init__(self): if self.quantization: - self.quantization_config = OmegaConf.to_container( + self.quantization_config = OmegaConf.to_object( OmegaConf.merge(QUANTIZATION_CONFIG, self.quantization_config) ) if not self.calibration: raise ValueError("OpenVINO quantization requires enabling calibration.") else: - self.calibration_config = OmegaConf.to_container( + self.calibration_config = OmegaConf.to_object( OmegaConf.merge(CALIBRATION_CONFIG, self.calibration_config) ) diff --git a/optimum_benchmark/backends/optimum_utils.py b/optimum_benchmark/backends/optimum_utils.py index a064cba08..27cb3da55 100644 --- a/optimum_benchmark/backends/optimum_utils.py +++ b/optimum_benchmark/backends/optimum_utils.py @@ -18,6 +18,7 @@ export_models, is_torch_available, logger, + maybe_load_preprocessors, maybe_save_preprocessors, ) @@ -37,7 +38,7 @@ def main_export( fp16: Optional[bool] = False, optimize: Optional[str] = None, monolith: bool = False, - # no_post_process: bool = False, + no_post_process: bool = False, framework: Optional[str] = None, atol: Optional[float] = None, cache_dir: Optional[str] = None, @@ -49,16 +50,101 @@ def main_export( local_files_only: bool = False, use_auth_token: Optional[Union[bool, str]] = None, for_ort: bool = False, - # do_validation: bool = True, + do_validation: bool = True, model_kwargs: Optional[Dict[str, Any]] = None, custom_onnx_configs: Optional[Dict[str, "OnnxConfig"]] = None, fn_get_submodels: Optional[Callable] = None, - # use_subprocess: bool = False, + use_subprocess: bool = False, + _variant: str = "default", ######################################## model: Optional["PreTrainedModel"] = None, ######################################## **kwargs_shapes, ): + """ + Full-suite ONNX export. + + Args: + > Required parameters + + model_name_or_path (`str`): + Model ID on huggingface.co or path on disk to the model repository to export. + output (`Union[str, Path]`): + Path indicating the directory where to store the generated ONNX model. + + > Optional parameters + + task (`Optional[str]`, defaults to `None`): + The task to export the model for. If not specified, the task will be auto-inferred based on the model. For decoder models, + use `xxx-with-past` to export the model using past key values in the decoder. + opset (`Optional[int]`, defaults to `None`): + If specified, ONNX opset version to export the model with. Otherwise, the default opset for the given model architecture + will be used. + device (`str`, defaults to `"cpu"`): + The device to use to do the export. Defaults to "cpu". 
+ fp16 (`Optional[bool]`, defaults to `"False"`): + Use half precision during the export. PyTorch-only, requires `device="cuda"`. + optimize (`Optional[str]`, defaults to `None`): + Allows to run ONNX Runtime optimizations directly during the export. Some of these optimizations are specific to + ONNX Runtime, and the resulting ONNX will not be usable with other runtime as OpenVINO or TensorRT. + Available options: `"O1", "O2", "O3", "O4"`. Reference: [`~optimum.onnxruntime.AutoOptimizationConfig`] + monolith (`bool`, defaults to `False`): + Forces to export the model as a single ONNX file. + no_post_process (`bool`, defaults to `False`): + Allows to disable any post-processing done by default on the exported ONNX models. + framework (`Optional[str]`, defaults to `None`): + The framework to use for the ONNX export (`"pt"` or `"tf"`). If not provided, will attempt to automatically detect + the framework for the checkpoint. + atol (`Optional[float]`, defaults to `None`): + If specified, the absolute difference tolerance when validating the model. Otherwise, the default atol for the model will be used. + cache_dir (`Optional[str]`, defaults to `None`): + Path indicating where to store cache. The default Hugging Face cache path will be used by default. + trust_remote_code (`bool`, defaults to `False`): + Allows to use custom code for the modeling hosted in the model repository. This option should only be set for repositories + you trust and in which you have read the code, as it will execute on your local machine arbitrary code present in the + model repository. + pad_token_id (`Optional[int]`, defaults to `None`): + This is needed by some models, for some tasks. If not provided, will attempt to use the tokenizer to guess it. + subfolder (`str`, defaults to `""`): + In case the relevant files are located inside a subfolder of the model repo either locally or on huggingface.co, you can + specify the folder name here. + revision (`str`, defaults to `"main"`): + Revision is the specific model version to use. It can be a branch name, a tag name, or a commit id. + force_download (`bool`, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + local_files_only (`Optional[bool]`, defaults to `False`): + Whether or not to only look at local files (i.e., do not try to download the model). + use_auth_token (`Optional[str]`, defaults to `None`): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `transformers-cli login` (stored in `~/.huggingface`). + model_kwargs (`Optional[Dict[str, Any]]`, defaults to `None`): + Experimental usage: keyword arguments to pass to the model during + the export. This argument should be used along the `custom_onnx_configs` argument + in case, for example, the model inputs/outputs are changed (for example, if + `model_kwargs={"output_attentions": True}` is passed). + custom_onnx_configs (`Optional[Dict[str, OnnxConfig]]`, defaults to `None`): + Experimental usage: override the default ONNX config used for the given model. This argument may be useful for advanced users that desire a finer-grained control on the export. An example is available [here](https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model). + fn_get_submodels (`Optional[Callable]`, defaults to `None`): + Experimental usage: Override the default submodels that are used at the export. 
This is + especially useful when exporting a custom architecture that needs to split the ONNX (e.g. encoder-decoder). If unspecified with custom models, optimum will try to use the default submodels used for the given task, with no guarantee of success. + use_subprocess (`bool`): + Do the ONNX exported model validation in subprocesses. This is especially useful when + exporting on CUDA device, where ORT does not release memory at inference session + destruction. When set to `True`, the `main_export` call should be guarded in + `if __name__ == "__main__":` block. + _variant (`str`, defaults to `default`): + Specify the variant of the ONNX export to use. + **kwargs_shapes (`Dict`): + Shapes to use during inference. This argument allows to override the default shapes used during the ONNX export. + + Example usage: + ```python + >>> from optimum.exporters.onnx import main_export + + >>> main_export("gpt2", output="gpt2_onnx/") + ``` + """ if optimize == "O4" and device != "cuda": raise ValueError( "Requested O4 optimization, but this optimization requires to do the export on GPU." @@ -180,6 +266,11 @@ def main_export( possible_synonyms = "" logger.info(f"Automatic task detection to {task}{possible_synonyms}.") + # The preprocessors are loaded as they may be useful to export the model. Notably, some of the static input shapes may be stored in the + # preprocessors config. + preprocessors = maybe_load_preprocessors( + model_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code + ) onnx_config, models_and_onnx_configs = _get_submodels_and_onnx_configs( model=model, task=task, @@ -187,6 +278,8 @@ def main_export( custom_onnx_configs=custom_onnx_configs if custom_onnx_configs is not None else {}, custom_architecture=custom_architecture, fn_get_submodels=fn_get_submodels, + preprocessors=preprocessors, + _variant=_variant, ) if not is_stable_diffusion: @@ -274,6 +367,7 @@ def main_export( dtype="fp16" if fp16 is True else None, model_kwargs=model_kwargs, ) + # for the post processing later we don't wanna keep models if len(models_and_onnx_configs) == 2: models_and_onnx_configs = { @@ -291,8 +385,7 @@ def main_export( return onnx_config, models_and_onnx_configs # if optimize is not None: - # from optimum.onnxruntime import ORTOptimizer - # from optimum.onnxruntime.configuration import AutoOptimizationConfig + # from ...onnxruntime import AutoOptimizationConfig, ORTOptimizer # if onnx_files_subpaths is None: # onnx_files_subpaths = [key + ".onnx" for key in models_and_onnx_configs.keys()] @@ -308,7 +401,7 @@ def main_export( # if not no_post_process and not is_stable_diffusion: # try: # logger.info("Post-processing the exported models...") - # (models_and_onnx_configs, onnx_files_subpaths) = onnx_config.post_process_exported_models( + # models_and_onnx_configs, onnx_files_subpaths = onnx_config.post_process_exported_models( # output, models_and_onnx_configs, onnx_files_subpaths # ) # except Exception as e: diff --git a/optimum_benchmark/backends/pytorch/config.py b/optimum_benchmark/backends/pytorch/config.py index ab2cc8fa9..ad8884c1b 100644 --- a/optimum_benchmark/backends/pytorch/config.py +++ b/optimum_benchmark/backends/pytorch/config.py @@ -105,9 +105,7 @@ def __post_init__(self): CUDA_VISIBLE_DEVICES = os.environ.get("CUDA_VISIBLE_DEVICES", None) if self.torch_compile: - self.torch_compile_config = OmegaConf.to_container( - OmegaConf.merge(COMPILE_CONFIG, self.torch_compile_config) - ) + self.torch_compile_config = OmegaConf.to_object(OmegaConf.merge(COMPILE_CONFIG, 
self.torch_compile_config)) if self.device_map is not None: assert CUDA_VISIBLE_DEVICES is not None, "`device_map` can only be used when CUDA_VISIBLE_DEVICES is set." @@ -129,7 +127,7 @@ def __post_init__(self): f"`quantization_strategy` must be one of {list(QUANTIZATION_CONFIGS.keys())}. Got {self.quantization_strategy} instead." ) QUANTIZATION_CONFIG = QUANTIZATION_CONFIGS[self.quantization_strategy] - self.quantization_config = OmegaConf.to_container( + self.quantization_config = OmegaConf.to_object( OmegaConf.merge(QUANTIZATION_CONFIG, self.quantization_config) ) @@ -137,7 +135,7 @@ def __post_init__(self): if CUDA_VISIBLE_DEVICES is None: raise ValueError("`use_ddp` can only be used when CUDA_VISIBLE_DEVICES is set.") - self.ddp_config = OmegaConf.to_container(OmegaConf.merge(DDP_CONFIG, self.ddp_config), resolve=True) + self.ddp_config = OmegaConf.to_object(OmegaConf.merge(DDP_CONFIG, self.ddp_config)) # TODO: check if it's not possible to use DDP with multiple nodes if self.ddp_config["max_nodes"] > 1 or self.ddp_config["min_nodes"] > 1: raise NotImplementedError("Currently, PyTorch DDP benchmark only supports training on a single node.") diff --git a/optimum_benchmark/benchmarks/inference.py b/optimum_benchmark/benchmarks/inference.py index eadbc61c5..2b603cf10 100644 --- a/optimum_benchmark/benchmarks/inference.py +++ b/optimum_benchmark/benchmarks/inference.py @@ -80,10 +80,10 @@ class InferenceConfig(BenchmarkConfig): def __post_init__(self): if self.can_diffuse: - self.forward_kwargs = OmegaConf.to_container(OmegaConf.merge(self.forward_kwargs, DIFUSION_CONFIG)) + self.forward_kwargs = OmegaConf.to_object(OmegaConf.merge(self.forward_kwargs, DIFUSION_CONFIG)) if self.can_generate: - self.generate_kwargs = OmegaConf.to_container(OmegaConf.merge(self.generate_kwargs, GENERATE_CONFIG)) + self.generate_kwargs = OmegaConf.to_object(OmegaConf.merge(self.generate_kwargs, GENERATE_CONFIG)) if self.generate_kwargs["max_new_tokens"] != self.generate_kwargs["min_new_tokens"]: raise ValueError("`max_new_tokens` and `min_new_tokens` must be equal for fixed length output.") diff --git a/optimum_benchmark/benchmarks/training.py b/optimum_benchmark/benchmarks/training.py index 84b0be949..e2bf17cc3 100644 --- a/optimum_benchmark/benchmarks/training.py +++ b/optimum_benchmark/benchmarks/training.py @@ -22,7 +22,7 @@ class TrainingConfig(BenchmarkConfig): _target_: str = "optimum_benchmark.benchmarks.training.TrainingBenchmark" # training options - warmup_steps: int = 10 + warmup_steps: int = 40 # still thinks this too high # dataset options dataset_shapes: Dict = field( diff --git a/optimum_benchmark/experiment.py b/optimum_benchmark/experiment.py index 85253f80d..3859e0319 100644 --- a/optimum_benchmark/experiment.py +++ b/optimum_benchmark/experiment.py @@ -9,7 +9,7 @@ from diffusers import __version__ as diffusers_version from hydra.core.config_store import ConfigStore from hydra.utils import get_class -from omegaconf import DictConfig, OmegaConf, SCMode +from omegaconf import DictConfig, OmegaConf from optimum.exporters import TasksManager from optimum.version import __version__ as optimum_version from transformers import __version__ as transformers_version @@ -94,16 +94,12 @@ class ExperimentConfig: @hydra.main(version_base=None) def run_experiment(experiment: DictConfig) -> None: - experiment = OmegaConf.to_container(experiment, structured_config_mode=SCMode.INSTANTIATE, resolve=True) + # This is required to trigger __post_init__. 
Reference: https://github.com/omry/omegaconf/issues/377 + experiment: ExperimentConfig = OmegaConf.to_object(experiment) # Save the config OmegaConf.save(experiment, "hydra_config.yaml", resolve=True) - # Allocate requested benchmark - benchmark_factory: Type[Benchmark] = get_class(experiment.benchmark._target_) - benchmark: Benchmark = benchmark_factory() - benchmark.configure(experiment.benchmark) - # Allocate requested backend backend_factory: Type[Backend] = get_class(experiment.backend._target_) backend: Backend = backend_factory( @@ -112,18 +108,30 @@ def run_experiment(experiment: DictConfig) -> None: device=experiment.device, hub_kwargs=experiment.hub_kwargs, ) - try: # Configure the backend backend.configure(experiment.backend) + except Exception as e: + LOGGER.error("Error during backend configuration: %s", e) + raise e + + # Allocate requested benchmark + benchmark_factory: Type[Benchmark] = get_class(experiment.benchmark._target_) + benchmark: Benchmark = benchmark_factory() + try: + benchmark.configure(experiment.benchmark) + except Exception as e: + LOGGER.error("Error during benchmark configuration: %s", e) + raise e + + try: # Run the benchmark benchmark.run(backend) # Save the benchmark results benchmark.save() # Clean up the backend backend.clean() - except Exception as e: - LOGGER.error("Error during experiment: %s", e) + LOGGER.error("Error during benchmark execution: %s", e) backend.clean() raise e From cefbad3ff9b2a9c81422b77611b2c7a2160a1549 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 28 Aug 2023 06:34:20 +0200 Subject: [PATCH 6/8] remove dead code --- optimum_benchmark/backends/onnxruntime/config.py | 12 ++++-------- optimum_benchmark/benchmarks/training.py | 2 +- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/optimum_benchmark/backends/onnxruntime/config.py b/optimum_benchmark/backends/onnxruntime/config.py index 3564a0cbf..c41c4b63d 100644 --- a/optimum_benchmark/backends/onnxruntime/config.py +++ b/optimum_benchmark/backends/onnxruntime/config.py @@ -19,10 +19,6 @@ def onnxruntime_version(): return "ort:unknown" -OmegaConf.register_new_resolver( - "is_gpu", - lambda device: "cuda" in device.lower(), -) OmegaConf.register_new_resolver( "is_profiling", lambda benchmark_name: benchmark_name == "profiling", @@ -95,8 +91,6 @@ def onnxruntime_version(): "preprocess_batch": True, "preprocess_class": "optimum_benchmark.preprocessors.glue.GluePreprocessor", } -PROVIDER_OPTIONS = {"device_id": "${infer_device_id:${device}}"} -SESSION_OPTIONS = {"enable_profiling": "${is_profiling:${benchmark.name}}"} @dataclass @@ -116,12 +110,14 @@ class ORTConfig(BackendConfig): # provider options provider: str = "${infer_provider:${device}}" device_id: Optional[int] = "${oc.deprecated:backend.provider_options.device_id}" - provider_options: Dict[str, Any] = field(default_factory=lambda: PROVIDER_OPTIONS) + provider_options: Dict[str, Any] = field(default_factory=lambda: {"device_id": "${infer_device_id:${device}}"}) # inference options use_io_binding: bool = "${is_gpu:${device}}" enable_profiling: bool = "${oc.deprecated:backend.session_options.enable_profiling}" - session_options: Dict[str, Any] = field(default_factory=lambda: SESSION_OPTIONS) + session_options: Dict[str, Any] = field( + default_factory=lambda: {"enable_profiling": "${is_profiling:${benchmark.name}}"} + ) # optimization options optimization: bool = False diff --git a/optimum_benchmark/benchmarks/training.py b/optimum_benchmark/benchmarks/training.py index e2bf17cc3..fd8d6e5ed 100644 --- 
a/optimum_benchmark/benchmarks/training.py +++ b/optimum_benchmark/benchmarks/training.py @@ -22,7 +22,7 @@ class TrainingConfig(BenchmarkConfig): _target_: str = "optimum_benchmark.benchmarks.training.TrainingBenchmark" # training options - warmup_steps: int = 40 # still thinks this too high + warmup_steps: int = 40 # still thinks this too high # dataset options dataset_shapes: Dict = field( From f914694b636d06706a7e5aa8f0907fa379e40965 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 28 Aug 2023 06:49:26 +0200 Subject: [PATCH 7/8] added error for applying gptq --- optimum_benchmark/backends/pytorch/backned.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/optimum_benchmark/backends/pytorch/backned.py b/optimum_benchmark/backends/pytorch/backned.py index a83482da0..6be3dfec7 100644 --- a/optimum_benchmark/backends/pytorch/backned.py +++ b/optimum_benchmark/backends/pytorch/backned.py @@ -9,7 +9,7 @@ from optimum.bettertransformer import BetterTransformer from torch.distributed.elastic.multiprocessing.errors import record from torch.distributed.launcher.api import LaunchConfig, elastic_launch -from transformers import BitsAndBytesConfig, GPTQConfig, Trainer, TrainingArguments +from transformers import BitsAndBytesConfig, Trainer, TrainingArguments # GPTQConfig from transformers.utils.fx import symbolic_trace if TYPE_CHECKING: @@ -96,7 +96,12 @@ def configure(self, config: PyTorchConfig) -> None: def load_model_from_pretrained(self) -> None: if self.config.quantization_strategy == "gptq": LOGGER.info("\t+ Processing GPTQ config") - quantization_config = GPTQConfig(**self.config.quantization_config) + raise NotImplementedError( + "Applying GPTQ quantization on pretrained models is not supported yet. " + "If the model is already quantized, you don't need to specify the quantization strategy." + ) + # need to process dataset, tokenizer, etc. + # quantization_config = GPTQConfig(**self.config.quantization_config) elif self.config.quantization_strategy == "bnb": LOGGER.info("\t+ Processing BnB config") quantization_config = BitsAndBytesConfig(**self.config.quantization_config) From 8900373ae7c2e7aa39998f267b30fddd19fcbd59 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 28 Aug 2023 06:51:32 +0200 Subject: [PATCH 8/8] fix --- optimum_benchmark/backends/onnxruntime/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/optimum_benchmark/backends/onnxruntime/config.py b/optimum_benchmark/backends/onnxruntime/config.py index c41c4b63d..9ae25e927 100644 --- a/optimum_benchmark/backends/onnxruntime/config.py +++ b/optimum_benchmark/backends/onnxruntime/config.py @@ -19,6 +19,7 @@ def onnxruntime_version(): return "ort:unknown" +OmegaConf.register_new_resolver("is_gpu", lambda device: "cuda" in device) OmegaConf.register_new_resolver( "is_profiling", lambda benchmark_name: benchmark_name == "profiling",
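
Note on the recurring `OmegaConf.to_container` -> `OmegaConf.to_object` swap in the config diffs above: only `to_object` re-instantiates the underlying structured-config dataclass, so `__post_init__` validation actually runs on the merged config. A minimal standalone sketch of the difference (the `DummyConfig` dataclass is hypothetical and not part of the repo):

```python
from dataclasses import dataclass

from omegaconf import OmegaConf


@dataclass
class DummyConfig:
    device: str = "cpu"

    def __post_init__(self):
        # the kind of validation the benchmark configs rely on
        if self.device not in ("cpu", "cuda"):
            raise ValueError(f"unsupported device: {self.device}")


cfg = OmegaConf.merge(OmegaConf.structured(DummyConfig), {"device": "cuda"})

as_dict = OmegaConf.to_container(cfg)  # plain dict, __post_init__ never runs
as_obj = OmegaConf.to_object(cfg)      # DummyConfig instance, __post_init__ runs

print(type(as_dict).__name__, type(as_obj).__name__)  # dict DummyConfig
```

This is also why `run_experiment` can drop the explicit `SCMode.INSTANTIATE` call: `OmegaConf.to_object(experiment)` is documented as equivalent to `to_container(experiment, structured_config_mode=SCMode.INSTANTIATE, resolve=True)`, with interpolations resolved before instantiation.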
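
For the GPTQ branch that [PATCH 7/8] turns into a `NotImplementedError`, the commented-out `GPTQConfig(**self.config.quantization_config)` call is not enough on its own, because on-the-fly GPTQ quantization also needs a calibration dataset and a tokenizer. A rough sketch of what that path would involve, assuming the `transformers` GPTQ integration; the model id and dataset choice are placeholders, not what the benchmark would use:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

model_id = "facebook/opt-125m"  # placeholder model
tokenizer = AutoTokenizer.from_pretrained(model_id)

# GPTQ needs calibration data, hence the "need to process dataset, tokenizer" TODO
quantization_config = GPTQConfig(bits=4, dataset="c4", tokenizer=tokenizer)

# quantization happens while loading the weights (requires optimum and auto-gptq)
model = AutoModelForCausalLM.from_pretrained(
    model_id, quantization_config=quantization_config, device_map="auto"
)
```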
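
The one-line fix in [PATCH 8/8] matters because `ORTConfig` still interpolates through the `is_gpu` resolver (e.g. `use_io_binding: bool = "${is_gpu:${device}}"`), which [PATCH 6/8] had removed as apparent dead code. A minimal sketch of how such a custom resolver is consumed; the config keys here are illustrative, not the full `ORTConfig`:

```python
from omegaconf import OmegaConf

# same resolver the patch re-registers
OmegaConf.register_new_resolver("is_gpu", lambda device: "cuda" in device)

cfg = OmegaConf.create(
    {"device": "cuda:1", "use_io_binding": "${is_gpu:${device}}"}
)

print(cfg.use_io_binding)  # True, resolved lazily through the custom resolver
```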