From 00a1fbe9332f93ea68281726bd3f391ff84bb01f Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Thu, 24 Aug 2023 15:34:18 +0200
Subject: [PATCH 1/8] fix ort training test

---
 docker/ort_training.dockerfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docker/ort_training.dockerfile b/docker/ort_training.dockerfile
index 1efd2c75e..6e07c3a7c 100644
--- a/docker/ort_training.dockerfile
+++ b/docker/ort_training.dockerfile
@@ -65,5 +65,5 @@ RUN $PYTHON_EXE -m pip install --upgrade protobuf==3.20.2
 RUN $PYTHON_EXE -m torch_ort.configure

 # Install optimum-benchmark dependencies
-COPY gpu_requirements.txt /tmp/gpu_requirements.txt
-RUN pip install -r /tmp/gpu_requirements.txt
\ No newline at end of file
+COPY requirements.txt /tmp/requirements.txt
+RUN pip install -r /tmp/requirements.txt
\ No newline at end of file

From ee29f008bddbc542fa8a2491c5ddc85858bb7ed7 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Thu, 24 Aug 2023 15:34:50 +0200
Subject: [PATCH 2/8] renaming

---
 optimum_benchmark/{main.py => experiment.py} | 33 +++++++++-----------
 setup.py                                     |  2 +-
 2 files changed, 16 insertions(+), 19 deletions(-)
 rename optimum_benchmark/{main.py => experiment.py} (78%)

diff --git a/optimum_benchmark/main.py b/optimum_benchmark/experiment.py
similarity index 78%
rename from optimum_benchmark/main.py
rename to optimum_benchmark/experiment.py
index f961160b1..a33f1026b 100644
--- a/optimum_benchmark/main.py
+++ b/optimum_benchmark/experiment.py
@@ -1,6 +1,6 @@
 import os
 import platform
-from typing import Type, Dict
+from typing import Any, Type, Dict
 from logging import getLogger
 from dataclasses import dataclass, MISSING, field

@@ -20,19 +20,18 @@
     is_openvino_available,
     is_neural_compressor_available,
 )
-from optimum_benchmark.backends.base import Backend
-from optimum_benchmark.benchmarks.base import Benchmark
-from optimum_benchmark.backends.base import Backend, BackendConfig
-from optimum_benchmark.benchmarks.training import TrainingConfig
-from optimum_benchmark.benchmarks.inference import InferenceConfig
-from optimum_benchmark.benchmarks.base import Benchmark, BenchmarkConfig
-from .utils import remap_to_correct_metadata, get_cpu, get_cpu_ram_mb
+from .backends.base import Backend
+from .benchmarks.base import Benchmark
+from .utils import get_cpu, get_cpu_ram_mb
+from .benchmarks.training import TrainingConfig
+from .benchmarks.inference import InferenceConfig

-LOGGER = getLogger("main")
+
+LOGGER = getLogger("experiment")

 OmegaConf.register_new_resolver(
     "infer_task",
-    # TODO: find a better way for this; it doesn't
+    # TODO: find a better way for this; it doesn't
     # always work because it relies on hub metadata
     lambda model, revision: TasksManager.infer_task_from_model(
         model=model,
@@ -44,10 +43,10 @@
 @dataclass
 class ExperimentConfig:
     # BACKEND CONFIGURATION
-    backend: BackendConfig = MISSING
+    backend: Any  # https://github.com/facebookresearch/hydra/issues/1722#issuecomment-883568386

     # BENCHMARK CONFIGURATION
-    benchmark: BenchmarkConfig = MISSING
+    benchmark: Any  # https://github.com/facebookresearch/hydra/issues/1722#issuecomment-883568386

     # EXPERIMENT CONFIGURATION
     experiment_name: str = MISSING
@@ -114,13 +113,11 @@ class ExperimentConfig:

 @hydra.main(version_base=None)
 def run_experiment(experiment: DictConfig) -> None:
-    # By default, Hydra populates the metadata object_type with the ones from ExperimentConfig but the object_type should really be
-    # one of the subclass (e.g. PyTorchBackendConfig instead of BackendConfig). This is required to call `to_object`.
-    experiment = remap_to_correct_metadata(experiment)
+    from omegaconf import SCMode

-    # This is required to trigger __post_init__. Reference: https://github.com/omry/omegaconf/issues/377
-    experiment = OmegaConf.to_object(experiment)
-    experiment = OmegaConf.create(experiment)
+    experiment = OmegaConf.to_container(
+        experiment, structured_config_mode=SCMode.INSTANTIATE
+    )

     # Save the config
     OmegaConf.save(experiment, "hydra_config.yaml", resolve=True)

diff --git a/setup.py b/setup.py
index d473a51a5..ac2095688 100644
--- a/setup.py
+++ b/setup.py
@@ -11,7 +11,7 @@
     },
     entry_points={
         "console_scripts": [
-            "optimum-benchmark=optimum_benchmark.main:run_experiment",
+            "optimum-benchmark=optimum_benchmark.experiment:run_experiment",
             "optimum-report=optimum_benchmark.report:generate_report",
         ]
     },

From 248d205e7417bf45cd7f588ae712dbcbe702e6e3 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Thu, 24 Aug 2023 15:38:59 +0200
Subject: [PATCH 3/8] major refactoring

---
 optimum_benchmark/backends/base.py            | 208 ++-----
 .../backends/neural_compressor.py             | 251 ++++----
 optimum_benchmark/backends/onnxruntime.py     | 464 ++++++++-------
 optimum_benchmark/backends/openvino.py        | 163 +++---
 optimum_benchmark/backends/pytorch.py         | 548 +++++++++---------
 optimum_benchmark/backends/utils/__init__.py  |   0
 .../backends/utils/base_utils.py              |  92 +++
 .../backends/utils/neural_compressor_utils.py |  39 ++
 .../backends/utils/onnxruntime_utils.py       |  94 +++
 .../backends/utils/openvino_utils.py          |  14 +
 .../{utils.py => utils/optimum_utils.py}      | 288 +--------
 .../backends/utils/pytorch_utils.py           |  78 +++
 optimum_benchmark/benchmarks/base.py          |  19 +-
 optimum_benchmark/benchmarks/inference.py     | 185 +++---
 .../benchmarks/inference_utils.py             |  37 ++
 optimum_benchmark/benchmarks/training.py      | 230 ++------
 .../benchmarks/training_utils.py              | 103 ++++
 .../generators/dataset_generator.py           |  11 +-
 .../generators/input_generator.py             |   8 +-
 optimum_benchmark/import_utils.py             |  10 +-
 optimum_benchmark/preprocessors/glue.py       |   1 -
 optimum_benchmark/report.py                   |  10 +-
 optimum_benchmark/trackers/latency.py         |   5 +-
 optimum_benchmark/trackers/memory.py          |  10 +-
 optimum_benchmark/utils.py                    |  76 +--
 tests/configs/base_config.yaml                |   9 -
 ...ibuted_cuda_pytorch_training_bert_ddp.yaml |   9 +-
 tests/test_cli.py                             |   2 -
 28 files changed, 1497 insertions(+), 1467 deletions(-)
 create mode 100644 optimum_benchmark/backends/utils/__init__.py
 create mode 100644 optimum_benchmark/backends/utils/base_utils.py
 create mode 100644 optimum_benchmark/backends/utils/neural_compressor_utils.py
 create mode 100644 optimum_benchmark/backends/utils/onnxruntime_utils.py
 create mode 100644 optimum_benchmark/backends/utils/openvino_utils.py
 rename optimum_benchmark/backends/{utils.py => utils/optimum_utils.py} (57%)
 create mode 100644 optimum_benchmark/backends/utils/pytorch_utils.py
 create mode 100644 optimum_benchmark/benchmarks/inference_utils.py
 create mode 100644 optimum_benchmark/benchmarks/training_utils.py

diff --git a/optimum_benchmark/backends/base.py b/optimum_benchmark/backends/base.py
index 92c004acc..747e9ac37 100644
--- a/optimum_benchmark/backends/base.py
+++ b/optimum_benchmark/backends/base.py
@@ -1,33 +1,38 @@
-from typing import Any, Callable, Dict, List, Optional, Union
-from dataclasses import dataclass, MISSING
+from typing import Any, ClassVar, Dict, List, Optional, Union, TYPE_CHECKING
 from multiprocessing import Process
 from abc import abstractmethod, ABC
+from dataclasses import dataclass
 from logging import getLogger
-import shutil import os import gc -import torch -from torch import Tensor -from datasets import Dataset +import shutil from psutil import cpu_count -from omegaconf import DictConfig +from diffusers import DiffusionPipeline from optimum.exporters import TasksManager from transformers import ( AutoConfig, AutoProcessor, + ProcessorMixin, PreTrainedModel, - PreTrainedTokenizer, PretrainedConfig, + PreTrainedTokenizer, ImageProcessingMixin, FeatureExtractionMixin, - ProcessorMixin, - Pipeline, ) -from optimum_benchmark.utils import ( +if TYPE_CHECKING: + from transformers.utils import ModelOutput + from transformers import TrainerState + + +from .utils.base_utils import ( + extract_shapes_from_diffusion_pipeline, + extract_shapes_from_model_artifacts, +) +from ..utils import ( DIFFUSION_TASKS, TEXT_GENERATION_TASKS, check_no_process_is_running_on_cuda_device, @@ -35,6 +40,8 @@ ) +LOGGER = getLogger("backend") + PreTrainedProcessor = Union[ PreTrainedTokenizer, ImageProcessingMixin, @@ -42,14 +49,12 @@ ProcessorMixin, ] -LOGGER = getLogger("backend") - @dataclass class BackendConfig(ABC): - name: str = MISSING - version: str = MISSING - _target_: str = MISSING + name: str + version: str + _target_: str # backend options inter_op_num_threads: Optional[int] = None @@ -62,18 +67,28 @@ class BackendConfig(ABC): # clean up options delete_cache: bool = False + def __post_init__(self): + if self.inter_op_num_threads is not None: + if self.inter_op_num_threads == -1: + self.inter_op_num_threads = cpu_count() + + if self.intra_op_num_threads is not None: + if self.intra_op_num_threads == -1: + self.intra_op_num_threads = cpu_count() + class Backend(ABC): - # model and pipeline benchmarks - pretrained_model: Union[PreTrainedModel, Pipeline] - # only for model benchmarks - pretrained_config: Optional[PretrainedConfig] + name: str + config: ClassVar[BackendConfig] + + pretrained_model: Union[PreTrainedModel, DiffusionPipeline] pretrained_processor: Optional[PreTrainedProcessor] + pretrained_config: Optional[PretrainedConfig] - def __init__(self, model: str, task: str, device: str, hub_kwargs: DictConfig): + def __init__(self, model: str, task: str, device: str, hub_kwargs: Dict[str, Any]): self.model = model self.task = task - self.device = torch.device(device) + self.device = device self.hub_kwargs = hub_kwargs if self.is_diffusion_pipeline(): @@ -97,7 +112,7 @@ def __init__(self, model: str, task: str, device: str, hub_kwargs: DictConfig): **self.hub_kwargs, ) except ValueError: - LOGGER.warning(f"Could not find the model's preprocessor") + LOGGER.warning("Could not find the model's preprocessor") self.pretrained_processor = None # we're using this one as the default model_class which is used @@ -119,7 +134,9 @@ def check_initial_isolation(self) -> None: cuda_devices = os.environ.get("CUDA_VISIBLE_DEVICES") if cuda_devices is None: LOGGER.warning( - "Asked to check the initial device isolation, but the variable CUDA_VISIBLE_DEVICES was not set. Defaulting to checking on the first device." + "Asked to check the initial device isolation, " + "but the variable CUDA_VISIBLE_DEVICES was not set. " + "Defaulting to checking on the first device." ) device_ids = {self.device.index if self.device.index is not None else 0} else: @@ -133,7 +150,9 @@ def check_continuous_isolation(self) -> None: cuda_devices = os.environ.get("CUDA_VISIBLE_DEVICES") if cuda_devices is None: LOGGER.warning( - "Asked to check the continuous device isolation, but the variable CUDA_VISIBLE_DEVICES was not set. 
Defaulting to checking on the first device." + "Asked to check the continuous device isolation, " + "but the variable CUDA_VISIBLE_DEVICES was not set. " + "Defaulting to checking on the first device." ) device_ids = {self.device.index if self.device.index is not None else 0} else: @@ -150,75 +169,48 @@ def check_continuous_isolation(self) -> None: @abstractmethod def configure(self, config: BackendConfig) -> None: - self.config = config - LOGGER.info(f"Configuring {config.name} backend") - self.config = config - if config.inter_op_num_threads is not None: - if config.inter_op_num_threads == -1: - config.inter_op_num_threads = cpu_count() - LOGGER.info( - f"\t+ Setting backend.inter_op_num_threads to cpu_count({config.inter_op_num_threads})" - ) - - if config.intra_op_num_threads is not None: - if config.intra_op_num_threads == -1: - config.intra_op_num_threads = cpu_count() - LOGGER.info( - f"\t+ Setting backend.intra_op_num_threads to cpu_count({config.intra_op_num_threads})" - ) - - # clean up options - if config.delete_cache: - LOGGER.info("\t+ Will delete model cache after benchmarking") - self.delete_cache = config.delete_cache # isolation options - if config.initial_isolation_check: + if self.config.initial_isolation_check: LOGGER.info("\t+ Checking initial device isolation") self.check_initial_isolation() - if config.continous_isolation_check: + if self.config.continous_isolation_check: LOGGER.info("\t+ Checking contineous device isolation") self.check_continuous_isolation() + # clean up options + if self.config.delete_cache: + LOGGER.info("\t+ Model cache will be deleted after benchmark") + # compiling in openvino requires input shapes - def prepare_for_inference(self, input_shapes: Dict[str, int]) -> None: + def prepare_for_inference(self, input_shapes: Dict[str, int]) -> Dict[str, Any]: pass # symbolic tracing in transformers requires input names - def prepare_for_profiling(self, input_names: List[str]) -> None: - pass - - # depending on the backend, we might need to prepare the model for training - # in different ways although I prefer to pass these in the train method - def prepare_for_training( - self, - training_dataset: Dataset, - training_data_collator: Callable, - training_arguments: Dict[str, Any], - ) -> None: + def prepare_for_profiling(self, input_names: List[str]) -> Dict[str, Any]: pass - def forward(self, input: Dict[str, Tensor], **kwargs): + def forward(self, input: Dict[str, Any], **kwargs) -> "ModelOutput": raise NotImplementedError("Backend must implement forward method") - def generate(self, input: Dict[str, Tensor], **kwargs): + def generate(self, input: Dict[str, Any], **kwargs) -> "ModelOutput": raise NotImplementedError("Backend must implement generate method") - def train(self): + def train(self) -> "TrainerState": raise NotImplementedError("Backend must implement train method") def delete_pretrained_model(self) -> None: - if hasattr(self, "pretrained_model"): + try: del self.pretrained_model - gc.collect() + except AttributeError: + # benchmark might fail before the model is loaded + pass - if self.device.type == "cuda": - torch.cuda.empty_cache() - gc.collect() + gc.collect() - def delete_model_hub_cache(self) -> None: + def delete_model_cache(self) -> None: model_cache_path = "models--" + self.model.replace("/", "--") model_cache_path = os.path.join( os.path.expanduser("~/.cache/huggingface/hub"), model_cache_path @@ -226,11 +218,11 @@ def delete_model_hub_cache(self) -> None: shutil.rmtree(model_cache_path, ignore_errors=True) def clean(self) -> 
None: - LOGGER.info(f"Cleaning backend") + LOGGER.info(f"Cleaning {self.config.name} backend") self.delete_pretrained_model() - if self.delete_cache: - self.delete_model_hub_cache() + if self.config.delete_cache: + self.delete_model_cache() @property def model_shapes(self) -> Dict[str, int]: @@ -245,75 +237,3 @@ def model_shapes(self) -> Dict[str, int]: ) return model_shapes - - -def extract_shapes_from_diffusion_pipeline( - pipeline: Pipeline, -) -> Dict[str, Any]: - # this is the only way I found to extract a - # diffusion pipeline's "output" shapes - shapes = {} - try: - shapes["num_channels"] = pipeline.vae_encoder.config.out_channels - shapes["height"] = pipeline.vae_encoder.config.sample_size - shapes["width"] = pipeline.vae_encoder.config.sample_size - except AttributeError: - LOGGER.warning("Could not find the diffusion pipeline's output shapes") - shapes["num_channels"] = -1 - shapes["height"] = -1 - shapes["width"] = -1 - - return shapes - - -def extract_shapes_from_model_artifacts( - config: PretrainedConfig, - processor: Optional[PreTrainedProcessor] = None, -) -> Dict[str, Any]: - shapes = {} - artifacts_dict = {} - - config_dict = {k: v for k, v in config.to_dict().items() if v is not None} - artifacts_dict.update(config_dict) - - if processor is not None and hasattr(processor, "to_dict"): - processor_dict = {k: v for k, v in processor.to_dict().items() if v is not None} - artifacts_dict.update(processor_dict) - - # text input - shapes["vocab_size"] = artifacts_dict.get("vocab_size", 2) - shapes["type_vocab_size"] = artifacts_dict.get("type_vocab_size", 2) - - # image input - shapes["num_channels"] = artifacts_dict.get("num_channels", None) - - image_size = artifacts_dict.get("image_size", None) - if image_size is None: - # processors have different names for the image size - image_size = artifacts_dict.get("size", None) - - if isinstance(image_size, (int, float)): - shapes["height"] = image_size - shapes["width"] = image_size - elif isinstance(image_size, (list, tuple)): - shapes["height"] = image_size[0] - shapes["width"] = image_size[0] - elif type(image_size) == dict and len(image_size) == 2: - shapes["height"] = list(image_size.values())[0] - shapes["width"] = list(image_size.values())[1] - elif type(image_size) == dict and len(image_size) == 1: - shapes["height"] = list(image_size.values())[0] - shapes["width"] = list(image_size.values())[0] - else: - shapes["height"] = None - shapes["width"] = None - - # classification labels (default to 2) - shapes["num_labels"] = len( - artifacts_dict.get("id2label", {"0": "LABEL_0", "1": "LABEL_1"}) - ) - - # object detection labels (default to 2) - shapes["num_queries"] = artifacts_dict.get("num_queries", 2) - - return shapes diff --git a/optimum_benchmark/backends/neural_compressor.py b/optimum_benchmark/backends/neural_compressor.py index a22aa5d1e..a1ac95f73 100644 --- a/optimum_benchmark/backends/neural_compressor.py +++ b/optimum_benchmark/backends/neural_compressor.py @@ -1,27 +1,35 @@ -from typing import Dict -from torch import Tensor +from typing import Dict, Optional, Any, TYPE_CHECKING +from tempfile import TemporaryDirectory +from dataclasses import dataclass from logging import getLogger + +import torch +from torch import Tensor from hydra.utils import get_class -from dataclasses import dataclass, field -from tempfile import TemporaryDirectory from omegaconf import DictConfig, OmegaConf +from optimum.intel.neural_compressor.quantization import INCQuantizer +from optimum.intel.neural_compressor.utils import 
_HEAD_TO_AUTOMODELS +from neural_compressor import __version__ as neural_compressor_version +from neural_compressor.config import ( + AccuracyCriterion, + TuningCriterion, + PostTrainingQuantConfig, +) -try: - from neural_compressor import __version__ as neural_compressor_version -except ImportError: - neural_compressor_version = "Not installed" - -from optimum_benchmark.backends.base import Backend, BackendConfig - +if TYPE_CHECKING: + from transformers.utils import ModelOutput -OmegaConf.register_new_resolver( - "ptq_is_static", - lambda approach: approach == "static", +from .base import Backend, BackendConfig +from .utils.neural_compressor_utils import ( + DEFAULT_QUANTIZATION_CONFIG, + DEFAULT_CALIBRATION_CONFIG, ) LOGGER = getLogger("neural_compressor") +OmegaConf.register_new_resolver("ptq_is_static", lambda approach: approach == "static") + @dataclass class INCConfig(BackendConfig): @@ -34,67 +42,52 @@ class INCConfig(BackendConfig): # quantization options quantization: bool = False - quantization_config: Dict = field(default_factory=lambda: { - "device": "cpu", - "backend": "default", - "domain": "auto", - "recipes": {}, - "quant_format": "default", - "inputs": [], - "outputs": [], - "approach": "static", - "calibration_sampling_size": [100], - "op_type_dict": None, - "op_name_dict": None, - "reduce_range": None, - "example_inputs": None, - "excluded_precisions": [], - "quant_level": "auto", - "accuracy_criterion": DictConfig( - { - "higher_is_better": True, - "criterion": "relative", - "tolerable_loss": 0.01, - } - ), - "tuning_criterion": DictConfig( - { - "strategy": "basic", - "strategy_kwargs": None, - "timeout": 0, - "max_trials": 100, - "objective": "performance", - } - ), - "diagnosis": False, - } - ) + quantization_config: Optional[Dict[str, Any]] = None # calibration options - calibration: bool = "${ptq_is_static:${backend.quantization_config.approach}}" # type: ignore - calibration_config: Dict = field(default_factory=lambda: { - "dataset_name": "glue", - "num_samples": 300, - "dataset_config_name": "sst2", - "dataset_split": "train", - "preprocess_batch": True, - "preprocess_class": "optimum_benchmark.preprocessors.glue.GluePreprocessor", - } - ) + calibration: bool = False + calibration_config: Optional[Dict[str, Any]] = None + + def __post_init__(self): + if self.no_weights: + # TODO: implement no_weights for neural_compressor backend if possible + raise NotImplementedError( + "no_weights is not supported for neural_compressor backend" + ) + + if self.quantization: + self.quantization_config = OmegaConf.merge( + self.quantization_config if self.quantization_config else {}, + DEFAULT_QUANTIZATION_CONFIG, + ) + if self.calibration_config["approach"] == "static": + self.calibration = True + + if self.calibration: + self.calibration_config = OmegaConf.merge( + self.calibration_config if self.calibration_config else {}, + DEFAULT_CALIBRATION_CONFIG, + ) class INCBackend(Backend): + name: str = "neural_compressor" + config: INCConfig + def __init__( self, model: str, task: str, device: str, hub_kwargs: DictConfig ) -> None: super().__init__(model, task, device, hub_kwargs) + self.device = torch.device(device) - from optimum.intel.neural_compressor.utils import _HEAD_TO_AUTOMODELS + assert self.task in _HEAD_TO_AUTOMODELS, ( + f"INCBackend does not support task {self.task} yet. 
" + f"Supported tasks are: {list(_HEAD_TO_AUTOMODELS.keys())}" + ) self.incmodel_class = get_class( f"optimum.intel.neural_compressor.{_HEAD_TO_AUTOMODELS[self.task]}" ) - LOGGER.info( f"\t+ Infered INCModel class {self.incmodel_class.__name__} " f"for task {self.task} and model_type {self.model_type}" @@ -103,84 +96,100 @@ def __init__( def configure(self, config: INCConfig) -> None: super().configure(config) - with TemporaryDirectory() as tmpdirname: - if config.no_weights: - raise NotImplementedError( - "no_weights is not supported for neural_compressor backend" - ) - else: - self.load_model_from_pretrained(config) - - if config.quantization: - self.quantize_model(config, tmpdirname) - - def load_model_from_pretrained(self, config: INCConfig) -> None: - self.pretrained_model = self.incmodel_class.from_pretrained( - # something is wrong here, modeling is not consistent - model_name_or_path=self.model, - # for some reason only causalLM expects model_id instead of model_name_or_path - **({"model_id": self.model} if self.task == "text-generation" else {}), - device_map=self.device, - **self.hub_kwargs, - ) - - def quantize_model(self, config: INCConfig, tmpdirname: str) -> None: - from optimum.intel.neural_compressor.quantization import INCQuantizer - from neural_compressor.config import ( - AccuracyCriterion, - TuningCriterion, - PostTrainingQuantConfig, - ) + if self.config.quantization: + self.config.quantization_config["accuracy_criterion"] = AccuracyCriterion( + **self.config.quantization_config["accuracy_criterion"] + ) + self.config.quantization_config["tuning_criterion"] = TuningCriterion( + **self.config.quantization_config["tuning_criterion"] + ) + self.quantization_config = PostTrainingQuantConfig( + **self.config.quantization_config + ) - LOGGER.info("\t+ Attempting quantization") + if self.config.calibration: + self.config.calibration_config["preprocess_class"] = get_class( + self.config.calibration_config["preprocess_class"] + ) + self.config.calibration_config[ + "preprocess_function" + ] = self.config.calibration_config["preprocess_class"]( + model_name_or_path=self.model + ) + self.config.calibration_config.pop("preprocess_class") - quantization_config = OmegaConf.to_container(config.quantization_config) - quantization_config["accuracy_criterion"] = AccuracyCriterion( - **config.quantization_config.accuracy_criterion - ) - quantization_config["tuning_criterion"] = TuningCriterion( - **config.quantization_config.tuning_criterion - ) - quantization_config = PostTrainingQuantConfig(**quantization_config) + with TemporaryDirectory() as tmpdirname: + if self.config.quantization: + self.load_and_quantize_automodel(tmpdirname) + else: + self.load_incmodel() + def load_and_quantize_automodel(self, tmpdirname: str) -> None: + LOGGER.info("\t+ Loading pretrained AutoModel") model = self.automodel_class.from_pretrained(self.model, **self.hub_kwargs) - quantizer = INCQuantizer.from_pretrained(model, task=self.task) - - if config.calibration: - preprocess_class = get_class(config.calibration_config.preprocess_class) - preprocess_function = preprocess_class(model_name_or_path=self.model) + LOGGER.info("\t+ Creating quantizer") + quantizer = INCQuantizer.from_pretrained( + model, + eval_fn=None, + calibration_fn=None, + task=self.task, + ) + if self.config.calibration: + LOGGER.info("\t+ Loading calibration dataset") calibration_dataset = quantizer.get_calibration_dataset( - dataset_name=config.calibration_config.dataset_name, - num_samples=config.calibration_config.num_samples, - 
dataset_config_name=config.calibration_config.dataset_config_name, - dataset_split=config.calibration_config.dataset_split, - preprocess_function=preprocess_function, + **self.config.calibration_config ) + else: + calibration_dataset = None + LOGGER.info("\t+ Attempting quantization") quantizer.quantize( - save_onnx_model=False, - quantization_config=quantization_config, - calibration_dataset=calibration_dataset, + quantization_config=self.config.quantization_config, save_directory=f"{tmpdirname}/quantized", + calibration_dataset=calibration_dataset, + # default values + batch_size=8, + data_collator=None, + remove_unused_columns=True, + file_name=None, ) - self.delete_pretrained_model() - - LOGGER.info("\t+ Loading quantized model") + LOGGER.info("\t+ Loading quantized INCModel") self.pretrained_model = self.incmodel_class.from_pretrained( model_name_or_path=f"{tmpdirname}/quantized", ) - def forward(self, input: Dict[str, Tensor], **kwargs) -> Tensor: - output = self.pretrained_model(**input, **kwargs)[0] + def load_incmodel(self) -> None: + if self.is_diffusion_pipeline(): + self.pretrained_model = self.incmodel_class.from_pretrained( + model_name_or_path=self.model, + **self.hub_kwargs, + ) + self.pretrained_model.to(self.device) + elif self.is_text_generation_model(): + self.pretrained_model = self.incmodel_class.from_pretrained( + # for some reason only causalLM expects + # model_id instead of model_name_or_path + model_id=self.model, + device_map=self.device, + **self.hub_kwargs, + ) + else: + self.pretrained_model = self.incmodel_class.from_pretrained( + # for some reason only causalLM expects + # model_id instead of model_name_or_path + model_name_or_path=self.model, + device_map=self.device, + **self.hub_kwargs, + ) + + def forward(self, input: Dict[str, Tensor], **kwargs) -> "ModelOutput": + output = self.pretrained_model(**input, **kwargs) return output - def generate(self, input: Dict[str, Tensor], **kwargs) -> Tensor: - output = self.pretrained_model.generate(**input, **kwargs)[0] + def generate(self, input: Dict[str, Tensor], **kwargs) -> "ModelOutput": + output = self.pretrained_model.generate(**input, **kwargs) return output - - def train(self) -> None: - pass diff --git a/optimum_benchmark/backends/onnxruntime.py b/optimum_benchmark/backends/onnxruntime.py index 886fa6e6b..57e811706 100644 --- a/optimum_benchmark/backends/onnxruntime.py +++ b/optimum_benchmark/backends/onnxruntime.py @@ -1,22 +1,19 @@ +from typing import Any, Callable, Dict, List, Optional, TYPE_CHECKING +from tempfile import TemporaryDirectory +from dataclasses import dataclass +from logging import getLogger +from datasets import Dataset import os + + import torch from torch import Tensor -from datasets import Dataset -from logging import getLogger from omegaconf import OmegaConf -from dataclasses import dataclass, field from hydra.utils import get_class -from tempfile import TemporaryDirectory -from omegaconf.dictconfig import DictConfig -from typing import Any, Callable, Dict, List, Optional - - -try: - from onnxruntime import __version__ as onnxruntime_version -except ImportError: - onnxruntime_version = "Not installed" - -from optimum.onnxruntime import ORTOptimizer, ORTQuantizer +from onnxruntime import SessionOptions +from accelerate import init_empty_weights +from optimum.pipelines import ORT_SUPPORTED_TASKS +from onnxruntime import __version__ as onnxruntime_version from optimum.onnxruntime.configuration import ( OptimizationConfig, QuantizationConfig, @@ -24,16 +21,34 @@ AutoOptimizationConfig, 
AutoQuantizationConfig, ) +from optimum.onnxruntime import ( + ORTOptimizer, + ORTQuantizer, + ORTTrainer, + ORTTrainingArguments, +) +if TYPE_CHECKING: + from transformers import TrainerCallback, TrainerState + from transformers.modeling_outputs import ModelOutput + + +from .base import Backend, BackendConfig +from .utils.optimum_utils import main_export +from .utils.pytorch_utils import randomize_weights +from ..profilers.ort_profiler import ORTProfilingWrapper +from .utils.onnxruntime_utils import ( + format_ort_quantization_dict, + infer_device_id, + DEFAULT_OPTIMIZATION_CONFIG, + DEFAULT_QUANTIZATION_CONFIG, + DEFAULT_CALIBRATION_CONFIG, +) -from optimum_benchmark.backends.base import Backend, BackendConfig -from optimum_benchmark.backends.utils import main_export, randomize_weights -from optimum_benchmark.profilers.ort_profiler import ORTProfilingWrapper -from optimum_benchmark.utils import infer_device_id OmegaConf.register_new_resolver( "is_gpu", - lambda device: torch.device(device).type == "cuda", + lambda device: "cuda" in device.lower() or "tensorrt" in device.lower(), ) OmegaConf.register_new_resolver( "is_profiling", @@ -47,10 +62,6 @@ "infer_device_id", lambda device: infer_device_id(device), ) -OmegaConf.register_new_resolver( - "requires_calibration", - lambda *static_quants: any(static_quants), -) LOGGER = getLogger("onnxruntime") @@ -70,103 +81,109 @@ class ORTConfig(BackendConfig): # provider options provider: str = "${infer_provider:${device}}" + provider_options: Optional[Dict] = None + # TODO: deprecate device_id in favor of provider_options device_id: Optional[int] = "${infer_device_id:${device}}" # inference options use_io_binding: bool = "${is_gpu:${device}}" + session_options: Optional[Dict] = None + # TODO: deprecate enable_profiling in favor of session_options enable_profiling: bool = "${is_profiling:${benchmark.name}}" # optimization options optimization: bool = False - optimization_config: Dict = field( - default_factory=lambda: { - "optimization_level": 1, # 0, 1, 2, 99 - "optimize_for_gpu": "${is_gpu:${device}}", - "fp16": False, - "enable_transformers_specific_optimizations": True, - "enable_gelu_approximation": False, - "disable_gelu_fusion": False, - "disable_layer_norm_fusion": False, - "disable_attention_fusion": False, - "disable_skip_layer_norm_fusion": True, - "disable_bias_skip_layer_norm_fusion": False, - "disable_bias_gelu_fusion": False, - "use_mask_index": False, - "no_attention_mask": False, - "disable_embed_layer_norm_fusion": True, - "disable_shape_inference": False, - "use_multi_head_attention": False, - "enable_gemm_fast_gelu_fusion": False, - "use_raw_attention_mask": False, - "disable_group_norm_fusion": True, - "disable_packed_kv": True, - } - ) + optimization_config: Optional[Dict] = None # O1, O2, O3, O4 auto_optimization: Optional[str] = None - auto_optimization_config: Dict = field( - default_factory=lambda: { - "for_gpu": "${is_gpu:${device}}", - # add auto optimization specific options in config file or cli - # using +backend.auto_optimization_config.option_name: value - } - ) + auto_optimization_config: Optional[Dict] = None # quantization options quantization: bool = False - quantization_config: Dict = field( - default_factory=lambda: { - "is_static": False, - "format": "QOperator", # QOperator, QDQ - "mode": "IntegerOps", # QLinearOps, IntegerOps - "activations_dtype": "QUInt8", # QInt8, QUInt8 - "activations_symmetric": False, - "weights_dtype": "QInt8", # QInt8, QUInt8 - "weights_symmetric": True, - "per_channel": False, - 
"reduce_range": False, - "operators_to_quantize": [ - "MatMul", - "Add", - ], - } - ) + quantization_config: Optional[Dict] = None # arm64,avx2,avx512,avx512_vnni,tensorrt auto_quantization: Optional[str] = None - auto_quantization_config: Dict = field( - default_factory=lambda: { - "is_static": False - # add auto quantization specific options in config file or cli - # using +backend.auto_quantization_config.option_name: value - } - ) + auto_quantization_config: Optional[Dict] = None # calibration options - calibration: bool = "${requires_calibration:${backend.auto_quantization_config.is_static}, ${backend.quantization_config.is_static}}" - calibration_config: Dict = field( - default_factory=lambda: { - "dataset_name": "glue", - "num_samples": 300, - "dataset_config_name": "sst2", - "dataset_split": "train", - "preprocess_batch": True, - "preprocess_class": "optimum_benchmark.preprocessors.glue.GluePreprocessor", - } - ) - - # this will skip exporting the model and will use automodel instead + calibration: bool = False + calibration_config: Optional[Dict] = None + + # this will skip exporting the model and will use automodel with trainer use_ortmodel: bool = "${is_inference:${benchmark.name}}" + def __post_init__(self): + if self.optimization: + self.optimization_config = OmegaConf.merge( + self.optimization_config or {}, + DEFAULT_OPTIMIZATION_CONFIG, + ) + + if self.auto_optimization is not None: + self.auto_optimization_config = OmegaConf.merge( + self.auto_optimization_config or {}, + DEFAULT_OPTIMIZATION_CONFIG, + ) + self.auto_optimization_config.pop("optimization_level", None) + self.auto_optimization_config[ + "for_gpu" + ] = self.auto_optimization_config.pop("optimize_for_gpu") + + if self.quantization: + self.quantization_config = OmegaConf.merge( + self.quantization_config or {}, + DEFAULT_QUANTIZATION_CONFIG, + ) + + # auto quantization is needs specific config for each type + # if self.auto_quantization is not None: + # self.auto_quantization_config = OmegaConf.merge( + # self.auto_quantization_config or {}, + # DEFAULT_QUANTIZATION_CONFIG, + # ) + + if self.quantization_config is not None: + self.calibration = self.quantization_config["is_static"] + + if self.auto_quantization_config is not None: + self.calibration = self.auto_quantization_config["is_static"] + + if self.calibration: + self.calibration_config = OmegaConf.merge( + self.calibration_config or {}, + DEFAULT_CALIBRATION_CONFIG, + ) + + if self.device_id is not None: + LOGGER.warning( + "device_id is deprecated, please use provider_options instead" + ) + self.provider_options = OmegaConf.merge( + self.provider_options or {}, + {"device_id": self.device_id}, + ) + + if self.enable_profiling is not None: + LOGGER.warning( + "enable_profiling is deprecated, please use session_options instead" + ) + self.session_options = OmegaConf.merge( + self.session_options or {}, + {"enable_profiling": self.enable_profiling}, + ) + class ORTBackend(Backend): + name: str = "onnxruntime" + config: ORTConfig + def __init__( - self, model: str, task: str, device: str, hub_kwargs: DictConfig + self, model: str, task: str, device: str, hub_kwargs: Dict[str, Any] ) -> None: super().__init__(model, task, device, hub_kwargs) - - from optimum.pipelines import ORT_SUPPORTED_TASKS + self.device = torch.device(device) if self.task == "stable-diffusion": self.ortmodel_class = get_class( @@ -189,60 +206,51 @@ def __init__( def configure(self, config: ORTConfig) -> None: super().configure(config) - import onnxruntime - # session options - 
self.session_options = onnxruntime.SessionOptions() - if config.intra_op_num_threads is not None: + session_options = SessionOptions() + if self.config.intra_op_num_threads is not None: LOGGER.info( - f"\t+ Setting onnxruntime session intra_op_num_threads({config.intra_op_num_threads})" + f"\t+ Setting intra_op_num_threads({config.intra_op_num_threads})" ) - self.session_options.intra_op_num_threads = config.intra_op_num_threads - if config.inter_op_num_threads is not None: - LOGGER.info( - f"\t+ Setting onnxruntime session inter_op_num_threads({config.inter_op_num_threads})" + self.config.session_options.intra_op_num_threads = ( + self.config.intra_op_num_threads ) - self.session_options.inter_op_num_threads = config.inter_op_num_threads - if config.enable_profiling: - LOGGER.info("\t+ Enabling onnxruntime profiling") - self.session_options.enable_profiling = True - - # provider options - self.provider_options = {} - if config.device_id is not None: + if self.config.inter_op_num_threads is not None: LOGGER.info( - f"\t+ Setting onnxruntime provider device_id({config.device_id})" + f"\t+ Setting inter_op_num_threads({config.inter_op_num_threads})" ) - self.provider_options["device_id"] = config.device_id + self.config.session_options.inter_op_num_threads = ( + self.config.inter_op_num_threads + ) + for key, value in self.config.session_options.items(): + setattr(session_options, key, value) + self.config.session_options = session_options # Set torch dtype - self.torch_dtype = ( - getattr(torch, config.torch_dtype) # in case of torch.dtype - if config.torch_dtype is not None and hasattr(torch, config.torch_dtype) - else config.torch_dtype - ) - LOGGER.info( - f"\t+ Using torch dtype({self.torch_dtype}) for weights loading and export" + self.config.torch_dtype = ( + getattr(torch, self.config.torch_dtype) # in case of torch.dtype + if self.config.torch_dtype is not None + and hasattr(torch, self.config.torch_dtype) + else self.config.torch_dtype ) with TemporaryDirectory() as tmpdirname: - if config.use_ortmodel: - if config.no_weights: - self.load_ortmodel_from_config(config, tmpdirname) + if self.config.use_ortmodel: + if self.config.no_weights: + self.load_ortmodel_from_config(tmpdirname) else: - self.load_ortmodel_from_pretrained(config, tmpdirname) + self.load_ortmodel_from_pretrained(tmpdirname) else: - if config.no_weights: - self.load_automodel_from_config(config) + if self.config.no_weights: + self.load_automodel_from_config() else: - self.load_automodel_from_pretrained(config) + self.load_automodel_from_pretrained() - def load_ortmodel_from_config(self, config: ORTConfig, tmpdirname: str) -> None: - LOGGER.info( - f"\t+ Loading model from config in {config.torch_dtype} on {self.device}" - ) + def load_ortmodel_from_config(self, tmpdirname: str) -> None: + LOGGER.info("\t+ Creating random weights model") + self.load_automodel_from_config() - self.load_automodel_from_config(config) + LOGGER.info("\t+ Exporting model to onnx") main_export( model_name_or_path=self.model, output=f"{tmpdirname}/exported_model", @@ -250,10 +258,9 @@ def load_ortmodel_from_config(self, config: ORTConfig, tmpdirname: str) -> None: # we're using but will add "-with-past" when possible task="auto", device=self.device.type, - fp16=self.torch_dtype == torch.float16, - optimize=config.auto_optimization, - no_post_process=not config.use_merged, - for_ort=True, + fp16=self.config.torch_dtype == torch.float16, + optimize=self.config.auto_optimization, + no_post_process=not self.config.use_merged, 
do_validation=False, **self.hub_kwargs, # we hijack the model instantiation and use our random weights model @@ -261,17 +268,17 @@ def load_ortmodel_from_config(self, config: ORTConfig, tmpdirname: str) -> None: ) self.delete_pretrained_model() - LOGGER.info("\t+ Loading exported model in onnxruntime") + LOGGER.info("\t+ Loading exported model with ORTModel") self.pretrained_model = self.ortmodel_class.from_pretrained( model_id=f"{tmpdirname}/exported_model", - session_options=self.session_options, - use_io_binding=config.use_io_binding, - provider=config.provider, - provider_options=self.provider_options, + session_options=self.config.session_options, + use_io_binding=self.config.use_io_binding, + provider=self.config.provider, + provider_options=self.config.provider_options, **( { - "use_merged": config.use_merged, - "use_cache": config.use_cache, + "use_merged": self.config.use_merged, + "use_cache": self.config.use_cache, } if self.is_text_generation_model() else {} @@ -280,31 +287,36 @@ def load_ortmodel_from_config(self, config: ORTConfig, tmpdirname: str) -> None: **self.hub_kwargs, ) - if config.optimization: + if self.config.optimization: raise NotImplementedError( - "Only AutoOptimization is supported when loading a model with random weights" + "Only AutoOptimization is supported when " + "loading a model with random weights" ) - if config.quantization or config.auto_quantization is not None: - self.quantize(config, tmpdirname) + if self.config.quantization or self.config.auto_quantization is not None: + self.quantize(tmpdirname) - def load_ortmodel_from_pretrained(self, config: ORTConfig, tmpdirname: str) -> None: - if self.torch_dtype is not None and self.torch_dtype != torch.float32: + def load_ortmodel_from_pretrained(self, tmpdirname: str) -> None: + if ( + self.config.torch_dtype is not None + and self.config.torch_dtype != torch.float32 + ): raise NotImplementedError( - "Loading from pretrained is only supported with torch_dtype float32 for now" + "Loading with ORTModel is only supported " + "with torch_dtype float32 for now" ) self.pretrained_model = self.ortmodel_class.from_pretrained( model_id=self.model, - session_options=self.session_options, - use_io_binding=config.use_io_binding, - provider=config.provider, - provider_options=self.provider_options, - export=config.export, + session_options=self.config.session_options, + use_io_binding=self.config.use_io_binding, + provider=self.config.provider, + provider_options=self.config.provider_options, + export=self.config.export, **( { - "use_merged": config.use_merged, - "use_cache": config.use_cache, + "use_merged": self.config.use_merged, + "use_cache": self.config.use_cache, } if self.is_text_generation_model() else {} @@ -312,28 +324,28 @@ def load_ortmodel_from_pretrained(self, config: ORTConfig, tmpdirname: str) -> N **self.hub_kwargs, ) - if config.optimization or config.auto_optimization is not None: - self.optimize(config, tmpdirname) + if self.config.optimization or self.config.auto_optimization is not None: + self.optimize(tmpdirname) - if config.quantization or config.auto_quantization is not None: - self.quantize(config, tmpdirname) + if self.config.quantization or self.config.auto_quantization is not None: + self.quantize(tmpdirname) - def optimize(self, config: ORTConfig, tmpdirname: str) -> None: - if config.auto_optimization is not None: - LOGGER.info(f"\t+ Using auto optimization {config.auto_optimization}") + def optimize(self, tmpdirname: str) -> None: + if self.config.auto_optimization is not None: + 
LOGGER.info(f"\t+ Using auto optimization {self.config.auto_optimization}") optimization_dict = OmegaConf.to_container( - config.auto_optimization_config, resolve=True + self.config.auto_optimization_config, resolve=True ) LOGGER.info("\t+ Setting auto optimization parameters:") for key, value in optimization_dict.items(): # type: ignore LOGGER.info(f"\t\t+ {key}: {value}") optimization_config = AutoOptimizationConfig.with_optimization_level( - optimization_level=config.auto_optimization, **optimization_dict + optimization_level=self.config.auto_optimization, **optimization_dict ) else: optimization_dict = OmegaConf.to_container( - config.optimization_config, resolve=True + self.config.optimization_config, resolve=True ) LOGGER.info("\t+ Setting optimization parameters:") for key, value in optimization_dict.items(): # type: ignore @@ -351,32 +363,28 @@ def optimize(self, config: ORTConfig, tmpdirname: str) -> None: LOGGER.info("\t+ Loading optimized model") self.pretrained_model = self.ortmodel_class.from_pretrained( model_id=f"{tmpdirname}/optimized", - session_options=self.session_options, - use_io_binding=config.use_io_binding, - provider=config.provider, - provider_options=self.provider_options, + session_options=self.config.session_options, + use_io_binding=self.config.use_io_binding, + provider=self.config.provider, + provider_options=self.config.provider_options, ) - def quantize(self, config: ORTConfig, tmpdirname: str) -> None: - if config.auto_quantization is not None: - LOGGER.info( - f"\t+ Using auto quantization {config.auto_quantization} and its config" - ) + def quantize(self, tmpdirname: str) -> None: + if self.config.auto_quantization is not None: + LOGGER.info(f"\t+ Using auto quantization {self.config.auto_quantization}") auto_quantization_config_class = getattr( - AutoQuantizationConfig, config.auto_quantization + AutoQuantizationConfig, self.config.auto_quantization ) quantization_dict = OmegaConf.to_container( - config.auto_quantization_config, resolve=True + self.config.auto_quantization_config, resolve=True ) quantization_dict = format_ort_quantization_dict(quantization_dict) quantization_config = auto_quantization_config_class(**quantization_dict) else: - LOGGER.info("\t+ Using manual quantization and its config") - from optimum_benchmark.backends.utils import format_ort_quantization_dict - + LOGGER.info("\t+ Using manual quantization") quantization_dict = OmegaConf.to_container( - config.quantization_config, resolve=True + self.config.quantization_config, resolve=True ) quantization_dict = format_ort_quantization_dict(quantization_dict) quantization_config = QuantizationConfig(**quantization_dict) @@ -388,22 +396,26 @@ def quantize(self, config: ORTConfig, tmpdirname: str) -> None: LOGGER.info(f"\t+ Quantizing {component}") quantizer = ORTQuantizer.from_pretrained(model_dir, file_name=component) - if config.calibration: - preprocess_class = get_class(config.calibration_config.preprocess_class) + if self.config.calibration: + preprocess_class = get_class( + self.config.calibration_config.preprocess_class + ) preprocess_function = preprocess_class(model_name_or_path=self.model) calibration_dataset = quantizer.get_calibration_dataset( - dataset_name=config.calibration_config.dataset_name, - num_samples=config.calibration_config.num_samples, - dataset_config_name=config.calibration_config.dataset_config_name, - dataset_split=config.calibration_config.dataset_split, + dataset_name=self.config.calibration_config.dataset_name, + 
num_samples=self.config.calibration_config.num_samples, + dataset_config_name=self.config.calibration_config.dataset_config_name, + dataset_split=self.config.calibration_config.dataset_split, preprocess_function=preprocess_function, ) - # Create the calibration configuration containing the parameters related to calibration. + # Create the calibration configuration + # containing the parameters related to calibration. calibration_config = AutoCalibrationConfig.minmax(calibration_dataset) - # Perform the calibration step: computes the activations quantization ranges + # Perform the calibration step: + # computes the activations quantization ranges calibration_tensors_range = quantizer.fit( dataset=calibration_dataset, calibration_config=calibration_config, @@ -420,29 +432,27 @@ def quantize(self, config: ORTConfig, tmpdirname: str) -> None: LOGGER.info("\t+ Loading quantized model") self.pretrained_model = self.ortmodel_class.from_pretrained( model_id=f"{tmpdirname}/quantized", - session_options=self.session_options, - use_io_binding=config.use_io_binding, - provider=config.provider, - provider_options=self.provider_options, + session_options=self.config.session_options, + use_io_binding=self.config.use_io_binding, + provider=self.config.provider, + provider_options=self.config.provider_options, ) - def load_automodel_from_config(self, config: ORTConfig) -> None: - from accelerate import init_empty_weights - + def load_automodel_from_config(self) -> None: with init_empty_weights(): self.pretrained_model = self.automodel_class.from_config( config=self.pretrained_config, - torch_dtype=self.torch_dtype, + torch_dtype=self.config.torch_dtype, trust_remote_code=self.hub_kwargs.get("trust_remote_code", False), ) self.pretrained_model.to_empty(device=self.device) randomize_weights(self.pretrained_model) - def load_automodel_from_pretrained(self, config: ORTConfig) -> None: + def load_automodel_from_pretrained(self) -> None: with self.device: self.pretrained_model = self.automodel_class.from_pretrained( pretrained_model_name_or_path=self.model, - torch_dtype=self.torch_dtype, + torch_dtype=self.config.torch_dtype, **self.hub_kwargs, ) @@ -451,37 +461,45 @@ def prepare_for_profiling(self, input_names: List[str]) -> None: LOGGER.info("\t+ Wrapping model inside profiler") self.pretrained_model = ORTProfilingWrapper(self.pretrained_model) - def prepare_for_training( + def forward(self, input: Dict[str, Tensor], **kwargs) -> "ModelOutput": + output = self.pretrained_model(**input, **kwargs) + + return output + + def generate(self, input: Dict[str, Tensor], **kwargs) -> "ModelOutput": + output = self.pretrained_model.generate(**input, **kwargs) + return output + + def train( self, - training_dataset: Dataset, - training_data_collator: Callable, + training_dataset: "Dataset", training_arguments: Dict[str, Any], - ) -> None: - LOGGER.info("Preparing model for training") - LOGGER.info("\t+ Wrapping model inside trainer") - - from optimum.onnxruntime import ORTTrainer, ORTTrainingArguments + training_callbacks: List["TrainerCallback"], + training_data_collator: Callable, + ) -> "TrainerState": + LOGGER.info("\t+ Setting dataset format to `torch`.") + training_dataset.set_format( + type="torch", columns=list(training_dataset.features.keys()) + ) + LOGGER.info( + "\t+ Wrapping training arguments with " + "optimum.onnxruntime.ORTTrainingArguments" + ) training_arguments = ORTTrainingArguments(**training_arguments) - self.trainer = ORTTrainer( + + LOGGER.info("\t+ Wrapping model with 
optimum.onnxruntime.ORTTrainer") + trainer = ORTTrainer( model=self.pretrained_model, args=training_arguments, + callbacks=training_callbacks, train_dataset=training_dataset, data_collator=training_data_collator, - feature=self.task, ) - def forward(self, input: Dict[str, Tensor], **kwargs) -> Tensor: - output = self.pretrained_model(**input, **kwargs)[0] - - return output - - def generate(self, input: Dict[str, Tensor], **kwargs) -> Tensor: - output = self.pretrained_model.generate(**input, **kwargs)[0] - return output - - def train(self) -> None: - LOGGER.info("Training model") - results = self.trainer.train() + LOGGER.info("\t+ Starting training") + trainer.train() + LOGGER.info("\t+ Training finished successfully") + trainer_state = trainer.state - return results + return trainer_state diff --git a/optimum_benchmark/backends/openvino.py b/optimum_benchmark/backends/openvino.py index b60ef7f3a..6e83ed756 100644 --- a/optimum_benchmark/backends/openvino.py +++ b/optimum_benchmark/backends/openvino.py @@ -1,19 +1,28 @@ +from typing import Dict, Optional, Any, TYPE_CHECKING +from tempfile import TemporaryDirectory +from dataclasses import dataclass +from logging import getLogger + + import torch import inspect from torch import Tensor -from logging import getLogger -from omegaconf import DictConfig -from dataclasses import dataclass, field +from omegaconf import OmegaConf from hydra.utils import get_class -from typing import Dict, Optional -from tempfile import TemporaryDirectory +from optimum.intel.openvino.utils import _HEAD_TO_AUTOMODELS +from openvino.runtime import __version__ as openvino_version +from optimum.intel import OVConfig as OVQuantizationConfig, OVQuantizer + +if TYPE_CHECKING: + from transformers.modeling_outputs import ModelOutput -try: - from openvino.runtime import __version__ as openvino_version -except ImportError: - openvino_version = "Not installed" -from optimum_benchmark.backends.base import Backend, BackendConfig +from .base import Backend, BackendConfig +from .utils.openvino_utils import ( + DEFAULT_QUANTIZATION_CONFIG, + DEFAULT_CALIBRATION_CONFIG, +) + LOGGER = getLogger("openvino") @@ -31,40 +40,45 @@ class OVConfig(BackendConfig): torch_dtype: Optional[str] = None # compiling options - dynamic_shapes: bool = True reshape: bool = False half: bool = False # quantization options quantization: bool = False - quantization_config: Dict = field( - default_factory=lambda: { - "compression": None, - "input_info": None, - "save_onnx_model": False, - } - ) + quantization_config: Optional[Dict[str, Any]] = None # calibration options - calibration_config: Dict = field( - default_factory=lambda: { - "dataset_name": "glue", - "num_samples": 300, - "dataset_config_name": "sst2", - "dataset_split": "train", - "preprocess_batch": True, - "preprocess_class": "optimum_benchmark.preprocessors.glue.GluePreprocessor", - } - ) + calibration: bool = True + calibration_config: Optional[Dict[str, Any]] = None + + def __post_init__(self): + assert self.torch_dtype is None or self.torch_dtype == "float32", ( + "Only float32 is supported for torch_dtype in openvino backend. 
" + f"Got {self.torch_dtype}" + ) + + if self.quantization: + self.quantization_config = OmegaConf.merge( + self.quantization_config or {}, + DEFAULT_QUANTIZATION_CONFIG, + ) + + if self.calibration: + self.calibration_config = OmegaConf.merge( + self.calibration_config or {}, + DEFAULT_CALIBRATION_CONFIG, + ) class OVBackend(Backend): + name: str = "openvino" + config: OVConfig + def __init__( - self, model: str, task: str, device: str, hub_kwargs: DictConfig + self, model: str, task: str, device: str, hub_kwargs: Dict[str, Any] ) -> None: super().__init__(model, task, device, hub_kwargs) - - from optimum.intel.openvino.utils import _HEAD_TO_AUTOMODELS + self.device = torch.device(device) self.ovmodel_class = get_class( f"optimum.intel.openvino.{_HEAD_TO_AUTOMODELS[self.task]}" @@ -79,82 +93,73 @@ def configure(self, config: OVConfig) -> None: super().configure(config) # Set torch dtype - self.torch_dtype = ( - getattr(torch, config.torch_dtype) # in case of torch.dtype - if config.torch_dtype is not None and hasattr(torch, config.torch_dtype) - else None # in case of string or None - ) - LOGGER.info( - f"\t+ Using torch dtype({self.torch_dtype}) for weights loading and export" + self.config.torch_dtype = ( + getattr(torch, self.config.torch_dtype) + if self.config.torch_dtype is not None + else None ) + if self.config.quantization: + self.config.quantization_config = OVQuantizationConfig( + **self.config.quantization_config, + ) + with TemporaryDirectory() as tmpdirname: - if config.no_weights: + if self.config.no_weights: raise NotImplementedError( "no_weights is not supported for openvino backend" ) else: - self.load_model_from_pretrained(config) + self.load_model_from_pretrained() - if config.quantization: - self.quantize(config, tmpdirname) + if self.config.quantization: + self.quantize(tmpdirname) - self.reshape = config.reshape - if self.reshape: - LOGGER.info("\t+ Model input will be reshaped and compiled") - - self.half = config.half - if self.half: - LOGGER.info("\t+ Model will be converted to half precision and compiled") - - def load_model_from_pretrained(self, config: OVConfig) -> None: - if self.torch_dtype is not None and self.torch_dtype != torch.float32: - raise NotImplementedError( - "Loading from pretrained is only supported with torch_dtype float32 for now" - ) + def load_model_from_pretrained(self) -> None: self.pretrained_model = self.ovmodel_class.from_pretrained( model_id=self.model, - use_merged=config.use_merged, - export=config.export, + use_merged=self.config.use_merged, + export=self.config.export, **self.hub_kwargs, ) - def quantize(self, config: OVConfig, tmpdirname: str) -> None: + def quantize(self, tmpdirname: str) -> None: LOGGER.info("\t+ Attempting quantization") - from optimum.intel import OVConfig as OVQuantizationConfig, OVQuantizer - model = self.automodel_class.from_pretrained(self.model, **self.hub_kwargs) quantizer = OVQuantizer.from_pretrained(model) - quantization_config = OVQuantizationConfig( - **config.quantization_config, - ) - preprocess_class = get_class(config.calibration_config.preprocess_class) + preprocess_class = get_class(self.config.calibration_config.preprocess_class) preprocess_function = preprocess_class(model_name_or_path=self.model) calibration_dataset = quantizer.get_calibration_dataset( - dataset_name=config.calibration_config.dataset_name, - num_samples=config.calibration_config.num_samples, - dataset_config_name=config.calibration_config.dataset_config_name, - dataset_split=config.calibration_config.dataset_split, + 
dataset_name=self.config.calibration_config.dataset_name, + num_samples=self.config.calibration_config.num_samples, + dataset_config_name=self.config.calibration_config.dataset_config_name, + dataset_split=self.config.calibration_config.dataset_split, preprocess_function=preprocess_function, ) quantizer.quantize( - save_directory=f"{tmpdirname}/quantized", - quantization_config=quantization_config, calibration_dataset=calibration_dataset, + save_directory=f"{tmpdirname}/quantized", + quantization_config=self.config.quantization_config, + # defaults + batch_size=1, + data_collator=None, + remove_unused_columns=True, + weights_only=False, ) self.delete_pretrained_model() LOGGER.info("\t+ Loading quantized model") self.pretrained_model = self.ovmodel_class.from_pretrained( model_id=f"{tmpdirname}/quantized", + use_merged=self.config.use_merged, ) def prepare_for_inference(self, input_shapes: Dict[str, int]) -> None: - if self.reshape: + if self.config.reshape: static_shapes = { key: value for key, value in input_shapes.items() @@ -163,21 +168,21 @@ def prepare_for_inference(self, input_shapes: Dict[str, int]) -> None: LOGGER.info(f"\t+ Reshaping model with static shapes: {static_shapes}") self.pretrained_model.reshape(**static_shapes) - if self.half: - LOGGER.info(f"\t+ Converting model to half precision") + if self.config.half: + LOGGER.info("\t+ Converting model to half precision") self.pretrained_model.half() - if self.reshape or self.half: - LOGGER.info(f"\t+ Compiling model") + if self.config.reshape or self.config.half: + LOGGER.info("\t+ Compiling model") self.pretrained_model.compile() - def forward(self, input: Dict[str, Tensor], **kwargs) -> Tensor: - output = self.pretrained_model(**input, **kwargs)[0] + def forward(self, input: Dict[str, Tensor], **kwargs) -> "ModelOutput": + output = self.pretrained_model(**input, **kwargs) return output - def generate(self, input: Dict[str, Tensor], **kwargs) -> Tensor: - output = self.pretrained_model.generate(**input, **kwargs)[0] + def generate(self, input: Dict[str, Tensor], **kwargs) -> "ModelOutput": + output = self.pretrained_model.generate(**input, **kwargs) return output diff --git a/optimum_benchmark/backends/pytorch.py b/optimum_benchmark/backends/pytorch.py index 72877a5f9..b6c84f181 100644 --- a/optimum_benchmark/backends/pytorch.py +++ b/optimum_benchmark/backends/pytorch.py @@ -1,30 +1,38 @@ from typing import Any, Callable, Dict, List, Optional, TYPE_CHECKING -from omegaconf import DictConfig, OmegaConf -from dataclasses import dataclass, field +from dataclasses import dataclass from logging import getLogger -from datasets import Dataset -from torch import Tensor -import torch import os -import time +import gc -from torch.distributed.launcher.api import elastic_launch, LaunchConfig -from torch.distributed.elastic.multiprocessing import Std -import logging.config -from transformers.utils import ModelOutput -from transformers import Trainer, TrainingArguments, TrainerCallback +import torch +from torch import Tensor +from accelerate import init_empty_weights +from omegaconf import DictConfig, OmegaConf +from torch import __version__ as torch_version from transformers.utils.fx import symbolic_trace -from transformers.trainer_utils import TrainOutput +from transformers import Trainer, TrainingArguments from optimum.bettertransformer import BetterTransformer +from transformers import BitsAndBytesConfig, GPTQConfig +from torch.distributed.elastic.multiprocessing.errors import record +from torch.distributed.launcher.api import 
elastic_launch, LaunchConfig -from optimum_benchmark.backends.base import Backend, BackendConfig -from optimum_benchmark.profilers.fx_profiler import FXProfilingWrapper if TYPE_CHECKING: - from transformers import TrainerState, TrainerControl + from datasets import Dataset + from transformers.utils import ModelOutput + from transformers import TrainerState, TrainerCallback + + +from .base import Backend, BackendConfig +from ..profilers.fx_profiler import FXProfilingWrapper +from .utils.pytorch_utils import ( + DEFAULT_COMPILE_CONFIG, + DEFAULT_DDP_CONFIG, + randomize_weights, + get_worker_logger, +) -WARMUP_STEPS = 40 # bachend logger LOGGER = getLogger("pytorch") @@ -38,32 +46,25 @@ @dataclass class PyTorchConfig(BackendConfig): name: str = "pytorch" - version: str = torch.__version__ + version: str = torch_version _target_: str = "optimum_benchmark.backends.pytorch.PyTorchBackend" # load options no_weights: bool = False - torch_dtype: Optional[str] = None device_map: Optional[str] = None + torch_dtype: Optional[str] = None # quantization options - load_in_8bit: bool = False - load_in_4bit: bool = False + quantization_strategy: Optional[str] = None + quantization_config: Optional[Dict[str, Any]] = None # optimization options bettertransformer: bool = False # compilation options torch_compile: bool = False - torch_compile_config: Dict = field(default_factory=lambda: { - "fullgraph": False, - "dynamic": False, - "backend": "inductor", - "mode": None, - "options": None, - "disable": False, - } - ) + torch_compile_kwargs: Optional[Dict] = None + # amp options amp_autocast: bool = False amp_dtype: Optional[str] = None @@ -72,10 +73,88 @@ class PyTorchConfig(BackendConfig): disable_grad: bool = "${is_inference:${benchmark.name}}" # type: ignore eval_mode: bool = "${is_inference:${benchmark.name}}" # type: ignore + # training options + use_ddp: bool = False + ddp_config: Optional[Dict[str, Any]] = None + + def __post_init__(self): + """ + Here we perform checks and transformations on the config. + But we never modify the types of the config values. + """ + + CUDA_VISIBLE_DEVICES = os.environ.get("CUDA_VISIBLE_DEVICES", None) + + if self.torch_compile: + self.torch_compile_kwargs = OmegaConf.merge( + self.torch_compile_kwargs + if self.torch_compile_kwargs is not None + else {}, + DEFAULT_COMPILE_CONFIG, + ) + + if self.device_map is not None: + assert self.device_map in ["auto", "sequential"], ( + "`device_map` must be one of ['auto', 'sequential']. " + "are supported in Optimum-Bnechmark. " + f"Got {type(self.device_map)} instead." + ) + assert ( + CUDA_VISIBLE_DEVICES is not None + ), "`device_map` can only be used when CUDA_VISIBLE_DEVICES is set." + + if self.torch_dtype is not None: + assert self.torch_dtype in ["bfloat16", "float16", "float32", "auto"], ( + "`torch_dtype` must be one of ['bfloat16', 'float16', 'float32', " + f"'auto']. Got {self.torch_dtype} instead." + ) + + if self.amp_dtype is not None: + assert self.amp_dtype in ["bfloat16", "float16", "float32"], ( + "`amp_dtype` must be one of ['bfloat16', 'float16', 'float32']. " + f"Got {self.amp_dtype} instead." + ) + + if self.quantization_strategy is not None: + assert self.quantization_strategy in ["bnb", "gptq"], ( + "`quantization_strategy` must be one of ['bnb', 'gptq']. " + f"Got {self.quantization_strategy} instead." 
+ ) + if self.quantization_strategy == "gptq": + bits = self.quantization_config.get("bits", None) + assert bits is not None, ( + "`quantization_config.bits` must be provided " + "when using 'gptq' quantization strategy." + ) + else: + self.quantization_config = None + + if self.use_ddp: + self.ddp_config = OmegaConf.merge( + self.ddp_config if self.ddp_config is not None else {}, + DEFAULT_DDP_CONFIG, + ) + + # TODO: support multi-node training. + assert self.ddp_config.max_nodes == 1, ( + "Currently, PyTorch DDP training benchmark " + "only supports training on a single node." + ) + + assert ( + CUDA_VISIBLE_DEVICES is not None + ), "Pytorch DDP training benchmark requires CUDA_VISIBLE_DEVICES to be set." + else: + self.ddp_config = None + class PyTorchBackend(Backend): + name: str = "pytorch" + config: PyTorchConfig + def __init__(self, model: str, task: str, device: str, hub_kwargs: DictConfig): super().__init__(model, task, device, hub_kwargs) + self.device = torch.device(device) LOGGER.info( f"\t+ Infered AutoModel class {self.automodel_class.__name__} " @@ -86,94 +165,132 @@ def configure(self, config: PyTorchConfig) -> None: super().configure(config) # environment options - if config.inter_op_num_threads is not None: + if self.config.inter_op_num_threads is not None: LOGGER.info( - f"\t+ Setting pytorch inter_op_num_threads({config.inter_op_num_threads}))" + "\t+ Setting pytorch " + f"inter_op_num_threads({self.config.inter_op_num_threads}))" ) - torch.set_num_threads(config.inter_op_num_threads) - - if config.intra_op_num_threads is not None: + torch.set_num_threads(self.config.inter_op_num_threads) + if self.config.intra_op_num_threads is not None: LOGGER.info( - f"\t+ Setting pytorch intra_op_num_threads({config.intra_op_num_threads}))" + "\t+ Setting pytorch " + f"intra_op_num_threads({self.config.intra_op_num_threads}))" ) - torch.set_num_interop_threads(config.intra_op_num_threads) + torch.set_num_interop_threads(self.config.intra_op_num_threads) + + # Load config + if self.config.torch_dtype is not None: + if hasattr(torch, self.config.torch_dtype): + self.config.torch_dtype = getattr(torch, self.config.torch_dtype) - # Disable gradients - if config.disable_grad: + # Inference config + if self.config.disable_grad: LOGGER.info("\t+ Disabling gradients") # everything that comes after this will have its gradients disabled torch.set_grad_enabled(False) - - # Set torch dtype - self.torch_dtype = ( - getattr(torch, config.torch_dtype) # in case of torch.dtype - if config.torch_dtype is not None and hasattr(torch, config.torch_dtype) - else config.torch_dtype # in case of string or None - ) + if self.config.amp_dtype is not None: + if hasattr(torch, self.config.amp_dtype): + self.config.amp_dtype = getattr(torch, self.config.amp_dtype) + + # Quantization config + if self.config.quantization_strategy is not None: + if self.config.quantization_strategy == "gptq": + self.config.quantization_config = GPTQConfig( + **self.config.quantization_config + ) + elif self.config.quantization_strategy == "bnb": + self.config.quantization_config = BitsAndBytesConfig( + **self.config.quantization_config + ) # Load model - if config.no_weights: - self.load_model_from_config(config) + if self.config.no_weights: + self.load_model_from_config() else: - self.load_model_from_pretrained(config) + self.load_model_from_pretrained() # Turn on eval mode - if config.eval_mode and self.task not in [ - "stable-diffusion", - "stable-diffusion-xl", - ]: + if not self.is_diffusion_pipeline() and 
self.config.eval_mode:
             LOGGER.info("\t+ Turning on eval mode")
             self.pretrained_model.eval()
 
-        # Turn on better transformer inference
-        if config.bettertransformer:
+        # Turn on BetterTransformer optimizations
+        if self.config.bettertransformer:
             LOGGER.info("\t+ Using optimum.bettertransformer")
-            self.pretrained_model = BetterTransformer.transform(  # type: ignore
-                self.pretrained_model, keep_original_model=False
+            self.pretrained_model = BetterTransformer.transform(
+                self.pretrained_model,
+                keep_original_model=False,
             )
 
         # Compile model
-        if config.torch_compile:
-            LOGGER.info("\t+ Using torch.compile on forward pass")
-            self.pretrained_model.forward = torch.compile(
-                self.pretrained_model.forward,
-                **config.torch_compile_config,
-            )
+        if self.config.torch_compile:
+            if self.is_diffusion_pipeline():
+                LOGGER.info("\t+ Using torch.compile on unet forward pass")
+                self.pretrained_model.unet = torch.compile(
+                    self.pretrained_model.unet,
+                    **self.config.torch_compile_kwargs,
+                )
+            else:
+                LOGGER.info("\t+ Using torch.compile on forward pass")
+                self.pretrained_model.forward = torch.compile(
+                    self.pretrained_model.forward,
+                    **self.config.torch_compile_kwargs,
+                )
 
-        # pytorch autocast
-        if config.amp_autocast:
-            LOGGER.info(
-                f"\t+ Enabling Automatic Mixed Precision with dtype: {self.amp_dtype}"
-            )
-        self.amp_autocast = config.amp_autocast
-        self.amp_dtype = (
-            getattr(torch, config.amp_dtype)  # in case of torch.dtype
-            if config.amp_dtype is not None and hasattr(torch, config.amp_dtype)
-            else None
-        )
+        # DDP config
+        if self.config.use_ddp:
+            self.config.ddp_config = LaunchConfig(**self.config.ddp_config)
 
-    def load_model_from_config(self, config: PyTorchConfig) -> None:
-        LOGGER.info(
-            f"\t+ Loading model from config in dtype : "
-            f"{config.torch_dtype if config.torch_dtype is not None else 'default'} "
-            "on meta device"
-        )
+    def load_model_from_pretrained(self) -> None:
+        LOGGER.info(f"\t+ Loading pretrained model weights on device: {self.device}")
+        if self.is_diffusion_pipeline():
+            self.pretrained_model = self.automodel_class.from_pretrained(
+                pretrained_model_name_or_path=self.model,
+                torch_dtype=self.config.torch_dtype,
+                device_map=self.config.device_map,
+                **self.hub_kwargs,
+            )
+            if self.config.device_map is None:
+                # Diffusers does not support device_map being a torch.device,
+                # thus if not provided we move to device here.
+                self.pretrained_model.to(self.device)
+        else:
+            if self.config.device_map is not None:
+                self.pretrained_model = self.automodel_class.from_pretrained(
+                    pretrained_model_name_or_path=self.model,
+                    quantization_config=self.config.quantization_config,
+                    torch_dtype=self.config.torch_dtype,
+                    device_map=self.config.device_map,
+                    **self.hub_kwargs,
+                )
+            else:
+                with self.device:
+                    self.pretrained_model = self.automodel_class.from_pretrained(
+                        pretrained_model_name_or_path=self.model,
+                        quantization_config=self.config.quantization_config,
+                        torch_dtype=self.config.torch_dtype,
+                        **self.hub_kwargs,
+                    )
 
-        from accelerate import init_empty_weights
-        from optimum_benchmark.backends.utils import (
-            randomize_weights,
-            quantize_dummy_model,
-        )
+    def load_model_from_config(self) -> None:
+        # TODO: create no_weights tests
 
         LOGGER.info("\t+ Initializing empty weights model on device: meta")
         with init_empty_weights():
             self.pretrained_model = self.automodel_class.from_config(
                 config=self.pretrained_config,
-                torch_dtype=self.torch_dtype,
+                torch_dtype=self.config.torch_dtype,
                 trust_remote_code=self.hub_kwargs.get("trust_remote_code", False),
             )
 
-        if config.load_in_8bit or config.load_in_4bit:
+        if self.config.quantization_strategy is None:
+            LOGGER.info(f"\t+ Materializing model on device: {self.device}")
+            self.pretrained_model.to_empty(device=self.device)
+
+            LOGGER.info("\t+ Randomizing model weights")
+            randomize_weights(self.pretrained_model)
+            self.pretrained_model.tie_weights()
+        else:
             LOGGER.info("\t+ Materializing model on device: cpu")
             self.pretrained_model.to_empty(device="cpu")
 
@@ -181,84 +298,39 @@ def load_model_from_config(self, config: PyTorchConfig) -> None:
             randomize_weights(self.pretrained_model)
             self.pretrained_model.tie_weights()
 
-            from accelerate.utils import BnbQuantizationConfig
+            if self.config.quantization_strategy == "bnb":
+                # already converted to a BitsAndBytesConfig in configure()
+                quantization_config = self.config.quantization_config
+            elif self.config.quantization_strategy == "gptq":
+                raise NotImplementedError(
+                    "GPTQ requires a pretrained model to be loaded. "
+                    "`no_weights` option is not supported with GPTQ."
+ ) + + from accelerate.utils import BnbQuantizationConfig, load_and_quantize_model + # translating transformers bnb config to accelerate bnb config bnb_quantization_config = BnbQuantizationConfig( - load_in_4bit=config.load_in_4bit, - load_in_8bit=config.load_in_8bit, + load_in_4bit=quantization_config.load_in_4bit, + load_in_8bit=quantization_config.load_in_8bit, + # with dummy_weights, we set this to 0 for reproducibility llm_int8_threshold=0, - torch_dtype=self.torch_dtype, + torch_dtype=self.config.torch_dtype, keep_in_fp32_modules=self.pretrained_model.keep_in_fp32_modules if hasattr(self.pretrained_model, "keep_in_fp32_modules") else None, ) - LOGGER.info("\t+ Quantizing model while on device: cpu") - self.pretrained_model = quantize_dummy_model( + LOGGER.info("\t+ Quantizing model while on cpu and dispatching to device") + self.pretrained_model = load_and_quantize_model( model=self.pretrained_model, bnb_quantization_config=bnb_quantization_config, + device_map=self.config.device_map + if self.config.device_map is not None + else self.device, ) - LOGGER.info(f"\t+ Moving model to device: {self.device}") - self.pretrained_model.to(self.device) - self.pretrained_model.tie_weights() - - else: - LOGGER.info(f"\t+ Materializing model on device: {self.device}") - self.pretrained_model.to_empty(device=self.device) - - LOGGER.info("\t+ Randomizing model weights") - randomize_weights(self.pretrained_model) - self.pretrained_model.tie_weights() - - def load_model_from_pretrained(self, config: PyTorchConfig) -> None: - LOGGER.info( - f"\t+ Loading pretrained model weights in dtype: {config.torch_dtype} on device: {self.device}" - ) - if self.task not in ["stable-diffusion", "stable-diffusion-xl"]: - kwargs = {} - if config.load_in_8bit: - kwargs["load_in_8bit"] = config.load_in_8bit - kwargs["llm_int8_threshold"] = 0 - elif config.load_in_4bit: - kwargs["load_in_4bit"] = config.load_in_4bit - - if config.device_map: - kwargs["device_map"] = config.device_map if config.device_map is not None else self.device - - self.pretrained_model = self.automodel_class.from_pretrained( - pretrained_model_name_or_path=self.model, - torch_dtype=self.torch_dtype, - **kwargs, - **self.hub_kwargs, - ) - else: - # When a device_map is not specified, we do not rely on accelerate to load the load and rather try PyTorch-native context. - with self.device: - self.pretrained_model = self.automodel_class.from_pretrained( - pretrained_model_name_or_path=self.model, - torch_dtype=self.torch_dtype, - **kwargs, - **self.hub_kwargs, - ) - else: - self.pretrained_model = self.automodel_class.from_pretrained( - pretrained_model_name_or_path=self.model, - torch_dtype=self.torch_dtype, - device_map=config.device_map, - **self.hub_kwargs, - ) - if config.device_map is None: - # Diffusers does not support device_map being a torch.device, thus if not provided, move to device here. 
- self.pretrained_model.to(self.device) - - def prepare_for_profiling( - self, - input_names: List[str], - input_shapes: Dict[str, int], - ) -> None: + def prepare_for_profiling(self, input_names: List[str]) -> None: LOGGER.info("Preparing model for profiling") - LOGGER.info("\t+ Symbolicly tracing model") self.pretrained_model = symbolic_trace( model=self.pretrained_model, @@ -268,128 +340,75 @@ def prepare_for_profiling( LOGGER.info("\t+ Wrapping model with FXProfilingWrapper") self.pretrained_model = FXProfilingWrapper(self.pretrained_model) - def forward(self, input: Dict[str, Tensor], **kwargs) -> ModelOutput: + def forward(self, input: Dict[str, Tensor], **kwargs) -> "ModelOutput": with torch.autocast( + enabled=self.config.amp_autocast, device_type=self.device.type, - dtype=self.amp_dtype, - enabled=self.amp_autocast, + dtype=self.config.amp_dtype, ): output = self.pretrained_model(**input, **kwargs) return output - def generate(self, input: Dict[str, Tensor], **kwargs) -> ModelOutput: + def generate(self, input: Dict[str, Tensor], **kwargs) -> "ModelOutput": with torch.autocast( + enabled=self.config.amp_autocast, device_type=self.device.type, - dtype=self.amp_dtype, - enabled=self.amp_autocast, + dtype=self.config.amp_dtype, ): output = self.pretrained_model.generate(**input, **kwargs) return output - def train(self) -> None: - raise Exception("For PyTorch backend training, please call backend.run_pytorch_training.") - - def run_pytorch_training(self, training_config, training_arguments, training_dataset, training_data_collator): - LOGGER.info("Running training benchmark") - - # Converting from DictConfig to Dict is required to avoid a warning with DDP: - # `[W CudaIPCTypes.cpp:15] Producer process has been terminated before all shared CUDA tensors released. See Note [Sharing CUDA tensors]` - training_arguments_dict = OmegaConf.to_container(training_arguments, resolve=True) - - if training_config.use_ddp: - # TODO: support multi-node training. Hydra is probably not the good infra for that though. - if training_config.ddp_config.max_nodes != 1: - raise ValueError("PyTorch DDP training benchmark currently supports only training on a single node.") - - launch_config = LaunchConfig(**training_config.ddp_config) - LOGGER.info(f"PyTorch DDP launch config: {launch_config}") - - # TODO: The backend instance can not be passed here (cannot pickle 'weakref' object) so the nn.Module is passed directly. - # It is not clear who is using weakref though. + @record + def train( + self, + training_dataset: "Dataset", + training_arguments: Dict[str, Any], + training_callbacks: List["TrainerCallback"], + training_data_collator: Callable, + ) -> "TrainerState": + args = ( + self.config.use_ddp, + self.pretrained_model, + training_dataset, + training_arguments, + training_callbacks, + training_data_collator, + ) + + if self.config.use_ddp: + # For DDP, we log only the stats from the first rank as transformers does. + # It could make sense to log for all ranks. results = elastic_launch( - config=launch_config, - entrypoint=ddp_callable, - )((self.pretrained_model, training_dataset, training_arguments_dict, training_data_collator, True)) - - # For DDP, we log only the stats from the first rank as transformers does. It could make sense to log for all ranks. - results = results[0] + config=self.config.ddp_config, + entrypoint=training_worker, + )(args)[0] else: - # For simple Data Parallel, we can still use ddp_callable, simply not wrapped by the elastic_launch class. 
- results = ddp_callable((self.pretrained_model, training_dataset, training_arguments_dict, training_data_collator, False)) - + # For DP, we can still use training_worker, + # simply not wrapped by the elastic_launch class. + results = training_worker(args) + return results + def clean(self) -> None: + super().clean() -def get_logger(name: Optional[str] = None, log_all: bool = False): - """ - PyTorch DDP subprocesses do not inherit from Hydra logger. Thus, we need to reconfigure the logger for the workers. - """ - if os.environ["RANK"] == "0" or log_all: - # TODO: also configure logging for other ranks - hydra_conf = OmegaConf.load('.hydra/hydra.yaml') - logging.config.dictConfig(OmegaConf.to_container(hydra_conf.hydra.job_logging, resolve=True)) - return getLogger(name) - -# Adapted from transformers.trainer_utils.speed_metrics -def speed_metrics(trainer): - """ - Measure and return speed performance metrics. - """ - # Reference: https://github.com/huggingface/transformers/blob/v4.31.0/src/transformers/trainer.py#L1559 - total_train_batch_size = trainer._train_batch_size * trainer.args.gradient_accumulation_steps * trainer.args.world_size - result = {} - - # Warmup metrics. - num_warmup_steps = WARMUP_STEPS - num_warmup_samples = num_warmup_steps * total_train_batch_size - warmup_runtime = trainer.state.warmup_end - trainer.state.warmup_start - - warmup_samples_per_second = num_warmup_samples / warmup_runtime - result["warmup_runtime"] = warmup_runtime - result["warmup_samples_per_second"] = round(warmup_samples_per_second, 3) - warmup_steps_per_second = num_warmup_steps / warmup_runtime - result["warmup_steps_per_second"] = round(warmup_steps_per_second, 3) - - # Training metrics. - num_train_steps = trainer.state.max_steps - WARMUP_STEPS - num_train_samples = num_train_steps * total_train_batch_size - train_runtime = trainer.state.training_end - trainer.state.training_start - - train_samples_per_second = num_train_samples / train_runtime - result["train_runtime"] = train_runtime - result["train_samples_per_second"] = round(train_samples_per_second, 3) - train_steps_per_second = num_train_steps / train_runtime - result["train_steps_per_second"] = round(train_steps_per_second, 3) - - return result - -class MeasurementCallback(TrainerCallback): - def on_step_begin(self, args: TrainingArguments, state: "TrainerState", control: "TrainerControl", **kwargs): - if state.global_step == 0: - # This check is here because max_steps is set only once the training is launched, thus we can not check before calling trainer.train(). - if state.max_steps <= WARMUP_STEPS: - raise ValueError(f"Total training steps {state.max_steps} is smaller than the number of warmup steps {WARMUP_STEPS}. 
Please increase the total number of steps (for example by increasing the dataset size).") - - state.warmup_start = time.time_ns() * 1e-9 - elif state.global_step == WARMUP_STEPS: - state.warmup_end = time.time_ns() * 1e-9 - state.training_start = time.time_ns() * 1e-9 - elif state.global_step == state.max_steps - 1: - state.training_end = time.time_ns() * 1e-9 - elif state.global_step > state.max_steps - 1: - raise ValueError("global_step > state.max_steps - 1") - -def ddp_callable(args): - pretrained_model = args[0] - training_dataset = args[1] - training_arguments = args[2] - training_data_collator = args[3] - use_ddp = args[4] + if self.device.type == "cuda": + torch.cuda.empty_cache() + gc.collect() + + +def training_worker(args) -> "TrainerState": + use_ddp = args[0] + pretrained_model = args[1] + training_dataset = args[2] + training_arguments = args[3] + training_callbacks = args[4] + training_data_collator = args[5] if use_ddp: - LOGGER_WORKER = get_logger("training-ddp-worker", log_all=False) + LOGGER_WORKER = get_worker_logger("pytorch-ddp-worker", log_all=False) env_variables = [ "RANK", @@ -398,24 +417,35 @@ def ddp_callable(args): "MASTER_PORT", "TORCHELASTIC_MAX_RESTARTS", ] + + LOGGER_WORKER.info("Initializing DDP worker") for env_var in env_variables: LOGGER_WORKER.info(f"{env_var}: {os.environ.get(env_var)}") else: LOGGER_WORKER = LOGGER - LOGGER_WORKER.info("\t+ Wrapping model with transformers.Trainer") + LOGGER_WORKER.info("\t+ Setting dataset format to `torch`.") + training_dataset.set_format( + type="torch", columns=list(training_dataset.features.keys()) + ) + + LOGGER_WORKER.info( + "\t+ Wrapping training arguments with transformers.TrainingArguments" + ) training_arguments = TrainingArguments(**training_arguments) + LOGGER_WORKER.info("\t+ Wrapping model with transformers.Trainer") trainer = Trainer( model=pretrained_model, + args=training_arguments, + callbacks=training_callbacks, train_dataset=training_dataset, data_collator=training_data_collator, - args=training_arguments, - callbacks=[MeasurementCallback] ) - - LOGGER_WORKER.info("Training model") + + LOGGER_WORKER.info("\t+ Starting training") trainer.train() - results = speed_metrics(trainer) + LOGGER_WORKER.info("\t+ Training finished successfully") + trainer_state = trainer.state - return results + return trainer_state diff --git a/optimum_benchmark/backends/utils/__init__.py b/optimum_benchmark/backends/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/optimum_benchmark/backends/utils/base_utils.py b/optimum_benchmark/backends/utils/base_utils.py new file mode 100644 index 000000000..7f357be9d --- /dev/null +++ b/optimum_benchmark/backends/utils/base_utils.py @@ -0,0 +1,92 @@ +from typing import Any, Dict, Optional, Union + +from diffusers import DiffusionPipeline +from transformers import ( + ProcessorMixin, + PretrainedConfig, + PreTrainedTokenizer, + ImageProcessingMixin, + FeatureExtractionMixin, +) + + +PreTrainedProcessor = Union[ + PreTrainedTokenizer, + ImageProcessingMixin, + FeatureExtractionMixin, + ProcessorMixin, +] + + +def extract_shapes_from_diffusion_pipeline( + pipeline: DiffusionPipeline, +) -> Dict[str, Any]: + # this is the only way I found to extract a diffusion pipeline's "input" shapes + shapes = {} + if hasattr(pipeline, "vae_encoder") and hasattr(pipeline.vae_encoder, "config"): + shapes["num_channels"] = pipeline.vae_encoder.config["out_channels"] + shapes["height"] = pipeline.vae_encoder.config["sample_size"] + shapes["width"] = 
pipeline.vae_encoder.config["sample_size"] + elif hasattr(pipeline, "vae") and hasattr(pipeline.vae, "config"): + shapes["num_channels"] = pipeline.vae.config.out_channels + shapes["height"] = pipeline.vae.config.sample_size + shapes["width"] = pipeline.vae.config.sample_size + else: + shapes["num_channels"] = -1 + shapes["height"] = -1 + shapes["width"] = -1 + + return shapes + + +def extract_shapes_from_model_artifacts( + config: PretrainedConfig, + processor: Optional[PreTrainedProcessor] = None, +) -> Dict[str, Any]: + shapes = {} + artifacts_dict = {} + + config_dict = {k: v for k, v in config.to_dict().items() if v is not None} + artifacts_dict.update(config_dict) + + if processor is not None and hasattr(processor, "to_dict"): + processor_dict = {k: v for k, v in processor.to_dict().items() if v is not None} + artifacts_dict.update(processor_dict) + + # text input + shapes["vocab_size"] = artifacts_dict.get("vocab_size", 2) + shapes["type_vocab_size"] = artifacts_dict.get("type_vocab_size", 2) + + # image input + shapes["num_channels"] = artifacts_dict.get("num_channels", None) + + image_size = artifacts_dict.get("image_size", None) + if image_size is None: + # processors have different names for the image size + image_size = artifacts_dict.get("size", None) + + if isinstance(image_size, (int, float)): + shapes["height"] = image_size + shapes["width"] = image_size + elif isinstance(image_size, (list, tuple)): + shapes["height"] = image_size[0] + shapes["width"] = image_size[0] + elif isinstance(image_size, dict) and len(image_size) == 2: + shapes["height"] = list(image_size.values())[0] + shapes["width"] = list(image_size.values())[1] + elif isinstance(image_size, dict) and len(image_size) == 1: + shapes["height"] = list(image_size.values())[0] + shapes["width"] = list(image_size.values())[0] + else: + shapes["height"] = None + shapes["width"] = None + + # classification labels (default to 2) + shapes["num_labels"] = len( + artifacts_dict.get("id2label", {"0": "LABEL_0", "1": "LABEL_1"}) + ) + + # object detection labels (default to 2) + shapes["num_queries"] = artifacts_dict.get("num_queries", 2) + + return shapes diff --git a/optimum_benchmark/backends/utils/neural_compressor_utils.py b/optimum_benchmark/backends/utils/neural_compressor_utils.py new file mode 100644 index 000000000..96632df48 --- /dev/null +++ b/optimum_benchmark/backends/utils/neural_compressor_utils.py @@ -0,0 +1,39 @@ +DEFAULT_QUANTIZATION_CONFIG = { + "device": "cpu", + "backend": "default", + "domain": "auto", + "recipes": {}, + "quant_format": "default", + "inputs": [], + "outputs": [], + "approach": "static", + "calibration_sampling_size": [100], + "op_type_dict": None, + "op_name_dict": None, + "reduce_range": None, + "example_inputs": None, + "excluded_precisions": [], + "quant_level": "auto", + "accuracy_criterion": { + "higher_is_better": True, + "criterion": "relative", + "tolerable_loss": 0.01, + }, + "tuning_criterion": { + "strategy": "basic", + "strategy_kwargs": None, + "timeout": 0, + "max_trials": 100, + "objective": "performance", + }, + "diagnosis": False, +} + +DEFAULT_CALIBRATION_CONFIG = { + "dataset_name": "glue", + "num_samples": 300, + "dataset_config_name": "sst2", + "dataset_split": "train", + "preprocess_batch": True, + "preprocess_class": "optimum_benchmark.preprocessors.glue.GluePreprocessor", +} diff --git a/optimum_benchmark/backends/utils/onnxruntime_utils.py b/optimum_benchmark/backends/utils/onnxruntime_utils.py new file mode 100644 index 000000000..65568458a --- /dev/null +++ 
b/optimum_benchmark/backends/utils/onnxruntime_utils.py @@ -0,0 +1,94 @@ +from typing import Any, Dict + + +DEFAULT_OPTIMIZATION_CONFIG = { + "optimization_level": 1, # 0, 1, 2, 99 + "optimize_for_gpu": "${is_gpu:${device}}", + "fp16": False, + "enable_transformers_specific_optimizations": True, + "enable_gelu_approximation": False, + "disable_gelu_fusion": False, + "disable_layer_norm_fusion": False, + "disable_attention_fusion": False, + "disable_skip_layer_norm_fusion": True, + "disable_bias_skip_layer_norm_fusion": False, + "disable_bias_gelu_fusion": False, + "use_mask_index": False, + "no_attention_mask": False, + "disable_embed_layer_norm_fusion": True, + "disable_shape_inference": False, + "use_multi_head_attention": False, + "enable_gemm_fast_gelu_fusion": False, + "use_raw_attention_mask": False, + "disable_group_norm_fusion": True, + "disable_packed_kv": True, +} + +DEFAULT_QUANTIZATION_CONFIG = { + "is_static": False, + "format": "QOperator", # QOperator, QDQ + "mode": "IntegerOps", # QLinearOps, IntegerOps + "activations_dtype": "QUInt8", # QInt8, QUInt8 + "activations_symmetric": False, + "weights_dtype": "QInt8", # QInt8, QUInt8 + "weights_symmetric": True, + "per_channel": False, + "reduce_range": False, + "operators_to_quantize": [ + "MatMul", + "Add", + ], +} + +DEFAULT_CALIBRATION_CONFIG = { + "dataset_name": "glue", + "num_samples": 300, + "dataset_config_name": "sst2", + "dataset_split": "train", + "preprocess_batch": True, + "preprocess_class": "optimum_benchmark.preprocessors.glue.GluePreprocessor", +} + + +def infer_device_id(device: str) -> int: + """ + Infer the device id from the given device string. + """ + + import torch + + if device == "cuda": + return torch.cuda.current_device() + elif torch.device(device).type == "cuda": + return torch.device(device).index + elif torch.device(device).type == "cpu": + return -1 + else: + raise ValueError(f"Unknown device '{device}'") + + +def format_ort_quantization_dict(quantization_dict: Dict[str, Any]) -> None: + """ + Format the quantization dictionary for onnxruntime. 
+ """ + + from onnxruntime.quantization import QuantFormat, QuantizationMode, QuantType + + if quantization_dict.get("format", None) is not None: + quantization_dict["format"] = QuantFormat.from_string( + quantization_dict["format"] + ) + if quantization_dict.get("mode", None) is not None: + quantization_dict["mode"] = QuantizationMode.from_string( + quantization_dict["mode"] + ) + if quantization_dict.get("activations_dtype", None) is not None: + quantization_dict["activations_dtype"] = QuantType.from_string( + quantization_dict["activations_dtype"] + ) + if quantization_dict.get("weights_dtype", None) is not None: + quantization_dict["weights_dtype"] = QuantType.from_string( + quantization_dict["weights_dtype"] + ) + + return quantization_dict diff --git a/optimum_benchmark/backends/utils/openvino_utils.py b/optimum_benchmark/backends/utils/openvino_utils.py new file mode 100644 index 000000000..0f1037b77 --- /dev/null +++ b/optimum_benchmark/backends/utils/openvino_utils.py @@ -0,0 +1,14 @@ +DEFAULT_QUANTIZATION_CONFIG = { + "compression": None, + "input_info": None, + "save_onnx_model": False, +} + +DEFAULT_CALIBRATION_CONFIG = { + "dataset_name": "glue", + "num_samples": 300, + "dataset_config_name": "sst2", + "dataset_split": "train", + "preprocess_batch": True, + "preprocess_class": "optimum_benchmark.preprocessors.glue.GluePreprocessor", +} diff --git a/optimum_benchmark/backends/utils.py b/optimum_benchmark/backends/utils/optimum_utils.py similarity index 57% rename from optimum_benchmark/backends/utils.py rename to optimum_benchmark/backends/utils/optimum_utils.py index 991d35f29..a558f1659 100644 --- a/optimum_benchmark/backends/utils.py +++ b/optimum_benchmark/backends/utils/optimum_utils.py @@ -1,277 +1,35 @@ -from typing import Any, Callable, Dict, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Union from pathlib import Path import os import torch -from optimum.exporters import TasksManager -from optimum.onnxruntime import ORTOptimizer -from optimum.utils import DEFAULT_DUMMY_SHAPES -from transformers.utils import is_torch_available -from optimum.exporters.onnx.base import OnnxConfig -from optimum.utils.save_utils import maybe_save_preprocessors -from optimum.exporters.onnx.constants import UNPICKABLE_ARCHS -from optimum.utils import DEFAULT_DUMMY_SHAPES, ONNX_WEIGHTS_NAME -from optimum.onnxruntime.configuration import AutoOptimizationConfig -from transformers import AutoTokenizer, PreTrainedModel, PretrainedConfig -from requests.exceptions import ConnectionError as RequestsConnectionError -from optimum.exporters.error_utils import AtolError, OutputMatchError, ShapeError -from optimum.exporters.onnx.convert import export_models, validate_models_outputs -from optimum.exporters.onnx.__main__ import logger, _get_submodels_and_onnx_configs -from optimum.exporters.onnx import ( - get_encoder_decoder_models_for_export, - get_decoder_models_for_export, +from optimum.exporters.onnx.__main__ import ( + logger, + TasksManager, OnnxConfigWithPast, + _get_submodels_and_onnx_configs, + maybe_save_preprocessors, + validate_models_outputs, + is_torch_available, export_models, + AutoTokenizer, + DEFAULT_DUMMY_SHAPES, + ONNX_WEIGHTS_NAME, + UNPICKABLE_ARCHS, + RequestsConnectionError, + OutputMatchError, + ShapeError, + AtolError, ) -def randomize_weights(model): - for param in model.parameters(): - if torch.cuda.is_available() and param.device.type == "cpu": - # we take advantage of the fact that a cuda device - # is available to use cuda kernels for 
randomization - # this is slower than asynchronous randomization while - # model is fully on gpu (because of data transfer) but - # faster than randomization while model is on cpu - param.data.cuda().normal_(mean=0.0, std=0.2).cpu() - else: - param.data.normal_(mean=0.0, std=0.2) - - -def format_ort_quantization_dict(quantization_dict: Dict[str, Any]) -> None: - from onnxruntime.quantization import ( - QuantFormat, - QuantizationMode, - QuantType, - ) - - if quantization_dict.get("format", None) is not None: - quantization_dict["format"] = QuantFormat.from_string( - quantization_dict["format"] - ) - if quantization_dict.get("mode", None) is not None: - quantization_dict["mode"] = QuantizationMode.from_string( - quantization_dict["mode"] - ) - if quantization_dict.get("activations_dtype", None) is not None: - quantization_dict["activations_dtype"] = QuantType.from_string( - quantization_dict["activations_dtype"] - ) - if quantization_dict.get("weights_dtype", None) is not None: - quantization_dict["weights_dtype"] = QuantType.from_string( - quantization_dict["weights_dtype"] - ) - - return quantization_dict - - -def quantize_dummy_model( - model, - bnb_quantization_config, -): - from accelerate.utils.bnb import ( - get_keys_to_not_convert, - replace_with_bnb_layers, - logger, - ) - - # We keep some modules such as the lm_head in their original dtype for numerical stability reasons - if bnb_quantization_config.skip_modules is None: - bnb_quantization_config.skip_modules = get_keys_to_not_convert(model) - - # add cpu modules to skip modules only for 4-bit modules - modules_to_not_convert = bnb_quantization_config.skip_modules - - # We add the modules we want to keep in full precision - if bnb_quantization_config.keep_in_fp32_modules is None: - bnb_quantization_config.keep_in_fp32_modules = [] - keep_in_fp32_modules = bnb_quantization_config.keep_in_fp32_modules - modules_to_not_convert.extend(keep_in_fp32_modules) - - # compatibility with peft - model.is_loaded_in_4bit = bnb_quantization_config.load_in_4bit - model.is_loaded_in_8bit = bnb_quantization_config.load_in_8bit - - # quantization of an already loaded model - logger.warning( - "It is not recommended to quantize a loaded model. " - "The model should be instantiated under the `init_empty_weights` context manager." 
- ) - model = replace_with_bnb_layers( - model, bnb_quantization_config, modules_to_not_convert=modules_to_not_convert - ) - # convert param to the right dtype - dtype = bnb_quantization_config.torch_dtype - for name, param in model.state_dict().items(): - if any( - module_to_keep_in_fp32 in name - for module_to_keep_in_fp32 in keep_in_fp32_modules - ): - param.to(torch.float32) - if param.dtype != torch.float32: - name = name.replace(".weight", "").replace(".bias", "") - param = getattr(model, name, None) - if param is not None: - param.to(torch.float32) - elif torch.is_floating_point(param): - param.to(dtype) - - return model - - -def export_dummy_model( - automodel_class, - pretrained_config: PretrainedConfig, - output_dir: str, - device: torch.device, - torch_dtype: Optional[torch.dtype] = None, - auto_optimization: Optional[str] = None, - use_merged: Optional[bool] = None, - **cache_kwargs, -): - ######################################## - from accelerate import init_empty_weights - - with init_empty_weights(): - model = automodel_class.from_config( - config=pretrained_config, - torch_dtype=torch_dtype, - trust_remote_code=cache_kwargs.get("trust_remote_code", False), - ) - model.to_empty(device=device) - randomize_weights(model) - ######################################## - - input_shapes = {} - original_task = "auto" - output_path = Path(output_dir) - - for input_name in DEFAULT_DUMMY_SHAPES.keys(): - input_shapes[input_name] = DEFAULT_DUMMY_SHAPES[input_name] - - try: - task = TasksManager.infer_task_from_model(model) - except KeyError as e: - raise KeyError( - f"The task could not be automatically inferred. Please provide the argument --task with the task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}" - ) - - if task + "-with-past" in TasksManager.get_supported_tasks_for_model_type( - model.config.model_type.replace("_", "-"), "onnx" - ): - if ( - original_task == "auto" - ): # Make -with-past the default if --task was not explicitely specified - task = task + "-with-past" - - onnx_config_constructor = TasksManager.get_exporter_config_constructor( - model=model, exporter="onnx", task=task - ) - onnx_config = onnx_config_constructor(model.config) - - needs_pad_token_id = ( - isinstance(onnx_config, OnnxConfigWithPast) - and getattr(model.config, "pad_token_id", None) is None - and task in ["text-classification"] - ) - if needs_pad_token_id: - try: - tok = AutoTokenizer.from_pretrained(model.name_or_path) - model.config.pad_token_id = tok.pad_token_id - except Exception: - raise ValueError( - "Could not infer the pad token id, which is needed in this case, please provide it with the --pad_token_id argument" - ) - - opset = onnx_config.DEFAULT_ONNX_OPSET - atol = onnx_config.ATOL_FOR_VALIDATION - if isinstance(atol, dict): - atol = atol[task.replace("-with-past", "")] - - # Saving the model config and preprocessor as this is needed sometimes. - model.config.save_pretrained(output_path) - generation_config = getattr(model, "generation_config", None) - if generation_config is not None: - generation_config.save_pretrained(output_path) - - maybe_save_preprocessors(output_path, output_path) - - if model.config.is_encoder_decoder and task.startswith("text-generation"): - raise ValueError( - f"model.config.is_encoder_decoder is True and task is `{task}`, which are incompatible. 
If the task was auto-inferred, please fill a bug report" - f"at https://github.com/huggingface/optimum, if --task was explicitely passed, make sure you selected the right task for the model," - f" referring to `optimum.exporters.tasks.TaskManager`'s `_TASKS_TO_AUTOMODELS`." - ) - - onnx_files_subpaths = None - if model.config.is_encoder_decoder and task.startswith( - ( - "text2text-generation", - "automatic-speech-recognition", - "image-to-text", - "feature-extraction-with-past", - ) - ): - models_and_onnx_configs = get_encoder_decoder_models_for_export( - model, onnx_config - ) - - elif task.startswith("text-generation"): - models_and_onnx_configs = get_decoder_models_for_export(model, onnx_config) - else: - models_and_onnx_configs = {"model": (model, onnx_config)} - - print("Attempting to export the model to ONNX...") - _, __ = export_models( - models_and_onnx_configs=models_and_onnx_configs, # type: ignore - opset=opset, # type: ignore - output_dir=output_path, - output_names=onnx_files_subpaths, - input_shapes=input_shapes, - device=str(device), - dtype="fp16" if torch_dtype == torch.float16 else None, - ) - print("Model successfully exported to ONNX.") - - if auto_optimization: - print("Attempting to optimize the exported ONNX models...") - if onnx_files_subpaths is None: - onnx_files_subpaths = [ - key + ".onnx" for key in models_and_onnx_configs.keys() - ] - optimizer = ORTOptimizer.from_pretrained( - output_path, file_names=onnx_files_subpaths - ) - - optimization_config = AutoOptimizationConfig.with_optimization_level( - optimization_level=auto_optimization - ) - - optimizer.optimize( - save_dir=output_path, - optimization_config=optimization_config, - file_suffix="", - ) - print("ONNX models successfully optimized.") - - # post process is disabled in optimum ort api so you need to export models with cli - # and then load them with ort api to reproduce the same results - if use_merged: - try: - print("Attempting to merge the exported ONNX models...") - ( - models_and_onnx_configs, - onnx_files_subpaths, - ) = onnx_config.post_process_exported_models( - output_path, models_and_onnx_configs, onnx_files_subpaths - ) - print("ONNX models successfully merged.") - except Exception as e: - raise Exception( - f"The post-processing of the ONNX export failed. The export can still be performed by passing the option --no-post-process. 
Detailed error: {e}" - ) +if TYPE_CHECKING: + from transformers import PreTrainedModel + from optimum.exporters.onnx import OnnxConfig +# rewrite of the main_export function from optimum.exporters.onnx.__main__ +# to use the model passed in as an argument instead of loading it from the model_name_or_path def main_export( model_name_or_path: str, output: Union[str, Path], @@ -295,11 +53,11 @@ def main_export( for_ort: bool = False, do_validation: bool = True, model_kwargs: Optional[Dict[str, Any]] = None, - custom_onnx_configs: Optional[Dict[str, OnnxConfig]] = None, + custom_onnx_configs: Optional[Dict[str, "OnnxConfig"]] = None, fn_get_submodels: Optional[Callable] = None, use_subprocess: bool = False, ######################################## - model: Optional[PreTrainedModel] = None, + model: Optional["PreTrainedModel"] = None, ######################################## **kwargs_shapes, ): diff --git a/optimum_benchmark/backends/utils/pytorch_utils.py b/optimum_benchmark/backends/utils/pytorch_utils.py new file mode 100644 index 000000000..04a2dbecb --- /dev/null +++ b/optimum_benchmark/backends/utils/pytorch_utils.py @@ -0,0 +1,78 @@ +from logging import getLogger +from typing import Optional +import logging.config +import os + +import torch +from omegaconf import OmegaConf +from torch.distributed.elastic.multiprocessing import Std + +OmegaConf.register_new_resolver("device_count", lambda: torch.cuda.device_count()) + + +DEFAULT_COMPILE_CONFIG = { + "fullgraph": False, + "dynamic": False, + "backend": "inductor", + "mode": None, + "options": None, + "disable": False, +} + +# from https://github.com/pytorch/pytorch/blob/v2.0.0/torch/distributed/launcher/api.py#L29 +# adjusted to the defaults of torch.distributed.run +# defined in https://github.com/pytorch/pytorch/blob/v2.0.0/torch/distributed/run.py#L770 +# TODO: decide wrther to use torch.distributed.run arguments or the ones from +# torch.distributed.launcher.api +DEFAULT_DDP_CONFIG = { + "min_nodes": 1, + "max_nodes": 1, + "run_id": "none", + "nproc_per_node": "${device_count:}", + "role": "default", + "rdzv_endpoint": "127.0.0.1:29500", + "rdzv_backend": "static", + "rdzv_configs": { + "timeout": 900, + "rank": 0, + }, + "max_restarts": 0, + "monitor_interval": 5, + "start_method": "spawn", + "log_dir": None, + "metrics_cfg": {}, + "local_addr": None, + "redirects": Std.NONE, + "tee": Std.NONE, +} + + +def randomize_weights(model): + for param in model.parameters(): + if torch.cuda.is_available() and param.device.type == "cpu": + # we take advantage of the fact that a cuda device + # is available to use cuda kernels for randomization + # this is slower than asynchronous randomization while + # model is fully on gpu (because of data transfer) but + # faster than randomization while model is on cpu + param.data.cuda().normal_(mean=0.0, std=0.2).cpu() + else: + param.data.normal_(mean=0.0, std=0.2) + + +def get_worker_logger( + name: Optional[str] = None, + log_all: bool = False, +) -> logging.Logger: + """ + PyTorch DDP subprocesses do not inherit from Hydra logger. + Thus, we need to reconfigure the logger for the workers. 
+ """ + if os.environ["RANK"] == "0" or log_all: + # TODO: also configure logging for other ranks + hydra_conf = OmegaConf.load(".hydra/hydra.yaml") + logging.config.dictConfig( + OmegaConf.to_container(hydra_conf.hydra.job_logging, resolve=True) + ) + + return getLogger(name) diff --git a/optimum_benchmark/benchmarks/base.py b/optimum_benchmark/benchmarks/base.py index 9b41ce3b7..da2721e5d 100644 --- a/optimum_benchmark/benchmarks/base.py +++ b/optimum_benchmark/benchmarks/base.py @@ -1,9 +1,8 @@ -from dataclasses import dataclass, MISSING +from dataclasses import dataclass from logging import getLogger from abc import ABC from optimum_benchmark.backends.base import Backend -from optimum_benchmark.utils import set_seed LOGGER = getLogger("benchmark") @@ -11,25 +10,23 @@ @dataclass class BenchmarkConfig(ABC): - name: str = MISSING # type: ignore - _target_: str = MISSING # type: ignore - - # seed for reproducibility - seed: int = 42 + name: str + _target_: str class Benchmark(ABC): + name: str + config: BenchmarkConfig + def __init__(self) -> None: pass def configure(self, config: BenchmarkConfig) -> None: - LOGGER.info(f"Configuring {config.name} benchmark") + LOGGER.info(f"Configuring {self.name} benchmark") self.config = config - LOGGER.info(f"\t+ Setting seed({self.config.seed})") - set_seed(self.config.seed) def run(self, backend: Backend) -> None: raise NotImplementedError("Benchmark must implement run method") - def save(self, path: str = "") -> None: + def save(self) -> None: raise NotImplementedError("Benchmark must implement save method") diff --git a/optimum_benchmark/benchmarks/inference.py b/optimum_benchmark/benchmarks/inference.py index 7f8098470..afded80c4 100644 --- a/optimum_benchmark/benchmarks/inference.py +++ b/optimum_benchmark/benchmarks/inference.py @@ -1,19 +1,38 @@ from dataclasses import dataclass, field -from typing import List, Dict +from typing import List, Dict, Optional from logging import getLogger +from omegaconf import OmegaConf + from pandas import DataFrame import statistics -from optimum_benchmark.backends.base import Backend -from optimum_benchmark.generators.input_generator import InputGenerator -from optimum_benchmark.benchmarks.base import Benchmark, BenchmarkConfig -from optimum_benchmark.trackers.memory import memory_tracker_class_for_backend -from optimum_benchmark.trackers.latency import latency_tracker_class_for_backend + +from ..backends.base import Backend +from .base import Benchmark, BenchmarkConfig +from ..generators.input_generator import InputGenerator +from ..utils import TEXT_GENERATION_TASKS, DIFFUSION_TASKS +from ..trackers.memory import memory_tracker_class_for_backend +from ..trackers.latency import latency_tracker_class_for_backend +from .inference_utils import ( + three_sig_figs, + DEFAULT_INPUT_SHAPES, + DEFAULT_GENERATE_KWARGS, + DEFAULT_DIFUSION_KWARGS, +) LOGGER = getLogger("inference") +OmegaConf.register_new_resolver( + "can_generate", + lambda task: task in TEXT_GENERATION_TASKS, +) +OmegaConf.register_new_resolver( + "can_diffuse", + lambda task: task in DIFFUSION_TASKS, +) + @dataclass class InferenceConfig(BenchmarkConfig): @@ -23,34 +42,69 @@ class InferenceConfig(BenchmarkConfig): # benchmark options memory: bool = False warmup_runs: int = 10 - - benchmark_duration: int = 10 # TODO: deprecate this and use `benchmark.duration` + duration: int = 10 + # TODO: deprecate this and use `benchmark.duration` + benchmark_duration: Optional[int] = None # input options input_shapes: Dict = field( - default_factory=lambda: { - # 
used with all tasks - "batch_size": 2, - # used with text input tasks - "sequence_length": 16, - # used with multiple choice tasks where input - # is of shape (batch_size, num_choices, sequence_length) - "num_choices": 1, - # used with audio input tasks - "feature_size": 80, - "nb_max_frames": 3000, - "audio_sequence_length": 16000, - } + default_factory=lambda: DEFAULT_INPUT_SHAPES, ) + # TODO: deprecate this and use `benchamrk.generate_kwargs` + new_tokens: Optional[int] = None + + # forward options + can_diffuse: bool = "${can_diffuse:${task}}" + forward_kwargs: Optional[Dict] = None + # generation options - new_tokens: int = 100 # TODO: deprecate this and use `benchamrk.generation_options` + can_generate: bool = "${can_generate:${task}}" + generate_kwargs: Optional[Dict] = None - # diffusion options - # TODO: add `benchmark.diffusion_options` for multiple images per prompt + def __post_init__(self): + if self.can_generate: + self.generate_kwargs = OmegaConf.merge( + self.generate_kwargs or {}, + DEFAULT_GENERATE_KWARGS, + ) + + if self.can_diffuse: + self.forward_kwargs = OmegaConf.merge( + self.forward_kwargs or {}, + DEFAULT_DIFUSION_KWARGS, + ) + + if self.new_tokens is not None: + LOGGER.warning( + "The `new_tokens` option is deprecated, please use `generate_kwargs` " + "instead. `max_new_tokens` and `min_new_tokens` will be set to the " + "value of `new_tokens`." + ) + self.generate_kwargs["max_new_tokens"] = self.new_tokens + self.generate_kwargs["min_new_tokens"] = self.new_tokens + + if self.generate_kwargs is not None: + assert ( + self.generate_kwargs["max_new_tokens"] + == self.generate_kwargs["min_new_tokens"] + ), ( + "`max_new_tokens` and `min_new_tokens` " + "must be equal for fixed length output" + ) + + if self.benchmark_duration is not None: + LOGGER.warning( + "The `benchmark_duration` option is deprecated, please use `duration` " + "instead. `duration` will be set to the value of `benchmark_duration`." 
+ ) + self.duration = self.benchmark_duration class InferenceBenchmark(Benchmark): + name: str = "inference" + config: InferenceConfig + def __init__(self): # initialize inference results self.forward_peak_memory: int = 0 @@ -60,34 +114,30 @@ def __init__(self): def configure(self, config: InferenceConfig): super().configure(config) - self.memory = config.memory - - self.warmup_runs = config.warmup_runs - self.benchmark_duration = config.benchmark_duration + if self.config.forward_kwargs is None: + self.config.forward_kwargs = {} - self.input_shapes = config.input_shapes - self.new_tokens = config.new_tokens + if self.config.generate_kwargs is None: + self.config.generate_kwargs = {} def run(self, backend: Backend) -> None: LOGGER.info("Running inference benchmark") - - self.can_generate = backend.is_text_generation_model() - self.input_shapes.update(backend.model_shapes) + self.config.input_shapes.update(backend.model_shapes) self.input_generator = InputGenerator( task=backend.task, - input_shapes=self.input_shapes, + input_shapes=self.config.input_shapes, pretrained_config=backend.pretrained_config, ) - if self.memory: + if self.config.memory: # if requested, run memory tracking self.run_memory_tracking(backend) # run forward pass tracking self.run_forward_tracking(backend) - if self.can_generate: + if self.config.can_generate: # if possible, run generation pass tracking self.run_generate_tracking(backend) @@ -96,18 +146,17 @@ def run_memory_tracking(self, backend: Backend) -> None: mode="forward", ) - # TODO: handle this in backend using prepare_for_inference for key, value in memory_input.items(): if key == "prompt": continue memory_input[key] = value.to(backend.device) # for backends that require compilation with static shapes - backend.prepare_for_inference(input_shapes=self.input_shapes) + backend.prepare_for_inference(input_shapes=self.config.input_shapes) LOGGER.info("\t+ Tracking forward pass peak memory") memory_tracker = memory_tracker_class_for_backend[backend.config.name](backend) - with memory_tracker.track(interval=self.benchmark_duration // 100): + with memory_tracker.track(interval=self.config.duration // 100): _ = backend.forward(memory_input) self.forward_peak_memory = memory_tracker.get_peak_memory() @@ -118,26 +167,25 @@ def run_forward_tracking(self, backend: Backend) -> None: mode="forward", ) - # TODO: handle this in backend using prepare_for_inference for key, value in forward_input.items(): if key == "prompt": continue forward_input[key] = value.to(backend.device) # for backends that require compilation with static shapes - backend.prepare_for_inference(input_shapes=self.input_shapes) + backend.prepare_for_inference(input_shapes=self.config.input_shapes) LOGGER.info("\t+ Warming up the forward pass") - for _ in range(self.warmup_runs): - _ = backend.forward(forward_input) + for _ in range(self.config.warmup_runs): + _ = backend.forward(forward_input, **self.config.forward_kwargs) LOGGER.info("\t+ Tracking forward pass latency and throughput") latency_tracker = latency_tracker_class_for_backend[backend.config.name]( backend ) - while sum(self.forward_latencies) < self.benchmark_duration: + while sum(self.forward_latencies) < self.config.duration: with latency_tracker.track(): - _ = backend.forward(forward_input) + _ = backend.forward(forward_input, **self.config.forward_kwargs) self.forward_latencies = latency_tracker.get_latencies() LOGGER.info(f"\t+ Forward pass latency: {self.forward_latency:.2e} (s)") @@ -150,7 +198,6 @@ def run_generate_tracking(self, 
backend: Backend) -> None: mode="forward", ) - # TODO: handle this in backend using prepare_for_inference for key, value in generate_input.items(): if key == "prompt": continue @@ -159,28 +206,18 @@ def run_generate_tracking(self, backend: Backend) -> None: LOGGER.info("\t+ Warming up the generation pass") _ = backend.generate( input=generate_input, - max_new_tokens=self.new_tokens, - min_new_tokens=self.new_tokens, - do_sample=False, - use_cache=True, - pad_token_id=0, - num_beams=1, + **self.config.generate_kwargs, ) LOGGER.info("\t+ Tracking generation latency and throughput") latency_tracker = latency_tracker_class_for_backend[backend.config.name]( backend ) - while sum(self.generate_latencies) < self.benchmark_duration: + while sum(self.generate_latencies) < self.config.duration: with latency_tracker.track(): _ = backend.generate( generate_input, - max_new_tokens=self.new_tokens, - min_new_tokens=self.new_tokens, - do_sample=False, - use_cache=True, - pad_token_id=0, - num_beams=1, + **self.config.generate_kwargs, ) self.generate_latencies = latency_tracker.get_latencies() @@ -192,33 +229,45 @@ def run_generate_tracking(self, backend: Backend) -> None: # Metrics @property + @three_sig_figs def forward_latency(self) -> float: - return significant_figures(statistics.mean(self.forward_latencies)) + return statistics.mean(self.forward_latencies) @property + @three_sig_figs def forward_throughput(self) -> float: - return significant_figures(self.input_shapes.batch_size / self.forward_latency) + return ( + self.config.input_shapes["batch_size"] + * self.config.forward_kwargs["num_images_per_prompt"] + / self.forward_latency + if self.config.can_diffuse + else self.config.input_shapes["batch_size"] / self.forward_latency + ) @property + @three_sig_figs def generate_latency(self) -> float: - return significant_figures(statistics.mean(self.generate_latencies)) + return statistics.mean(self.generate_latencies) @property + @three_sig_figs def generate_throughput(self) -> float: - return significant_figures( - self.new_tokens * self.input_shapes.batch_size / self.generate_latency + return ( + self.config.generate_kwargs["min_new_tokens"] + * self.config.input_shapes["batch_size"] + / self.generate_latency ) def get_results_df(self) -> DataFrame: results_dict = dict() - if self.memory: + if self.config.memory: results_dict["forward.peak_memory(MB)"] = self.forward_peak_memory results_dict["forward.latency(s)"] = self.forward_latency results_dict["forward.throughput(samples/s)"] = self.forward_throughput - if self.can_generate: + if self.config.can_generate: results_dict["generate.latency(s)"] = self.generate_latency results_dict["generate.throughput(tokens/s)"] = self.generate_throughput @@ -228,7 +277,3 @@ def save(self) -> None: LOGGER.info("Saving inference results") results_df = self.get_results_df() results_df.to_csv("inference_results.csv") - - -def significant_figures(x): - return float(f"{x:.3g}") diff --git a/optimum_benchmark/benchmarks/inference_utils.py b/optimum_benchmark/benchmarks/inference_utils.py new file mode 100644 index 000000000..b2280cdc3 --- /dev/null +++ b/optimum_benchmark/benchmarks/inference_utils.py @@ -0,0 +1,37 @@ +DEFAULT_GENERATE_KWARGS = { + "max_new_tokens": 100, + "min_new_tokens": 100, + "do_sample": False, + "use_cache": True, + "pad_token_id": 0, + "num_beams": 1, +} + +DEFAULT_DIFUSION_KWARGS = { + "num_images_per_prompt": 1, +} + +DEFAULT_INPUT_SHAPES = { + # used with all tasks + "batch_size": 2, + # used with text input tasks + "sequence_length": 16, + # 
used with multiple choice tasks where input + # is of shape (batch_size, num_choices, sequence_length) + "num_choices": 1, + # used with audio input tasks + "feature_size": 80, + "nb_max_frames": 3000, + "audio_sequence_length": 16000, +} + + +def format_float(x: float) -> float: + return float(f"{x:.3g}") + + +def three_sig_figs(func): + def wrapper(*args, **kwargs): + return format_float(func(*args, **kwargs)) + + return wrapper diff --git a/optimum_benchmark/benchmarks/training.py b/optimum_benchmark/benchmarks/training.py index ce0c472fb..6ba1ab20b 100644 --- a/optimum_benchmark/benchmarks/training.py +++ b/optimum_benchmark/benchmarks/training.py @@ -1,24 +1,20 @@ -from typing import Any, Optional, Dict, TYPE_CHECKING +from typing import Any, Dict from dataclasses import dataclass, field from logging import getLogger -from transformers import default_data_collator from omegaconf import OmegaConf from pandas import DataFrame -import torch -from optimum_benchmark.benchmarks.base import Benchmark, BenchmarkConfig -from optimum_benchmark.generators.dataset_generator import DatasetGenerator - -if TYPE_CHECKING: - from optimum_benchmark.backends.base import Backend +from ..backends.base import Backend +from .base import Benchmark, BenchmarkConfig +from ..generators.dataset_generator import DatasetGenerator +from .training_utils import MeasurementCallback, get_data_collator LOGGER = getLogger("training") # resolvers OmegaConf.register_new_resolver("is_cpu", lambda device: device == "cpu") -OmegaConf.register_new_resolver("device_count", lambda: torch.cuda.device_count()) @dataclass @@ -26,6 +22,9 @@ class TrainingConfig(BenchmarkConfig): name: str = "training" _target_: str = "optimum_benchmark.benchmarks.training.TrainingBenchmark" + # training options + warmup_steps: int = 2 + # dataset options dataset_shapes: Dict = field( default_factory=lambda: { @@ -46,146 +45,23 @@ class TrainingConfig(BenchmarkConfig): # training options training_arguments: Dict = field( default_factory=lambda: { - "output_dir": "./trainer_output", + # these are arguments that we set by default + # but can be overwritten by the user "skip_memory_metrics": False, + "output_dir": "./trainer_output", "use_cpu": "${is_cpu:${device}}", + "ddp_find_unused_parameters": False, "do_train": True, "do_eval": False, "do_predict": False, - # add any other training arguments in your config - ###### TrainingArguments ######## - # prediction_loss_only: bool = False, - # per_device_train_batch_size: int = 8, - # per_gpu_train_batch_size: int | None = None, - # gradient_accumulation_steps: int = 1, - # learning_rate: float = 0.00005, - # weight_decay: float = 0, - # adam_beta1: float = 0.9, - # adam_beta2: float = 0.999, - # adam_epsilon: float = 1e-8, - # max_grad_norm: float = 1, - # num_train_epochs: float = 3, - # max_steps: int = -1, - # lr_scheduler_type: SchedulerType | str = "linear", - # warmup_ratio: float = 0, - # warmup_steps: int = 0, - # log_level: str | None = "passive", - # log_level_replica: str | None = "warning", - # log_on_each_node: bool = True, - # logging_dir: str | None = None, - # logging_strategy: IntervalStrategy | str = "steps", - # logging_first_step: bool = False, - # logging_steps: float = 500, - # logging_nan_inf_filter: bool = True, - # save_strategy: IntervalStrategy | str = "steps", - # save_steps: float = 500, - # save_total_limit: int | None = None, - # save_safetensors: bool | None = False, - # save_on_each_node: bool = False, - # use_mps_device: bool = False, - # seed: int = 42, - # data_seed: 
int | None = None, - # jit_mode_eval: bool = False, - # use_ipex: bool = False, - # bf16: bool = False, - # fp16: bool = False, - # fp16_opt_level: str = "O1", - # half_precision_backend: str = "auto", - # bf16_full_eval: bool = False, - # fp16_full_eval: bool = False, - # tf32: bool | None = None, - # local_rank: int = -1, - # ddp_backend: str | None = None, - # tpu_num_cores: int | None = None, - # tpu_metrics_debug: bool = False, - # debug: str | List[DebugOption] = "", - # dataloader_drop_last: bool = False, - # eval_steps: float | None = None, - # dataloader_num_workers: int = 0, - # past_index: int = -1, - # run_name: str | None = None, - # disable_tqdm: bool | None = None, - # remove_unused_columns: bool | None = True, - # label_names: List[str] | None = None, - # load_best_model_at_end: bool | None = False, - # metric_for_best_model: str | None = None, - # greater_is_better: bool | None = None, - # ignore_data_skip: bool = False, - # sharded_ddp: List[ShardedDDPOption] | str | None = "", - # fsdp: List[FSDPOption] | str | None = "", - # fsdp_min_num_params: int = 0, - # fsdp_config: str | None = None, - # fsdp_transformer_layer_cls_to_wrap: str | None = None, - # deepspeed: str | None = None, - # label_smoothing_factor: float = 0, - # optim: OptimizerNames | str = default_optim, - # optim_args: str | None = None, - # adafactor: bool = False, - # group_by_length: bool = False, - # length_column_name: str | None = "length", - # report_to: List[str] | None = None, - # ddp_find_unused_parameters: bool | None = None, - # ddp_bucket_cap_mb: int | None = None, - # ddp_broadcast_buffers: bool | None = None, - # dataloader_pin_memory: bool = True, - # use_legacy_prediction_loop: bool = False, - # push_to_hub: bool = False, - # resume_from_checkpoint: str | None = None, - # hub_model_id: str | None = None, - # hub_strategy: HubStrategy | str = "every_save", - # hub_token: str | None = None, - # hub_private_repo: bool = False, - # gradient_checkpointing: bool = False, - # include_inputs_for_metrics: bool = False, - # fp16_backend: str = "auto", - # push_to_hub_model_id: str | None = None, - # push_to_hub_organization: str | None = None, - # push_to_hub_token: str | None = None, - # mp_parameters: str = "", - # auto_find_batch_size: bool = False, - # full_determinism: bool = False, - # torchdynamo: str | None = None, - # ray_scope: str | None = "last", - # ddp_timeout: int | None = 1800, - # torch_compile: bool = False, - # torch_compile_backend: str | None = None, - # torch_compile_mode: str | None = None, - # dispatch_batches: bool | None = None } ) - # PyTorch-specific configuration. - use_ddp: bool = False - ddp_config: Optional[Dict] = None - - def __post_init__(self): - if self.use_ddp: - # Copied from https://github.com/pytorch/pytorch/blob/v2.0.0/torch/distributed/launcher/api.py#L29, adjusting to the defaults of torch.distributed.run - ddp_config = { - "min_nodes": 1, - "max_nodes": 1, - "nproc_per_node": "${device_count:}", - "run_id": "none", - "role": "default", - "rdzv_endpoint": "127.0.0.1:29500", - "rdzv_backend": "static", - "rdzv_configs": {"timeout": 900, "rank": 0}, - "max_restarts": 0, - "monitor_interval": 5, - # For the arguments below, the CLI torch.distributed.run matches with LaunchConfig defaults. 
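
For reference, the `ddp_config` defaults removed in this hunk map one-to-one onto `torch.distributed.launcher.api.LaunchConfig`, which is what ultimately consumes them when DDP training is launched (elsewhere in this patch the test config moves the `use_ddp` switch under the `backend:` key). The sketch below is illustrative only and not part of the patch; `training_entrypoint` is a hypothetical placeholder for the per-process training function.

# Illustrative sketch only (not part of this patch): how a ddp_config dict like the
# one removed above maps onto torch.distributed's elastic launcher.
from torch.distributed.launcher.api import LaunchConfig, elastic_launch


def training_entrypoint() -> None:
    # hypothetical per-process training function; one copy runs per local rank
    ...


launch_config = LaunchConfig(
    min_nodes=1,
    max_nodes=1,
    nproc_per_node=2,  # typically torch.cuda.device_count()
    run_id="none",
    role="default",
    rdzv_endpoint="127.0.0.1:29500",
    rdzv_backend="static",
    rdzv_configs={"timeout": 900, "rank": 0},
    max_restarts=0,
    monitor_interval=5,
)

# elastic_launch returns a dict mapping each local rank to its entrypoint's return value
results = elastic_launch(config=launch_config, entrypoint=training_entrypoint)()
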
- # start_method: str = "spawn" - # log_dir: Optional[str] = None - # redirects: Std = Std.NONE - # tee: Std = Std.NONE - # metrics_cfg: Dict[str, str] = field(default_factory=dict) - # local_addr: Optional[str] = None - } - if self.ddp_config is not None: - ddp_config.update(self.ddp_config) - self.ddp_config = ddp_config - class TrainingBenchmark(Benchmark): + name: str = "training" + config: TrainingConfig + def __init__(self): # initialize training results self.training_metrics: Dict[str, Any] = {} @@ -193,46 +69,34 @@ def __init__(self): def configure(self, config: TrainingConfig): super().configure(config) - self.dataset_shapes = config.dataset_shapes - self.training_arguments = config.training_arguments - def run(self, backend: "Backend") -> None: LOGGER.info("Running training benchmark") - model_shapes = backend.model_shapes - self.dataset_shapes = {**self.dataset_shapes, **model_shapes} - - self.dataset_generator = DatasetGenerator( - task=backend.task, - dataset_shapes=self.dataset_shapes, + task = backend.task + dataset_shapes = {**self.config.dataset_shapes, **backend.model_shapes} + dataset_generator = DatasetGenerator(task=task, dataset_shapes=dataset_shapes) + + training_dataset = dataset_generator.generate() + training_data_collator = get_data_collator(task=task) + training_callbacks = [MeasurementCallback(self.config.warmup_steps)] + + trainer_state = backend.train( + training_dataset=training_dataset, + training_callbacks=training_callbacks, + training_data_collator=training_data_collator, + training_arguments=self.config.training_arguments, ) - training_dataset = self.dataset_generator.generate() - - training_data_collator = get_data_collator( - task=backend.task, - ) - - if backend.config.name == "pytorch": - self.training_metrics = backend.run_pytorch_training( - training_config=self.config, - training_arguments=self.training_arguments, - training_dataset=training_dataset, - training_data_collator=training_data_collator, - ) - else: - backend.prepare_for_training( - training_dataset=training_dataset, - training_data_collator=training_data_collator, - training_arguments=self.training_arguments, - ) - training_output = backend.train() - - self.training_metrics = { - "training_throughput": training_output.metrics[ - "train_samples_per_second" - ], - "train_runtime": training_output.metrics["train_runtime"], - } + self.training_metrics = { + # warmup metrics + "warmup_runtime": trainer_state.warmup_runtime, + "warmup_throughput()": trainer_state.warmup_samples_per_second, + # training metrics + "train_runtime": trainer_state.train_runtime, + "training_throughput": trainer_state.train_samples_per_second, + # overall training metrics + "overall_train_runtime": trainer_state.overall_train_runtime, + "overall_training_throughput": trainer_state.overall_train_samples_per_second, + } def get_results_df(self) -> DataFrame: return DataFrame(self.training_metrics, index=[0]) @@ -241,19 +105,3 @@ def save(self) -> None: LOGGER.info("Saving training results") results_df = self.get_results_df() results_df.to_csv("training_results.csv") - - -def get_data_collator(task: str) -> callable: - if task == "object-detection": - return object_detection_data_collator - else: - return default_data_collator - - -def object_detection_data_collator(batch) -> Dict[str, torch.Tensor]: - pixel_values = torch.stack([example["pixel_values"] for example in batch]) - labels = [example["labels"] for example in batch] - return { - "pixel_values": pixel_values, - "labels": labels, - } diff --git 
a/optimum_benchmark/benchmarks/training_utils.py b/optimum_benchmark/benchmarks/training_utils.py new file mode 100644 index 000000000..097e06c22 --- /dev/null +++ b/optimum_benchmark/benchmarks/training_utils.py @@ -0,0 +1,103 @@ +from typing import Any, Dict, TYPE_CHECKING +from dataclasses import dataclass +import time + +from transformers import default_data_collator +from transformers import TrainerCallback + +if TYPE_CHECKING: + from transformers import TrainerState, TrainingArguments, TrainerControl + + +@dataclass +class MeasurementCallback(TrainerCallback): + warmup_steps: int + + def on_train_begin( + self, + args: "TrainingArguments", + state: "TrainerState", + control: "TrainerControl", + **kwargs, + ): + if state.max_steps <= self.warmup_steps: + # This check is here because max_steps is set only once the training + # is launched, thus we can not check before calling trainer.train(). + raise ValueError( + f"Total training steps {state.max_steps} is smaller " + "than the number of warmup steps {self.warmup_steps}. " + "Please increase the total number of steps (for example by " + "increasing the dataset size)." + ) + + state.warmup_start = time.time_ns() * 1e-9 + state.overall_train_start = time.time_ns() * 1e-9 + + def on_step_begin( + self, + args: "TrainingArguments", + state: "TrainerState", + control: "TrainerControl", + **kwargs, + ): + if state.global_step == self.warmup_steps: + state.warmup_end = time.time_ns() * 1e-9 + state.training_start = time.time_ns() * 1e-9 + elif state.global_step > state.max_steps - 1: + raise ValueError("global_step > state.max_steps - 1") + + def on_train_end( + self, + args: "TrainingArguments", + state: "TrainerState", + control: "TrainerControl", + **kwargs, + ): + state.training_end = time.time_ns() * 1e-9 + state.overall_train_end = time.time_ns() * 1e-9 + + state.total_train_batch_size = ( + args.train_batch_size * args.gradient_accumulation_steps * args.world_size + ) + + # warmup metrics + state.warmup_runtime = state.warmup_end - state.warmup_start + state.num_warmup_samples = self.warmup_steps * state.total_train_batch_size + state.warmup_samples_per_second = ( + state.num_warmup_samples / state.warmup_runtime + ) + # state.warmup_steps_per_second = self.warmup_steps / state.warmup_runtime + + # training metrics + state.train_runtime = state.training_end - state.training_start + state.num_train_steps = state.max_steps - self.warmup_steps + state.num_train_samples = state.num_train_steps * state.total_train_batch_size + state.train_samples_per_second = state.num_train_samples / state.train_runtime + # state.train_steps_per_second = state.num_train_steps / state.train_runtime + + # overall training metrics + state.overall_train_runtime = state.training_end - state.warmup_start + state.overall_train_samples_per_second = ( + state.num_train_samples / state.overall_train_runtime + ) + # state.overall_train_steps_per_second = ( + # state.num_train_steps / state.overall_train_runtime + # ) + + +def get_data_collator(task: str) -> callable: + if task == "object-detection": + return object_detection_data_collator + else: + return default_data_collator + + +def object_detection_data_collator(batch) -> Dict[str, Any]: + import torch + + pixel_values = torch.stack([example["pixel_values"] for example in batch]) + labels = [example["labels"] for example in batch] + return { + "pixel_values": pixel_values, + "labels": labels, + } diff --git a/optimum_benchmark/generators/dataset_generator.py b/optimum_benchmark/generators/dataset_generator.py 
index 56bcbfc6f..0d5f00e68 100644 --- a/optimum_benchmark/generators/dataset_generator.py +++ b/optimum_benchmark/generators/dataset_generator.py @@ -9,7 +9,7 @@ ) -LOGGER = getLogger("dummy_dataset") +LOGGER = getLogger("dataset_generator") class DatasetGenerator: @@ -23,7 +23,7 @@ def __init__( dataset_shapes["batch_size"] = dataset_shapes.pop("dataset_size") if task in TASKS_TO_GENERATORS: - LOGGER.info(f"Using {TASKS_TO_GENERATORS[task]} generator") + LOGGER.info(f"Using {task} task generator") self.task_generator = TASKS_TO_GENERATORS[task]( shapes=dataset_shapes, with_labels=True, @@ -32,16 +32,15 @@ def __init__( raise NotImplementedError( f"Task {task} is supported. \n" f"Available tasks: {list(TASKS_TO_GENERATORS.keys())}. \n" - "If you want to add support for this task, please submit a PR or a feature request to optimum-benchmark. \n" + "If you want to add support for this task, " + "please submit a PR or a feature request to optimum-benchmark. \n" ) def generate(self) -> Dataset: task_dataset = self.task_generator.generate() - - # TODO: we can move this to backend.prepare_for_training to avoid the torch dependency task_dataset = Dataset.from_dict(task_dataset) task_dataset.set_format( - type="torch", + type="numpy", columns=list(task_dataset.features.keys()), ) diff --git a/optimum_benchmark/generators/input_generator.py b/optimum_benchmark/generators/input_generator.py index e27f3a494..f384abb23 100644 --- a/optimum_benchmark/generators/input_generator.py +++ b/optimum_benchmark/generators/input_generator.py @@ -15,7 +15,7 @@ ) -LOGGER = getLogger("dummy_dataset") +LOGGER = getLogger("input_generator") class InputGenerator: @@ -49,9 +49,11 @@ def __init__( raise NotImplementedError( f"Neither task {task} nor model type {model_type} is supported. \n" f"Available tasks: {list(TASKS_TO_GENERATORS.keys())}. \n" - "If you want to add support for this task, please submit a PR or a feature request to optimum-benchmark. \n" + "If you want to add support for this task, " + "please submit a PR or a feature request to optimum-benchmark. \n" f"Available model types: {SUPPURTED_MODEL_TYPES}. \n" - "If you want to add support for this model type, please submit a PR or a feature request to optimum." + "If you want to add support for this model type, " + "please submit a PR or a feature request to optimum." 
) # TODO: we can drop the torch dependency here by returning a dict of numpy arrays diff --git a/optimum_benchmark/import_utils.py b/optimum_benchmark/import_utils.py index 23176229c..cc97c5397 100644 --- a/optimum_benchmark/import_utils.py +++ b/optimum_benchmark/import_utils.py @@ -3,16 +3,22 @@ _torch_available = importlib.util.find_spec("torch") is not None _onnxruntime_available = importlib.util.find_spec("onnxruntime") is not None _is_openvino_available = importlib.util.find_spec("openvino") is not None -_is_neural_compressor_available = importlib.util.find_spec("neural_compressor") is not None +_is_neural_compressor_available = ( + importlib.util.find_spec("neural_compressor") is not None +) + def is_torch_available(): return _torch_available + def is_onnxruntime_available(): return _onnxruntime_available + def is_openvino_available(): return _is_openvino_available + def is_neural_compressor_available(): - return _is_neural_compressor_available \ No newline at end of file + return _is_neural_compressor_available diff --git a/optimum_benchmark/preprocessors/glue.py b/optimum_benchmark/preprocessors/glue.py index ef18158eb..8e7413593 100644 --- a/optimum_benchmark/preprocessors/glue.py +++ b/optimum_benchmark/preprocessors/glue.py @@ -9,6 +9,5 @@ def __call__(self, examples): return self.tokenizer( examples["sentence"], padding="max_length", - max_length=128, truncation=True, ) diff --git a/optimum_benchmark/report.py b/optimum_benchmark/report.py index de31b0b7e..9e12d299e 100644 --- a/optimum_benchmark/report.py +++ b/optimum_benchmark/report.py @@ -16,7 +16,7 @@ def gather_inference_report(root_folder: Path) -> DataFrame: # key is path to inference file as string, value is dataframe inference_dfs = { f.parent.absolute().as_posix(): pd.read_csv(f) - for f in root_folder.glob(f"**/inference_results.csv") + for f in root_folder.glob("**/inference_results.csv") } # key is path to config file as string, value is flattened dict @@ -26,7 +26,7 @@ def gather_inference_report(root_folder: Path) -> DataFrame: flatten(OmegaConf.load(f), reducer="dot"), orient="index" ) .T - for f in root_folder.glob(f"**/hydra_config.yaml") + for f in root_folder.glob("**/hydra_config.yaml") if f.parent.absolute().as_posix() in inference_dfs.keys() } @@ -53,7 +53,7 @@ def style_element(element, style=""): def format_element(element, style=""): - if type(element) == float: + if isinstance(element, float): if element != element: # nan formated_element = "" elif abs(element) >= 1: @@ -64,7 +64,7 @@ def format_element(element, style=""): formated_element = f"{element}" elif element is None: formated_element = "" - elif type(element) == bool: + elif isinstance(element, bool): if element: formated_element = style_element("✔", style="green") else: @@ -295,7 +295,7 @@ def generate_report(): # create reporting directory and title using the filters if report_name is None: report_name = "Inference Report" - reporting_directory = f"reports/inferece_report" + reporting_directory = "reports/inferece_report" else: reporting_directory = f"reports/{report_name}" diff --git a/optimum_benchmark/trackers/latency.py b/optimum_benchmark/trackers/latency.py index 0c2e9812a..98dc93067 100644 --- a/optimum_benchmark/trackers/latency.py +++ b/optimum_benchmark/trackers/latency.py @@ -47,6 +47,7 @@ def _cpu_latency(self): LOGGER.debug(f"Tracked CPU latency: {latency:.2e}s") self.latencies.append(latency) + class PyTorchLatencyTracker(LatencyTracker): def __init__(self, backend): super().__init__(backend) @@ -58,7 +59,9 @@ def 
__init__(self, backend): self.hf_device_map = None self.end_device = self.device if self.device.type == "cuda": - self.device_indexes = {self.device.index if self.device.index is not None else 0} + self.device_indexes = { + self.device.index if self.device.index is not None else 0 + } def _cuda_latency(self): start_event = torch.cuda.Event(enable_timing=True) diff --git a/optimum_benchmark/trackers/memory.py b/optimum_benchmark/trackers/memory.py index 3cba23ca9..c126321a6 100644 --- a/optimum_benchmark/trackers/memory.py +++ b/optimum_benchmark/trackers/memory.py @@ -40,7 +40,8 @@ def _track_cuda_peak_memory(self): meminfo = nvml.nvmlDeviceGetMemoryInfo(handle) nvml.nvmlShutdown() - # At least for PyTorch, relying on meminfo.used is fine here as PyTorch does not deallocate its cache after running forward. + # At least for PyTorch, relying on meminfo.used is fine + # here as PyTorch does not deallocate its cache after running forward. self.peak_memory = max(self.peak_memory, meminfo.used) LOGGER.debug(f"Peak memory usage: {self.get_peak_memory()} MB") @@ -89,6 +90,7 @@ def run(self): self.connection.send(self.mem_usage) self.connection.close() + class PyTorchMemoryTracker(MemoryTracker): def __init__(self, backend): super().__init__(backend) @@ -97,7 +99,9 @@ def __init__(self, backend): self.hf_device_map = backend.pretrained_model.hf_device_map self.device_indexes = set(self.hf_device_map.values()) else: - self.device_indexes = {self.device.index if self.device.index is not None else 0} + self.device_indexes = { + self.device.index if self.device.index is not None else 0 + } # This variable is used only when CUDA device is used. self.peak_per_device = [0 for _ in range(len(self.device_indexes))] @@ -116,7 +120,7 @@ def _track_cuda_peak_memory(self): meminfo = nvml.nvmlDeviceGetMemoryInfo(handle) self.peak_per_device[i] = max(self.peak_per_device[i], meminfo.used) - + for i, peak_device in enumerate(self.peak_per_device): LOGGER.debug(f"Peak memory {i} usage: {peak_device * 1e-6} MB") diff --git a/optimum_benchmark/utils.py b/optimum_benchmark/utils.py index 16822ef5b..001c2f38f 100644 --- a/optimum_benchmark/utils.py +++ b/optimum_benchmark/utils.py @@ -1,7 +1,6 @@ from typing import Optional, List from logging import getLogger import subprocess -import importlib import platform import random import signal @@ -9,26 +8,17 @@ import re import os -from omegaconf import DictConfig import numpy as np import psutil -import torch LOGGER = getLogger("utils") def set_seed(seed: int) -> None: - # TODO: Should be devided into multiple functions - # each setting seeds for a backend random.seed(seed) np.random.seed(seed) os.environ["PYTHONHASHSEED"] = str(seed) - torch.manual_seed(seed) - torch.cuda.manual_seed_all(seed) - torch.backends.cudnn.deterministic = True - torch.backends.cudnn.benchmark = False - def bytes_to_mega_bytes(bytes: int) -> int: # Reference: https://en.wikipedia.org/wiki/Byte#Multiple-byte_units @@ -86,8 +76,8 @@ def check_no_process_is_running_on_cuda_device(device_ids: List[int]) -> None: if subprocess.check_output( [ "nvidia-smi", - f"--query-compute-apps=pid,used_memory", - f"--format=csv,noheader,nounits", + "--query-compute-apps=pid,used_memory", + "--format=csv,noheader,nounits", f"--id={device_id}", ] ) @@ -96,7 +86,8 @@ def check_no_process_is_running_on_cuda_device(device_ids: List[int]) -> None: ] ) - # TODO: It would be safer to run each run of a sweep in a subprocess. 
Although we can trust PyTorch to clear GPU memory when asked, + # TODO: It would be safer to run each run of a sweep in a subprocess. + # Although we can trust PyTorch to clear GPU memory when asked, # it is not a safe assumption to make for all backends. if len(pids_on_device_id) > 1 or ( len(pids_on_device_id) == 1 and os.getpid() not in pids_on_device_id @@ -138,8 +129,8 @@ def check_only_this_process_is_running_on_cuda_device( if subprocess.check_output( [ "nvidia-smi", - f"--query-compute-apps=pid,used_memory", - f"--format=csv,noheader,nounits", + "--query-compute-apps=pid,used_memory", + "--format=csv,noheader,nounits", f"--id={device_id}", ] ) @@ -148,7 +139,8 @@ def check_only_this_process_is_running_on_cuda_device( ] ) - # check if there is a process running on device_id that is not the current process + # check if there is a process running on + # device_id that is not the current process if len(pids_on_device_id) > 1: os.kill(pid, signal.SIGTERM) raise RuntimeError( @@ -161,58 +153,6 @@ def check_only_this_process_is_running_on_cuda_device( time.sleep(1) -# TODO: move this to onnxruntime backend, the only place using it -def infer_device_id(device: str) -> int: - """ - Infer the device id from the given device string. - """ - - if device == "cuda": - return torch.cuda.current_device() - elif torch.device(device).type == "cuda": - return torch.device(device).index - elif torch.device(device).type == "cpu": - return -1 - else: - raise ValueError(f"Unknown device '{device}'") - - -_NAME_TO_IMPORTPATH = { - "pytorch": "optimum_benchmark.backends.pytorch", - "openvino": "optimum_benchmark.backends.openvino", - "neural_compressor": "optimum_benchmark.backends.neural_compressor", - "onnxruntime": "optimum_benchmark.backends.onnxruntime", - "inference": "optimum_benchmark.benchmarks.inference", - "training": "optimum_benchmark.benchmarks.training", -} - -_NAME_TO_CLASS_NAME = { - "pytorch": "PyTorchConfig", - "openvino": "OVConfig", - "neural_compressor": "INCConfig", - "onnxruntime": "ORTConfig", - "inference": "InferenceConfig", - "training": "TrainingConfig", -} - - -def name_to_dataclass(name: str): - # We use a map name to import path to avoid importing everything here, especially every backend, to avoid to install all backends to run - # optimum-benchmark. 
- module = importlib.import_module(_NAME_TO_IMPORTPATH[name]) - dataclass_class = getattr(module, _NAME_TO_CLASS_NAME[name]) - return dataclass_class - - -def remap_to_correct_metadata(experiment: DictConfig): - for key, value in experiment.items(): - if isinstance(value, DictConfig) and hasattr(value, "name"): - experiment[key]._metadata.object_type = name_to_dataclass( - experiment[key].name - ) - return experiment - - DIFFUSION_TASKS = [ "stable-diffusion", "stable-diffusion-xl", diff --git a/tests/configs/base_config.yaml b/tests/configs/base_config.yaml index f691dc7c7..517f15c8b 100644 --- a/tests/configs/base_config.yaml +++ b/tests/configs/base_config.yaml @@ -6,7 +6,6 @@ defaults: - _self_ # for hydra 1.1 compatibility - override hydra/job_logging: colorlog # colorful logging - override hydra/hydra_logging: colorlog # colorful logging - - override hydra/launcher: joblib # hydra behavior configuration hydra: @@ -18,14 +17,6 @@ hydra: # we change the working directory during the run/sweep directory # this is useful for saving outputs in a separate directory chdir: true - launcher: - # we set the number of jobs to 2 since when using 1, joblib reuses the same process - n_jobs: 2 - prefer: processes - backend: multiprocessing - sweeper: - # now we force the sweeper to run one job at a time, achieving almost perfect isolation - max_batch_size: 1 backend: initial_isolation_check: false diff --git a/tests/configs/distributed_cuda_pytorch_training_bert_ddp.yaml b/tests/configs/distributed_cuda_pytorch_training_bert_ddp.yaml index cab23bb43..2db9b8661 100644 --- a/tests/configs/distributed_cuda_pytorch_training_bert_ddp.yaml +++ b/tests/configs/distributed_cuda_pytorch_training_bert_ddp.yaml @@ -4,15 +4,16 @@ defaults: - override benchmark: training experiment_name: distributed_cuda_pytorch_training_bert_ddp - -model: bert-base-uncased task: text-classification +model: bert-base-uncased device: cuda -benchmark: +backend: use_ddp: true + +benchmark: dataset_shapes: - dataset_size: 1200 + dataset_size: 120 sequence_length: 256 training_arguments: per_device_train_batch_size: 32 diff --git a/tests/test_cli.py b/tests/test_cli.py index 1ece9daf7..f9d4d39f7 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -32,7 +32,6 @@ def test_single_device_runs(config_file): "tests/configs", "--config-name", config_name, - # "--multirun", # usefull for isolation but makes debugging harder ], capture_output=True, ) @@ -55,7 +54,6 @@ def test_distributed_runs(config_file): "tests/configs", "--config-name", config_name, - # "--multirun", # usefull for isolation but makes debugging harder ], capture_output=True, env=my_env, From ea45d9267617456760178cfaefcc60f326d1556c Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 28 Aug 2023 04:41:37 +0200 Subject: [PATCH 4/8] now hydra can set cuda visible devices + better typing --- .gitignore | 1 + optimum_benchmark/backends/base.py | 205 +++---- .../backends/neural_compressor.py | 195 ------- .../{utils => neural_compressor}/__init__.py | 0 .../backends/neural_compressor/backend.py | 107 ++++ .../backends/neural_compressor/config.py | 88 +++ .../backends/neural_compressor/utils.py | 5 + optimum_benchmark/backends/onnxruntime.py | 505 ------------------ .../backends/onnxruntime/__init__.py | 0 .../backends/onnxruntime/backend.py | 332 ++++++++++++ .../backends/onnxruntime/config.py | 185 +++++++ .../backends/onnxruntime/utils.py | 40 ++ optimum_benchmark/backends/openvino.py | 190 ------- .../backends/openvino/__init__.py | 0 
.../backends/openvino/backend.py | 119 +++++ optimum_benchmark/backends/openvino/config.py | 64 +++ optimum_benchmark/backends/openvino/utils.py | 3 + .../backends/{utils => }/optimum_utils.py | 232 ++++---- optimum_benchmark/backends/pytorch.py | 451 ---------------- .../backends/pytorch/__init__.py | 0 optimum_benchmark/backends/pytorch/backned.py | 265 +++++++++ optimum_benchmark/backends/pytorch/config.py | 143 +++++ optimum_benchmark/backends/pytorch/utils.py | 35 ++ optimum_benchmark/backends/utils.py | 176 ++++++ .../backends/utils/base_utils.py | 92 ---- .../backends/utils/neural_compressor_utils.py | 39 -- .../backends/utils/onnxruntime_utils.py | 94 ---- .../backends/utils/openvino_utils.py | 14 - .../backends/utils/pytorch_utils.py | 78 --- optimum_benchmark/benchmarks/base.py | 18 +- optimum_benchmark/benchmarks/inference.py | 192 +++---- .../benchmarks/inference_utils.py | 37 -- optimum_benchmark/benchmarks/training.py | 29 +- .../benchmarks/training_utils.py | 103 ---- optimum_benchmark/benchmarks/utils.py | 87 +++ optimum_benchmark/env_utils.py | 38 ++ optimum_benchmark/experiment.py | 83 ++- .../generators/dataset_generator.py | 9 +- .../generators/input_generator.py | 50 +- .../generators/model_type_generator.py | 33 +- .../generators/task_generator.py | 27 +- optimum_benchmark/import_utils.py | 4 +- optimum_benchmark/profilers/fx_profiler.py | 13 +- optimum_benchmark/profilers/ort_profiler.py | 12 +- optimum_benchmark/report.py | 77 +-- optimum_benchmark/task_utils.py | 39 ++ optimum_benchmark/trackers/latency.py | 8 +- optimum_benchmark/trackers/memory.py | 28 +- optimum_benchmark/utils.py | 195 ------- pyproject.toml | 22 + requirements.txt | 1 + setup.py | 4 +- tests/configs/base_config.yaml | 17 +- ...stributed_cuda_pytorch_inference_gpt2.yaml | 5 +- ...ibuted_cuda_pytorch_training_bert_ddp.yaml | 8 +- ...ributed_cuda_pytorch_training_bert_dp.yaml | 2 +- tests/test_cli.py | 44 +- 57 files changed, 2212 insertions(+), 2631 deletions(-) delete mode 100644 optimum_benchmark/backends/neural_compressor.py rename optimum_benchmark/backends/{utils => neural_compressor}/__init__.py (100%) create mode 100644 optimum_benchmark/backends/neural_compressor/backend.py create mode 100644 optimum_benchmark/backends/neural_compressor/config.py create mode 100644 optimum_benchmark/backends/neural_compressor/utils.py delete mode 100644 optimum_benchmark/backends/onnxruntime.py create mode 100644 optimum_benchmark/backends/onnxruntime/__init__.py create mode 100644 optimum_benchmark/backends/onnxruntime/backend.py create mode 100644 optimum_benchmark/backends/onnxruntime/config.py create mode 100644 optimum_benchmark/backends/onnxruntime/utils.py delete mode 100644 optimum_benchmark/backends/openvino.py create mode 100644 optimum_benchmark/backends/openvino/__init__.py create mode 100644 optimum_benchmark/backends/openvino/backend.py create mode 100644 optimum_benchmark/backends/openvino/config.py create mode 100644 optimum_benchmark/backends/openvino/utils.py rename optimum_benchmark/backends/{utils => }/optimum_utils.py (67%) delete mode 100644 optimum_benchmark/backends/pytorch.py create mode 100644 optimum_benchmark/backends/pytorch/__init__.py create mode 100644 optimum_benchmark/backends/pytorch/backned.py create mode 100644 optimum_benchmark/backends/pytorch/config.py create mode 100644 optimum_benchmark/backends/pytorch/utils.py create mode 100644 optimum_benchmark/backends/utils.py delete mode 100644 optimum_benchmark/backends/utils/base_utils.py delete mode 100644 
optimum_benchmark/backends/utils/neural_compressor_utils.py delete mode 100644 optimum_benchmark/backends/utils/onnxruntime_utils.py delete mode 100644 optimum_benchmark/backends/utils/openvino_utils.py delete mode 100644 optimum_benchmark/backends/utils/pytorch_utils.py delete mode 100644 optimum_benchmark/benchmarks/inference_utils.py delete mode 100644 optimum_benchmark/benchmarks/training_utils.py create mode 100644 optimum_benchmark/benchmarks/utils.py create mode 100644 optimum_benchmark/env_utils.py create mode 100644 optimum_benchmark/task_utils.py delete mode 100644 optimum_benchmark/utils.py create mode 100644 pyproject.toml diff --git a/.gitignore b/.gitignore index 373be35de..dd49b40dd 100644 --- a/.gitignore +++ b/.gitignore @@ -159,6 +159,7 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ +.ruff_cache/ .vscode/ *.ipynb runs/ diff --git a/optimum_benchmark/backends/base.py b/optimum_benchmark/backends/base.py index 747e9ac37..6c71d0357 100644 --- a/optimum_benchmark/backends/base.py +++ b/optimum_benchmark/backends/base.py @@ -1,55 +1,52 @@ -from typing import Any, ClassVar, Dict, List, Optional, Union, TYPE_CHECKING -from multiprocessing import Process -from abc import abstractmethod, ABC -from dataclasses import dataclass -from logging import getLogger -import os import gc - - +import os +import random import shutil -from psutil import cpu_count -from diffusers import DiffusionPipeline -from optimum.exporters import TasksManager -from transformers import ( - AutoConfig, - AutoProcessor, - ProcessorMixin, - PreTrainedModel, - PretrainedConfig, - PreTrainedTokenizer, - ImageProcessingMixin, - FeatureExtractionMixin, +from abc import ABC +from dataclasses import dataclass +from logging import getLogger +from multiprocessing import Process +from typing import ( + TYPE_CHECKING, + Any, + Callable, + ClassVar, + Dict, + Generic, + List, + Optional, + TypeVar, + Union, ) +import numpy as np +import torch +from optimum.exporters import TasksManager +from psutil import cpu_count +from transformers import AutoConfig, AutoProcessor if TYPE_CHECKING: + from datasets import Dataset + from transformers import ( + Pipeline, + PretrainedConfig, + PreTrainedModel, + TrainerCallback, + TrainerState, + ) from transformers.utils import ModelOutput - from transformers import TrainerState + from .utils import PreTrainedProcessor -from .utils.base_utils import ( - extract_shapes_from_diffusion_pipeline, - extract_shapes_from_model_artifacts, -) -from ..utils import ( - DIFFUSION_TASKS, - TEXT_GENERATION_TASKS, +from ..task_utils import DIFFUSION_TASKS, TEXT_GENERATION_TASKS +from .utils import ( check_no_process_is_running_on_cuda_device, check_only_this_process_is_running_on_cuda_device, + extract_shapes_from_diffusion_pipeline, + extract_shapes_from_model_artifacts, ) -LOGGER = getLogger("backend") - -PreTrainedProcessor = Union[ - PreTrainedTokenizer, - ImageProcessingMixin, - FeatureExtractionMixin, - ProcessorMixin, -] - - @dataclass class BackendConfig(ABC): name: str @@ -57,6 +54,7 @@ class BackendConfig(ABC): _target_: str # backend options + seed: int = 42 inter_op_num_threads: Optional[int] = None intra_op_num_threads: Optional[int] = None @@ -77,19 +75,25 @@ def __post_init__(self): self.intra_op_num_threads = cpu_count() -class Backend(ABC): - name: str - config: ClassVar[BackendConfig] +LOGGER = getLogger("backend") + +BackendConfigT = TypeVar("BackendConfigT", bound=BackendConfig) + + +class 
Backend(Generic[BackendConfigT], ABC): + NAME: ClassVar[str] - pretrained_model: Union[PreTrainedModel, DiffusionPipeline] - pretrained_processor: Optional[PreTrainedProcessor] - pretrained_config: Optional[PretrainedConfig] + # instance variables withouth default values https://stackoverflow.com/a/44962662 + config: BackendConfigT + pretrained_model: Union["PreTrainedModel", "Pipeline"] + pretrained_processor: Optional["PreTrainedProcessor"] + pretrained_config: Optional["PretrainedConfig"] def __init__(self, model: str, task: str, device: str, hub_kwargs: Dict[str, Any]): - self.model = model self.task = task - self.device = device + self.model = model self.hub_kwargs = hub_kwargs + self.device = torch.device(device) if self.is_diffusion_pipeline(): # for pipelines @@ -99,8 +103,7 @@ def __init__(self, model: str, task: str, device: str, hub_kwargs: Dict[str, Any else: # for models self.pretrained_config = AutoConfig.from_pretrained( - pretrained_model_name_or_path=self.model, - **self.hub_kwargs, + pretrained_model_name_or_path=self.model, **self.hub_kwargs ) self.model_type = self.pretrained_config.model_type @@ -108,18 +111,15 @@ def __init__(self, model: str, task: str, device: str, hub_kwargs: Dict[str, Any # the processor sometimes contains information about the model's # input shapes that's not available in the config self.pretrained_processor = AutoProcessor.from_pretrained( - pretrained_model_name_or_path=self.model, - **self.hub_kwargs, + pretrained_model_name_or_path=self.model, **self.hub_kwargs ) except ValueError: LOGGER.warning("Could not find the model's preprocessor") self.pretrained_processor = None - # we're using this one as the default model_class which is used - # for exporting the model to onnx for example. Although does suppose that - # the model weights are pytorch weights so we might need to change somehow. self.automodel_class = TasksManager.get_model_class_for_task( task=self.task, + framework="pt", model_type=self.model_type, ) @@ -131,18 +131,15 @@ def is_diffusion_pipeline(self) -> bool: def check_initial_isolation(self) -> None: if self.device.type == "cuda": - cuda_devices = os.environ.get("CUDA_VISIBLE_DEVICES") + cuda_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None) if cuda_devices is None: LOGGER.warning( - "Asked to check the initial device isolation, " - "but the variable CUDA_VISIBLE_DEVICES was not set. " - "Defaulting to checking on the first device." + "Asked to check the initial device(s) isolation, but the variable CUDA_VISIBLE_DEVICES was not set. " + "Defaulting to checking the main device only." ) device_ids = {self.device.index if self.device.index is not None else 0} else: - device_ids = { - int(device_index) for device_index in cuda_devices.split(",") - } + device_ids = {int(device_index) for device_index in cuda_devices.split(",")} check_no_process_is_running_on_cuda_device(device_ids) def check_continuous_isolation(self) -> None: @@ -150,15 +147,12 @@ def check_continuous_isolation(self) -> None: cuda_devices = os.environ.get("CUDA_VISIBLE_DEVICES") if cuda_devices is None: LOGGER.warning( - "Asked to check the continuous device isolation, " - "but the variable CUDA_VISIBLE_DEVICES was not set. " - "Defaulting to checking on the first device." + "Asked to check the continuous device(s) isolation, but the variable CUDA_VISIBLE_DEVICES was not set. " + "Defaulting to checking the main device only." 
) device_ids = {self.device.index if self.device.index is not None else 0} else: - device_ids = { - int(device_index) for device_index in cuda_devices.split(",") - } + device_ids = {int(device_index) for device_index in cuda_devices.split(",")} self.isolation_thread = Process( target=check_only_this_process_is_running_on_cuda_device, @@ -167,23 +161,36 @@ def check_continuous_isolation(self) -> None: ) self.isolation_thread.start() - @abstractmethod - def configure(self, config: BackendConfig) -> None: - LOGGER.info(f"Configuring {config.name} backend") + def configure(self, config: BackendConfigT) -> None: + LOGGER.info(f"Configuring {self.NAME} backend") + # storing config self.config = config + # seeding backend + self.seed() + # isolation options if self.config.initial_isolation_check: - LOGGER.info("\t+ Checking initial device isolation") + LOGGER.info("\t+ Checking initial device(s) isolation") self.check_initial_isolation() if self.config.continous_isolation_check: - LOGGER.info("\t+ Checking contineous device isolation") + LOGGER.info("\t+ Checking contineous device(s) isolation") self.check_continuous_isolation() # clean up options if self.config.delete_cache: LOGGER.info("\t+ Model cache will be deleted after benchmark") + def seed(self) -> None: + # https://pytorch.org/docs/stable/notes/randomness.html + random.seed(self.config.seed) + np.random.seed(self.config.seed) + torch.manual_seed(self.config.seed) + torch.cuda.manual_seed_all(self.config.seed) # safe to call + # torch.use_deterministic_algorithms() # might throw an error + # torch.backends.cudnn.deterministic = True # same as above + # torch.backends.cudnn.benchmark = False # might reduce performance + # compiling in openvino requires input shapes def prepare_for_inference(self, input_shapes: Dict[str, int]) -> Dict[str, Any]: pass @@ -193,37 +200,20 @@ def prepare_for_profiling(self, input_names: List[str]) -> Dict[str, Any]: pass def forward(self, input: Dict[str, Any], **kwargs) -> "ModelOutput": - raise NotImplementedError("Backend must implement forward method") + return self.pretrained_model(**input, **kwargs) def generate(self, input: Dict[str, Any], **kwargs) -> "ModelOutput": - raise NotImplementedError("Backend must implement generate method") - - def train(self) -> "TrainerState": + return self.pretrained_model.generate(**input, **kwargs) + + def train( + self, + training_dataset: "Dataset", + training_arguments: Dict[str, Any], + training_callbacks: List["TrainerCallback"], + training_data_collator: Callable, + ) -> "TrainerState": raise NotImplementedError("Backend must implement train method") - def delete_pretrained_model(self) -> None: - try: - del self.pretrained_model - except AttributeError: - # benchmark might fail before the model is loaded - pass - - gc.collect() - - def delete_model_cache(self) -> None: - model_cache_path = "models--" + self.model.replace("/", "--") - model_cache_path = os.path.join( - os.path.expanduser("~/.cache/huggingface/hub"), model_cache_path - ) - shutil.rmtree(model_cache_path, ignore_errors=True) - - def clean(self) -> None: - LOGGER.info(f"Cleaning {self.config.name} backend") - self.delete_pretrained_model() - - if self.config.delete_cache: - self.delete_model_cache() - @property def model_shapes(self) -> Dict[str, int]: if self.is_diffusion_pipeline(): @@ -237,3 +227,22 @@ def model_shapes(self) -> Dict[str, int]: ) return model_shapes + + def delete_pretrained_model(self) -> None: + if hasattr(self, "pretrained_model"): + del self.pretrained_model + + gc.collect() + + 
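
To make the new generic base class above easier to follow, here is a minimal sketch of how a concrete backend would plug into it. It is illustrative only and not part of the patch: `DummyBackend`, `DummyConfig` and the `_target_` path are hypothetical names, while the `configure` pattern mirrors the one used by the real backends in this patch.

# Illustrative sketch only (not part of this patch): a minimal concrete backend
# built against the refactored Backend(Generic[BackendConfigT], ABC) base class.
from dataclasses import dataclass
from typing import Any, Dict

from optimum_benchmark.backends.base import Backend, BackendConfig


@dataclass
class DummyConfig(BackendConfig):
    name: str = "dummy"
    version: str = "0.0.0"
    _target_: str = "my_package.dummy_backend.DummyBackend"  # hypothetical import path


class DummyBackend(Backend[DummyConfig]):
    NAME = "dummy"

    def __init__(self, model: str, task: str, device: str, hub_kwargs: Dict[str, Any]) -> None:
        # the base class resolves the config, processor and automodel class for the task
        super().__init__(model, task, device, hub_kwargs)

    def configure(self, config: DummyConfig) -> None:
        # the base configure() stores the config, seeds the RNGs and runs the isolation checks
        super().configure(config)
        # load the model with the automodel class inferred by the base class
        self.pretrained_model = self.automodel_class.from_pretrained(self.model, **self.hub_kwargs)

    # forward() and generate() fall back to the base implementations,
    # which call self.pretrained_model directly
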
def delete_model_cache(self) -> None: + LOGGER.info("\t+ Deleting model cache") + model_cache_path = f"models/{self.model}".replace("/", "--") + model_cache_path = os.path.join(os.path.expanduser("~/.cache/huggingface/hub"), model_cache_path) + shutil.rmtree(model_cache_path, ignore_errors=True) + + def clean(self) -> None: + LOGGER.info(f"Cleaning {self.NAME} backend") + self.delete_pretrained_model() + + if self.config.delete_cache: + self.delete_model_cache() diff --git a/optimum_benchmark/backends/neural_compressor.py b/optimum_benchmark/backends/neural_compressor.py deleted file mode 100644 index a1ac95f73..000000000 --- a/optimum_benchmark/backends/neural_compressor.py +++ /dev/null @@ -1,195 +0,0 @@ -from typing import Dict, Optional, Any, TYPE_CHECKING -from tempfile import TemporaryDirectory -from dataclasses import dataclass -from logging import getLogger - -import torch -from torch import Tensor -from hydra.utils import get_class -from omegaconf import DictConfig, OmegaConf -from optimum.intel.neural_compressor.quantization import INCQuantizer -from optimum.intel.neural_compressor.utils import _HEAD_TO_AUTOMODELS -from neural_compressor import __version__ as neural_compressor_version -from neural_compressor.config import ( - AccuracyCriterion, - TuningCriterion, - PostTrainingQuantConfig, -) - -if TYPE_CHECKING: - from transformers.utils import ModelOutput - -from .base import Backend, BackendConfig -from .utils.neural_compressor_utils import ( - DEFAULT_QUANTIZATION_CONFIG, - DEFAULT_CALIBRATION_CONFIG, -) - - -LOGGER = getLogger("neural_compressor") - -OmegaConf.register_new_resolver("ptq_is_static", lambda approach: approach == "static") - - -@dataclass -class INCConfig(BackendConfig): - name: str = "neural_compressor" - version: str = neural_compressor_version - _target_: str = "optimum_benchmark.backends.neural_compressor.INCBackend" - - # export options - no_weights: bool = False - - # quantization options - quantization: bool = False - quantization_config: Optional[Dict[str, Any]] = None - - # calibration options - calibration: bool = False - calibration_config: Optional[Dict[str, Any]] = None - - def __post_init__(self): - if self.no_weights: - # TODO: implement no_weights for neural_compressor backend if possible - raise NotImplementedError( - "no_weights is not supported for neural_compressor backend" - ) - - if self.quantization: - self.quantization_config = OmegaConf.merge( - self.quantization_config if self.quantization_config else {}, - DEFAULT_QUANTIZATION_CONFIG, - ) - if self.calibration_config["approach"] == "static": - self.calibration = True - - if self.calibration: - self.calibration_config = OmegaConf.merge( - self.calibration_config if self.calibration_config else {}, - DEFAULT_CALIBRATION_CONFIG, - ) - - -class INCBackend(Backend): - name: str = "neural_compressor" - config: INCConfig - - def __init__( - self, model: str, task: str, device: str, hub_kwargs: DictConfig - ) -> None: - super().__init__(model, task, device, hub_kwargs) - self.device = torch.device(device) - - assert self.task in _HEAD_TO_AUTOMODELS, ( - f"INCBackend does not support task {self.task} yet. 
" - f"Supported tasks are: {list(_HEAD_TO_AUTOMODELS.keys())}" - ) - - self.incmodel_class = get_class( - f"optimum.intel.neural_compressor.{_HEAD_TO_AUTOMODELS[self.task]}" - ) - LOGGER.info( - f"\t+ Infered INCModel class {self.incmodel_class.__name__} " - f"for task {self.task} and model_type {self.model_type}" - ) - - def configure(self, config: INCConfig) -> None: - super().configure(config) - - if self.config.quantization: - self.config.quantization_config["accuracy_criterion"] = AccuracyCriterion( - **self.config.quantization_config["accuracy_criterion"] - ) - self.config.quantization_config["tuning_criterion"] = TuningCriterion( - **self.config.quantization_config["tuning_criterion"] - ) - self.quantization_config = PostTrainingQuantConfig( - **self.config.quantization_config - ) - - if self.config.calibration: - self.config.calibration_config["preprocess_class"] = get_class( - self.config.calibration_config["preprocess_class"] - ) - self.config.calibration_config[ - "preprocess_function" - ] = self.config.calibration_config["preprocess_class"]( - model_name_or_path=self.model - ) - self.config.calibration_config.pop("preprocess_class") - - with TemporaryDirectory() as tmpdirname: - if self.config.quantization: - self.load_and_quantize_automodel(tmpdirname) - else: - self.load_incmodel() - - def load_and_quantize_automodel(self, tmpdirname: str) -> None: - LOGGER.info("\t+ Loading pretrained AutoModel") - model = self.automodel_class.from_pretrained(self.model, **self.hub_kwargs) - LOGGER.info("\t+ Creating quantizer") - quantizer = INCQuantizer.from_pretrained( - model, - eval_fn=None, - calibration_fn=None, - task=self.task, - ) - - if self.config.calibration: - LOGGER.info("\t+ Loading calibration dataset") - calibration_dataset = quantizer.get_calibration_dataset( - **self.config.calibration_config - ) - else: - calibration_dataset = None - - LOGGER.info("\t+ Attempting quantization") - quantizer.quantize( - quantization_config=self.config.quantization_config, - save_directory=f"{tmpdirname}/quantized", - calibration_dataset=calibration_dataset, - # default values - batch_size=8, - data_collator=None, - remove_unused_columns=True, - file_name=None, - ) - - LOGGER.info("\t+ Loading quantized INCModel") - self.pretrained_model = self.incmodel_class.from_pretrained( - model_name_or_path=f"{tmpdirname}/quantized", - ) - - def load_incmodel(self) -> None: - if self.is_diffusion_pipeline(): - self.pretrained_model = self.incmodel_class.from_pretrained( - model_name_or_path=self.model, - **self.hub_kwargs, - ) - self.pretrained_model.to(self.device) - elif self.is_text_generation_model(): - self.pretrained_model = self.incmodel_class.from_pretrained( - # for some reason only causalLM expects - # model_id instead of model_name_or_path - model_id=self.model, - device_map=self.device, - **self.hub_kwargs, - ) - else: - self.pretrained_model = self.incmodel_class.from_pretrained( - # for some reason only causalLM expects - # model_id instead of model_name_or_path - model_name_or_path=self.model, - device_map=self.device, - **self.hub_kwargs, - ) - - def forward(self, input: Dict[str, Tensor], **kwargs) -> "ModelOutput": - output = self.pretrained_model(**input, **kwargs) - - return output - - def generate(self, input: Dict[str, Tensor], **kwargs) -> "ModelOutput": - output = self.pretrained_model.generate(**input, **kwargs) - - return output diff --git a/optimum_benchmark/backends/utils/__init__.py b/optimum_benchmark/backends/neural_compressor/__init__.py similarity index 100% rename 
from optimum_benchmark/backends/utils/__init__.py rename to optimum_benchmark/backends/neural_compressor/__init__.py diff --git a/optimum_benchmark/backends/neural_compressor/backend.py b/optimum_benchmark/backends/neural_compressor/backend.py new file mode 100644 index 000000000..5e35dffb4 --- /dev/null +++ b/optimum_benchmark/backends/neural_compressor/backend.py @@ -0,0 +1,107 @@ +from logging import getLogger +from tempfile import TemporaryDirectory +from typing import Any, Dict + +from hydra.utils import get_class +from neural_compressor.config import ( + AccuracyCriterion, + PostTrainingQuantConfig, + TuningCriterion, +) +from optimum.intel.neural_compressor.quantization import INCQuantizer + +from ..base import Backend +from .config import INCConfig +from .utils import TASKS_TO_INCMODELS + +LOGGER = getLogger("neural_compressor") + + +class INCBackend(Backend[INCConfig]): + NAME: str = "neural_compressor" + + def __init__(self, model: str, task: str, device: str, hub_kwargs: Dict[str, Any]) -> None: + super().__init__(model, task, device, hub_kwargs) + self.validate_device() + self.validate_task() + + self.incmodel_class = get_class(TASKS_TO_INCMODELS[self.task]) + LOGGER.info( + f"\t+ Infered INCModel {self.incmodel_class.__name__} for task {self.task} and model_type {self.model_type}" + ) + + def validate_device(self) -> None: + if self.device.type != "cpu": + raise ValueError(f"INCBackend only supports CPU devices, got {self.device.type}") + + def validate_task(self) -> None: + if self.task not in TASKS_TO_INCMODELS: + raise NotImplementedError(f"INCBackend does not support task {self.task}") + + def configure(self, config: INCConfig) -> None: + super().configure(config) + + self.tmpdir = TemporaryDirectory() + + if self.config.ptq_quantization: + self.load_automodel_from_pretrained() + self.quantize_automodel() + self.delete_pretrained_model() + + self.load_incmodel_from_pretrained() + + def load_automodel_from_pretrained(self) -> None: + LOGGER.info("\t+ Loading AutoModel") + self.pretrained_model = self.automodel_class.from_pretrained(self.model, **self.hub_kwargs) + + def load_incmodel_from_pretrained(self) -> None: + LOGGER.info("\t+ Loading INCModel") + self.pretrained_model = self.incmodel_class.from_pretrained(self.model, **self.hub_kwargs) + + def quantize_automodel(self) -> None: + LOGGER.info("\t+ Attempting to quantize model") + quantized_model_path = f"{self.tmpdir.name}/quantized" + LOGGER.info("\t+ Processing quantization config") + ptq_quantization_config = self.config.ptq_quantization_config.copy() + ptq_quantization_config["accuracy_criterion"] = AccuracyCriterion( + **ptq_quantization_config["accuracy_criterion"] + ) + ptq_quantization_config["tuning_criterion"] = TuningCriterion(**ptq_quantization_config["tuning_criterion"]) + ptq_quantization_config = PostTrainingQuantConfig(**ptq_quantization_config) + LOGGER.info("\t+ Creating quantizer") + quantizer = INCQuantizer.from_pretrained( + self.pretrained_model, + task=self.task, + seed=self.config.seed, + # TODO: add support for these + eval_fn=None, + calibration_fn=None, + ) + + if self.config.calibration: + LOGGER.info("\t+ Processing calibration config") + calibration_config = self.config.calibration_config.copy() + preprocess_class = get_class(calibration_config.pop("preprocess_class")) + calibration_config["preprocess_function"] = preprocess_class(model_name_or_path=self.model) + LOGGER.info("\t+ Loading calibration dataset") + calibration_dataset = quantizer.get_calibration_dataset(**calibration_config) + 
else: + calibration_dataset = None + + LOGGER.info("\t+ Quantizing model") + quantizer.quantize( + quantization_config=ptq_quantization_config, + save_directory=quantized_model_path, + calibration_dataset=calibration_dataset, + # TODO: add support for these + remove_unused_columns=True, + data_collator=None, + file_name=None, + batch_size=8, + ) + self.model = quantized_model_path + + def clean(self) -> None: + super().clean() + if hasattr(self, "tmpdir"): + self.tmpdir.cleanup() diff --git a/optimum_benchmark/backends/neural_compressor/config.py b/optimum_benchmark/backends/neural_compressor/config.py new file mode 100644 index 000000000..1a1fcb845 --- /dev/null +++ b/optimum_benchmark/backends/neural_compressor/config.py @@ -0,0 +1,88 @@ +import importlib.metadata +from dataclasses import dataclass, field +from typing import Any, Dict + +from omegaconf import OmegaConf + +from ..base import BackendConfig + +OmegaConf.register_new_resolver( + "neural_compressor_version", + lambda: importlib.metadata.version("neural_compressor"), +) + +# https://github.com/intel/neural-compressor/blob/master/neural_compressor/config.py#L490 +ACCURACY_CRITERION_CONFIG = { + "higher_is_better": True, + "criterion": "relative", + "tolerable_loss": 0.01, +} + +# https://github.com/intel/neural-compressor/blob/master/neural_compressor/config.py#L593 +TUNING_CRITERION_CONFIG = { + "strategy": "basic", + "strategy_kwargs": None, + "timeout": 0, + "max_trials": 100, + "objective": "performance", +} + +# https://github.com/intel/neural-compressor/blob/master/neural_compressor/config.py#L1242 +PTQ_QUANTIZATION_CONFIG = { + "device": "cpu", + "backend": "default", + "domain": "auto", + "recipes": {}, + "quant_format": "default", + "inputs": [], + "outputs": [], + "approach": "static", + "calibration_sampling_size": [100], + "op_type_dict": None, + "op_name_dict": None, + "reduce_range": None, + "example_inputs": None, + "excluded_precisions": [], + "quant_level": "auto", + "accuracy_criterion": ACCURACY_CRITERION_CONFIG, + "tuning_criterion": TUNING_CRITERION_CONFIG, + "diagnosis": False, +} + + +CALIBRATION_CONFIG = { + "dataset_name": "glue", + "num_samples": 300, + "dataset_config_name": "sst2", + "dataset_split": "train", + "preprocess_batch": True, + "preprocess_class": "optimum_benchmark.preprocessors.glue.GluePreprocessor", +} + + +@dataclass +class INCConfig(BackendConfig): + name: str = "neural_compressor" + version: str = "${neural_compressor_version:}" + _target_: str = "optimum_benchmark.backends.neural_compressor.backend.INCBackend" + + # post-training quantization options + ptq_quantization: bool = False + ptq_quantization_config: Dict[str, Any] = field(default_factory=dict) + + # calibration options + calibration: bool = False + calibration_config: Dict[str, Any] = field(default_factory=dict) + + def __post_init__(self): + if self.ptq_quantization: + self.ptq_quantization_config = OmegaConf.to_container( + OmegaConf.merge(PTQ_QUANTIZATION_CONFIG, self.ptq_quantization_config) + ) + if self.ptq_quantization_config["approach"] == "static" and not self.calibration: + raise ValueError("Calibration must be enabled when using static quantization.") + + if self.calibration: + self.calibration_config = OmegaConf.to_container( + OmegaConf.merge(CALIBRATION_CONFIG, self.calibration_config) + ) diff --git a/optimum_benchmark/backends/neural_compressor/utils.py b/optimum_benchmark/backends/neural_compressor/utils.py new file mode 100644 index 000000000..beb999771 --- /dev/null +++ 
b/optimum_benchmark/backends/neural_compressor/utils.py @@ -0,0 +1,5 @@ +from optimum.intel.neural_compressor.utils import _HEAD_TO_AUTOMODELS + +TASKS_TO_INCMODELS = { + task: f"optimum.intel.neural_compressor.{incmodel_name}" for task, incmodel_name in _HEAD_TO_AUTOMODELS.items() +} diff --git a/optimum_benchmark/backends/onnxruntime.py b/optimum_benchmark/backends/onnxruntime.py deleted file mode 100644 index 57e811706..000000000 --- a/optimum_benchmark/backends/onnxruntime.py +++ /dev/null @@ -1,505 +0,0 @@ -from typing import Any, Callable, Dict, List, Optional, TYPE_CHECKING -from tempfile import TemporaryDirectory -from dataclasses import dataclass -from logging import getLogger -from datasets import Dataset -import os - - -import torch -from torch import Tensor -from omegaconf import OmegaConf -from hydra.utils import get_class -from onnxruntime import SessionOptions -from accelerate import init_empty_weights -from optimum.pipelines import ORT_SUPPORTED_TASKS -from onnxruntime import __version__ as onnxruntime_version -from optimum.onnxruntime.configuration import ( - OptimizationConfig, - QuantizationConfig, - AutoCalibrationConfig, - AutoOptimizationConfig, - AutoQuantizationConfig, -) -from optimum.onnxruntime import ( - ORTOptimizer, - ORTQuantizer, - ORTTrainer, - ORTTrainingArguments, -) - -if TYPE_CHECKING: - from transformers import TrainerCallback, TrainerState - from transformers.modeling_outputs import ModelOutput - - -from .base import Backend, BackendConfig -from .utils.optimum_utils import main_export -from .utils.pytorch_utils import randomize_weights -from ..profilers.ort_profiler import ORTProfilingWrapper -from .utils.onnxruntime_utils import ( - format_ort_quantization_dict, - infer_device_id, - DEFAULT_OPTIMIZATION_CONFIG, - DEFAULT_QUANTIZATION_CONFIG, - DEFAULT_CALIBRATION_CONFIG, -) - - -OmegaConf.register_new_resolver( - "is_gpu", - lambda device: "cuda" in device.lower() or "tensorrt" in device.lower(), -) -OmegaConf.register_new_resolver( - "is_profiling", - lambda benchmark_name: benchmark_name == "profiling", -) -OmegaConf.register_new_resolver( - "infer_provider", - lambda device: f"{torch.device(device).type.upper()}ExecutionProvider", -) -OmegaConf.register_new_resolver( - "infer_device_id", - lambda device: infer_device_id(device), -) - -LOGGER = getLogger("onnxruntime") - - -@dataclass -class ORTConfig(BackendConfig): - name: str = "onnxruntime" - version: str = onnxruntime_version - _target_: str = "optimum_benchmark.backends.onnxruntime.ORTBackend" - - # export options - export: bool = True - no_weights: bool = False - use_merged: bool = False - use_cache: bool = True - torch_dtype: Optional[str] = None - - # provider options - provider: str = "${infer_provider:${device}}" - provider_options: Optional[Dict] = None - # TODO: deprecate device_id in favor of provider_options - device_id: Optional[int] = "${infer_device_id:${device}}" - - # inference options - use_io_binding: bool = "${is_gpu:${device}}" - session_options: Optional[Dict] = None - # TODO: deprecate enable_profiling in favor of session_options - enable_profiling: bool = "${is_profiling:${benchmark.name}}" - - # optimization options - optimization: bool = False - optimization_config: Optional[Dict] = None - - # O1, O2, O3, O4 - auto_optimization: Optional[str] = None - auto_optimization_config: Optional[Dict] = None - - # quantization options - quantization: bool = False - quantization_config: Optional[Dict] = None - - # arm64,avx2,avx512,avx512_vnni,tensorrt - auto_quantization: 
Optional[str] = None - auto_quantization_config: Optional[Dict] = None - - # calibration options - calibration: bool = False - calibration_config: Optional[Dict] = None - - # this will skip exporting the model and will use automodel with trainer - use_ortmodel: bool = "${is_inference:${benchmark.name}}" - - def __post_init__(self): - if self.optimization: - self.optimization_config = OmegaConf.merge( - self.optimization_config or {}, - DEFAULT_OPTIMIZATION_CONFIG, - ) - - if self.auto_optimization is not None: - self.auto_optimization_config = OmegaConf.merge( - self.auto_optimization_config or {}, - DEFAULT_OPTIMIZATION_CONFIG, - ) - self.auto_optimization_config.pop("optimization_level", None) - self.auto_optimization_config[ - "for_gpu" - ] = self.auto_optimization_config.pop("optimize_for_gpu") - - if self.quantization: - self.quantization_config = OmegaConf.merge( - self.quantization_config or {}, - DEFAULT_QUANTIZATION_CONFIG, - ) - - # auto quantization is needs specific config for each type - # if self.auto_quantization is not None: - # self.auto_quantization_config = OmegaConf.merge( - # self.auto_quantization_config or {}, - # DEFAULT_QUANTIZATION_CONFIG, - # ) - - if self.quantization_config is not None: - self.calibration = self.quantization_config["is_static"] - - if self.auto_quantization_config is not None: - self.calibration = self.auto_quantization_config["is_static"] - - if self.calibration: - self.calibration_config = OmegaConf.merge( - self.calibration_config or {}, - DEFAULT_CALIBRATION_CONFIG, - ) - - if self.device_id is not None: - LOGGER.warning( - "device_id is deprecated, please use provider_options instead" - ) - self.provider_options = OmegaConf.merge( - self.provider_options or {}, - {"device_id": self.device_id}, - ) - - if self.enable_profiling is not None: - LOGGER.warning( - "enable_profiling is deprecated, please use session_options instead" - ) - self.session_options = OmegaConf.merge( - self.session_options or {}, - {"enable_profiling": self.enable_profiling}, - ) - - -class ORTBackend(Backend): - name: str = "onnxruntime" - config: ORTConfig - - def __init__( - self, model: str, task: str, device: str, hub_kwargs: Dict[str, Any] - ) -> None: - super().__init__(model, task, device, hub_kwargs) - self.device = torch.device(device) - - if self.task == "stable-diffusion": - self.ortmodel_class = get_class( - "optimum.onnxruntime.ORTStableDiffusionPipeline" - ) - elif self.task == "stable-diffusion-xl": - self.ortmodel_class = get_class( - "optimum.onnxruntime.ORTStableDiffusionXLPipeline" - ) - elif self.task in ORT_SUPPORTED_TASKS: - self.ortmodel_class = ORT_SUPPORTED_TASKS[self.task]["class"][0] - else: - raise ValueError(f"Unsupported task {self.task}") - - LOGGER.info( - f"\t+ Infered ORTModel class {self.ortmodel_class.__name__} " - f"for task {self.task} and model_type {self.model_type}" - ) - - def configure(self, config: ORTConfig) -> None: - super().configure(config) - - # session options - session_options = SessionOptions() - if self.config.intra_op_num_threads is not None: - LOGGER.info( - f"\t+ Setting intra_op_num_threads({config.intra_op_num_threads})" - ) - self.config.session_options.intra_op_num_threads = ( - self.config.intra_op_num_threads - ) - if self.config.inter_op_num_threads is not None: - LOGGER.info( - f"\t+ Setting inter_op_num_threads({config.inter_op_num_threads})" - ) - self.config.session_options.inter_op_num_threads = ( - self.config.inter_op_num_threads - ) - for key, value in self.config.session_options.items(): - 
setattr(session_options, key, value) - self.config.session_options = session_options - - # Set torch dtype - self.config.torch_dtype = ( - getattr(torch, self.config.torch_dtype) # in case of torch.dtype - if self.config.torch_dtype is not None - and hasattr(torch, self.config.torch_dtype) - else self.config.torch_dtype - ) - - with TemporaryDirectory() as tmpdirname: - if self.config.use_ortmodel: - if self.config.no_weights: - self.load_ortmodel_from_config(tmpdirname) - else: - self.load_ortmodel_from_pretrained(tmpdirname) - else: - if self.config.no_weights: - self.load_automodel_from_config() - else: - self.load_automodel_from_pretrained() - - def load_ortmodel_from_config(self, tmpdirname: str) -> None: - LOGGER.info("\t+ Creating random weights model") - self.load_automodel_from_config() - - LOGGER.info("\t+ Exporting model to onnx") - main_export( - model_name_or_path=self.model, - output=f"{tmpdirname}/exported_model", - # with "auto" the taks manager will infer the same task - # we're using but will add "-with-past" when possible - task="auto", - device=self.device.type, - fp16=self.config.torch_dtype == torch.float16, - optimize=self.config.auto_optimization, - no_post_process=not self.config.use_merged, - do_validation=False, - **self.hub_kwargs, - # we hijack the model instantiation and use our random weights model - model=self.pretrained_model, - ) - self.delete_pretrained_model() - - LOGGER.info("\t+ Loading exported model with ORTModel") - self.pretrained_model = self.ortmodel_class.from_pretrained( - model_id=f"{tmpdirname}/exported_model", - session_options=self.config.session_options, - use_io_binding=self.config.use_io_binding, - provider=self.config.provider, - provider_options=self.config.provider_options, - **( - { - "use_merged": self.config.use_merged, - "use_cache": self.config.use_cache, - } - if self.is_text_generation_model() - else {} - ), - export=False, - **self.hub_kwargs, - ) - - if self.config.optimization: - raise NotImplementedError( - "Only AutoOptimization is supported when " - "loading a model with random weights" - ) - - if self.config.quantization or self.config.auto_quantization is not None: - self.quantize(tmpdirname) - - def load_ortmodel_from_pretrained(self, tmpdirname: str) -> None: - if ( - self.config.torch_dtype is not None - and self.config.torch_dtype != torch.float32 - ): - raise NotImplementedError( - "Loading with ORTModel is only supported " - "with torch_dtype float32 for now" - ) - - self.pretrained_model = self.ortmodel_class.from_pretrained( - model_id=self.model, - session_options=self.config.session_options, - use_io_binding=self.config.use_io_binding, - provider=self.config.provider, - provider_options=self.config.provider_options, - export=self.config.export, - **( - { - "use_merged": self.config.use_merged, - "use_cache": self.config.use_cache, - } - if self.is_text_generation_model() - else {} - ), - **self.hub_kwargs, - ) - - if self.config.optimization or self.config.auto_optimization is not None: - self.optimize(tmpdirname) - - if self.config.quantization or self.config.auto_quantization is not None: - self.quantize(tmpdirname) - - def optimize(self, tmpdirname: str) -> None: - if self.config.auto_optimization is not None: - LOGGER.info(f"\t+ Using auto optimization {self.config.auto_optimization}") - optimization_dict = OmegaConf.to_container( - self.config.auto_optimization_config, resolve=True - ) - LOGGER.info("\t+ Setting auto optimization parameters:") - for key, value in optimization_dict.items(): # type: ignore 
- LOGGER.info(f"\t\t+ {key}: {value}") - - optimization_config = AutoOptimizationConfig.with_optimization_level( - optimization_level=self.config.auto_optimization, **optimization_dict - ) - else: - optimization_dict = OmegaConf.to_container( - self.config.optimization_config, resolve=True - ) - LOGGER.info("\t+ Setting optimization parameters:") - for key, value in optimization_dict.items(): # type: ignore - LOGGER.info(f"\t\t+ {key}: {value}") - optimization_config = OptimizationConfig(**optimization_dict) - - LOGGER.info("\t+ Attempting optimization") - optimizer = ORTOptimizer.from_pretrained(self.pretrained_model) - optimizer.optimize( - save_dir=f"{tmpdirname}/optimized", - optimization_config=optimization_config, - ) - self.delete_pretrained_model() - - LOGGER.info("\t+ Loading optimized model") - self.pretrained_model = self.ortmodel_class.from_pretrained( - model_id=f"{tmpdirname}/optimized", - session_options=self.config.session_options, - use_io_binding=self.config.use_io_binding, - provider=self.config.provider, - provider_options=self.config.provider_options, - ) - - def quantize(self, tmpdirname: str) -> None: - if self.config.auto_quantization is not None: - LOGGER.info(f"\t+ Using auto quantization {self.config.auto_quantization}") - auto_quantization_config_class = getattr( - AutoQuantizationConfig, self.config.auto_quantization - ) - quantization_dict = OmegaConf.to_container( - self.config.auto_quantization_config, resolve=True - ) - quantization_dict = format_ort_quantization_dict(quantization_dict) - quantization_config = auto_quantization_config_class(**quantization_dict) - - else: - LOGGER.info("\t+ Using manual quantization") - quantization_dict = OmegaConf.to_container( - self.config.quantization_config, resolve=True - ) - quantization_dict = format_ort_quantization_dict(quantization_dict) - quantization_config = QuantizationConfig(**quantization_dict) - - LOGGER.info("\t+ Attempting quantization") - model_dir = self.pretrained_model.model_save_dir - components = [file for file in os.listdir(model_dir) if file.endswith(".onnx")] - for component in components: - LOGGER.info(f"\t+ Quantizing {component}") - quantizer = ORTQuantizer.from_pretrained(model_dir, file_name=component) - - if self.config.calibration: - preprocess_class = get_class( - self.config.calibration_config.preprocess_class - ) - preprocess_function = preprocess_class(model_name_or_path=self.model) - - calibration_dataset = quantizer.get_calibration_dataset( - dataset_name=self.config.calibration_config.dataset_name, - num_samples=self.config.calibration_config.num_samples, - dataset_config_name=self.config.calibration_config.dataset_config_name, - dataset_split=self.config.calibration_config.dataset_split, - preprocess_function=preprocess_function, - ) - - # Create the calibration configuration - # containing the parameters related to calibration. 
- calibration_config = AutoCalibrationConfig.minmax(calibration_dataset) - - # Perform the calibration step: - # computes the activations quantization ranges - calibration_tensors_range = quantizer.fit( - dataset=calibration_dataset, - calibration_config=calibration_config, - operators_to_quantize=quantization_config.operators_to_quantize, - ) - - quantizer.quantize( - save_dir=f"{tmpdirname}/quantized", - calibration_tensors_range=calibration_tensors_range, - quantization_config=quantization_config, - ) - self.delete_pretrained_model() - - LOGGER.info("\t+ Loading quantized model") - self.pretrained_model = self.ortmodel_class.from_pretrained( - model_id=f"{tmpdirname}/quantized", - session_options=self.config.session_options, - use_io_binding=self.config.use_io_binding, - provider=self.config.provider, - provider_options=self.config.provider_options, - ) - - def load_automodel_from_config(self) -> None: - with init_empty_weights(): - self.pretrained_model = self.automodel_class.from_config( - config=self.pretrained_config, - torch_dtype=self.config.torch_dtype, - trust_remote_code=self.hub_kwargs.get("trust_remote_code", False), - ) - self.pretrained_model.to_empty(device=self.device) - randomize_weights(self.pretrained_model) - - def load_automodel_from_pretrained(self) -> None: - with self.device: - self.pretrained_model = self.automodel_class.from_pretrained( - pretrained_model_name_or_path=self.model, - torch_dtype=self.config.torch_dtype, - **self.hub_kwargs, - ) - - def prepare_for_profiling(self, input_names: List[str]) -> None: - LOGGER.info("Preparing model for profiling") - LOGGER.info("\t+ Wrapping model inside profiler") - self.pretrained_model = ORTProfilingWrapper(self.pretrained_model) - - def forward(self, input: Dict[str, Tensor], **kwargs) -> "ModelOutput": - output = self.pretrained_model(**input, **kwargs) - - return output - - def generate(self, input: Dict[str, Tensor], **kwargs) -> "ModelOutput": - output = self.pretrained_model.generate(**input, **kwargs) - return output - - def train( - self, - training_dataset: "Dataset", - training_arguments: Dict[str, Any], - training_callbacks: List["TrainerCallback"], - training_data_collator: Callable, - ) -> "TrainerState": - LOGGER.info("\t+ Setting dataset format to `torch`.") - training_dataset.set_format( - type="torch", columns=list(training_dataset.features.keys()) - ) - - LOGGER.info( - "\t+ Wrapping training arguments with " - "optimum.onnxruntime.ORTTrainingArguments" - ) - training_arguments = ORTTrainingArguments(**training_arguments) - - LOGGER.info("\t+ Wrapping model with optimum.onnxruntime.ORTTrainer") - trainer = ORTTrainer( - model=self.pretrained_model, - args=training_arguments, - callbacks=training_callbacks, - train_dataset=training_dataset, - data_collator=training_data_collator, - ) - - LOGGER.info("\t+ Starting training") - trainer.train() - LOGGER.info("\t+ Training finished successfully") - trainer_state = trainer.state - - return trainer_state diff --git a/optimum_benchmark/backends/onnxruntime/__init__.py b/optimum_benchmark/backends/onnxruntime/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/optimum_benchmark/backends/onnxruntime/backend.py b/optimum_benchmark/backends/onnxruntime/backend.py new file mode 100644 index 000000000..a77ad8ad9 --- /dev/null +++ b/optimum_benchmark/backends/onnxruntime/backend.py @@ -0,0 +1,332 @@ +import os +from logging import getLogger +from tempfile import TemporaryDirectory +from typing import TYPE_CHECKING, Any, Callable, Dict, List + 
+import torch +from accelerate import init_empty_weights +from hydra.utils import get_class +from onnxruntime import SessionOptions +from optimum.onnxruntime import ( + ONNX_DECODER_NAME, + ONNX_DECODER_WITH_PAST_NAME, + ORTOptimizer, + ORTQuantizer, + ORTTrainer, + ORTTrainingArguments, +) +from optimum.onnxruntime.configuration import ( + AutoCalibrationConfig, + AutoOptimizationConfig, + AutoQuantizationConfig, + OptimizationConfig, + QuantizationConfig, +) + +if TYPE_CHECKING: + from datasets import Dataset + from transformers import TrainerCallback, TrainerState + +from ...profilers.ort_profiler import ORTProfilingWrapper +from ..base import Backend +from ..optimum_utils import main_export +from ..pytorch.utils import randomize_weights +from .config import ORTConfig +from .utils import TASKS_TO_ORTMODELS, TASKS_TO_ORTSD, format_quantization_config + +LOGGER = getLogger("onnxruntime") + + +class ORTBackend(Backend[ORTConfig]): + NAME: str = "onnxruntime" + + def __init__(self, model: str, task: str, device: str, hub_kwargs: Dict[str, Any]) -> None: + super().__init__(model, task, device, hub_kwargs) + self.validate_device() + self.validate_task() + + if self.is_diffusion_pipeline(): + self.ortmodel_class = get_class(TASKS_TO_ORTSD[self.task]) + elif self.task in TASKS_TO_ORTMODELS: + self.ortmodel_class = TASKS_TO_ORTMODELS[self.task] + + ortmodel_name = self.ortmodel_class.__name__ + LOGGER.info( + f"\t+ Inferred ORTModel class {ortmodel_name} for task {self.task} and model_type {self.model_type}" + ) + + def validate_device(self) -> None: + if self.device.type not in ["cpu", "cuda"]: + raise ValueError(f"ORTBackend only supports CPU and CUDA devices, got {self.device.type}") + + def validate_task(self) -> None: + if self.task not in TASKS_TO_ORTMODELS and self.task not in TASKS_TO_ORTSD: + raise NotImplementedError(f"ORTBackend does not support task {self.task}") + + def configure(self, config: ORTConfig) -> None: + super().configure(config) + + # Process torch dtype + self.torch_dtype = getattr(torch, self.config.torch_dtype) if self.config.torch_dtype is not None else None + + ###### Training with ORTModule ###### + # ort-training is basically a different package so we might need to separate these two backends in the future + if not self.config.use_inference_session: + if self.config.no_weights: + self.load_automodel_from_config() + else: + self.load_automodel_from_pretrained() + return + + ###### Inference with ORTModelForxxx ###### + # Inference session options + self.session_options = SessionOptions() + for key, value in self.config.session_options.items(): + setattr(self.session_options, key, value) + + # Exporting, optimizing, post-processing and quantizing with ORTModelForxxx + self.tmpdir = TemporaryDirectory() + + # Some statefulness to handle the different combinations of options + self.export = self.config.export + self.use_merged = self.config.use_merged + + if self.is_diffusion_pipeline(): + self.load_ortmodel() + # early exit because nothing of the following can be applied to diffusion pipelines + return + + if self.config.no_weights: + self.load_automodel_from_config() # creates dummy automodel + self.export_automodel() # exports automodel + self.export = False + else: + if self.config.export: + self.use_merged = False # merging is handled separately + self.load_automodel_from_pretrained() # creates automodel from pretrained + self.export_automodel() # exports automodel + self.export = False + + self.delete_pretrained_model() # deletes automodel + + if
self.config.auto_optimization or self.config.optimization: + self.optimize_onnx_files() + + if self.config.use_merged: + self.merge_onnx_files() + self.use_merged = True + + if self.config.auto_quantization or self.config.quantization: + self.quantize_onnx_files() + + self.load_ortmodel() + self.tmpdir.cleanup() + + def load_automodel_from_config(self) -> None: + LOGGER.info("\t+ Loading AutoModel from config") + with init_empty_weights(): + self.pretrained_model = self.automodel_class.from_config( + self.pretrained_config, + torch_dtype=self.torch_dtype, + trust_remote_code=self.hub_kwargs.get("trust_remote_code", False), + ) + self.pretrained_model.to_empty(device=self.device) + randomize_weights(self.pretrained_model) + + def load_automodel_from_pretrained(self) -> None: + LOGGER.info("\t+ Loading AutoModel from pretrained") + with self.device: + self.pretrained_model = self.automodel_class.from_pretrained( + self.model, + torch_dtype=self.torch_dtype, + **self.hub_kwargs, + ) + + def load_ortmodel(self) -> None: + LOGGER.info("\t+ Loading ORTModel") + self.pretrained_model = self.ortmodel_class.from_pretrained( + self.model, + export=self.export, + provider=self.config.provider, + session_options=self.session_options, + use_io_binding=self.config.use_io_binding, + provider_options=self.config.provider_options, + **self.ortmodel_kwargs, + **self.hub_kwargs, + ) + # exported or not, the onnx model is/was here + self.model = self.pretrained_model.model_save_dir + + @property + def ortmodel_kwargs(self) -> Dict[str, Any]: + if self.is_text_generation_model(): + return {"use_cache": self.config.use_cache, "use_merged": self.use_merged} + else: + return {} + + @property + def true_task(self) -> str: + return self.task + "-with-past" if self.config.use_cache and self.is_text_generation_model() else self.task + + def export_automodel(self) -> None: + LOGGER.info("\t+ Exporting AutoModel to ONNX") + exported_model_dir = f"{self.tmpdir.name}/exported_model" + self.merging_config, self.models_and_onnx_configs = main_export( + self.model, + output=exported_model_dir, + task=self.true_task, + device=self.device.type, + fp16=self.torch_dtype == torch.float16, + **self.hub_kwargs, + # we hijack the model instantiation and use our random weights model + model=self.pretrained_model, + ) + self.model = exported_model_dir + + def merge_onnx_files(self) -> None: + LOGGER.info("\t+ Post-processing the exported model") + self.merging_config.post_process_exported_models(self.model, self.models_and_onnx_configs, None) + + @property + def onnx_files_names(self): + assert os.path.isdir(self.model), f"{self.model} is not a directory" + return [file for file in os.listdir(self.model) if file.endswith(".onnx")] + + def optimize_onnx_files(self) -> None: + LOGGER.info("\t+ Attempting optimization") + optimized_model_path = f"{self.tmpdir.name}/optimized" + LOGGER.info("\t+ Processing optimization config") + if self.config.auto_optimization is not None: + optimization_config = AutoOptimizationConfig.with_optimization_level( + optimization_level=self.config.auto_optimization, **self.config.auto_optimization_config + ) + elif self.config.optimization: + optimization_config = OptimizationConfig(**self.config.optimization_config) + LOGGER.info("\t+ Creating optimizer") + optimizer = ORTOptimizer.from_pretrained(self.model, file_names=self.onnx_files_names) + LOGGER.info("\t+ Optimizing ORTModel") + optimizer.optimize( + optimization_config, + save_dir=optimized_model_path, + file_suffix="", + # TODO: add support for 
these + use_external_data_format=None, + one_external_file=True, + ) + self.model = optimized_model_path + + @property + def onnx_files_names_to_quantize(self): + assert os.path.isdir(self.model), f"{self.model} is not a directory" + if self.config.use_merged: + # we filter merging components since they're not used for inference + # this also allows for calibration of one merged component models (like gpt2) + return [ + model + for model in self.onnx_files_names + if model not in [ONNX_DECODER_NAME, ONNX_DECODER_WITH_PAST_NAME] + ] + else: + return self.onnx_files_names + + def quantize_onnx_files(self) -> None: + LOGGER.info("\t+ Attempting quantization") + quantized_model_path = f"{self.tmpdir.name}/quantized" + LOGGER.info("\t+ Processing quantization config") + if self.config.calibration and len(self.onnx_files_names_to_quantize) > 1: + raise NotImplementedError("Calibration is not supported for models with multiple components") + if self.config.auto_quantization is not None: + self.config.auto_quantization_config = format_quantization_config(self.config.auto_quantization_config) + auto_quantization_config_class = getattr(AutoQuantizationConfig, self.config.auto_quantization) + quantization_config = auto_quantization_config_class(**self.config.auto_quantization_config) + elif self.config.quantization: + self.config.quantization_config = format_quantization_config(self.config.quantization_config) + quantization_config = QuantizationConfig(**self.config.quantization_config) + LOGGER.info(f"\t+ Model has {len(self.onnx_files_names_to_quantize)} components to quantize") + if len(self.onnx_files_names_to_quantize) == 1: + LOGGER.info("\t+ Creating quantizer") + quantizer = ORTQuantizer.from_pretrained(self.model, file_name=self.onnx_files_names_to_quantize[0]) + if self.config.calibration: + LOGGER.info("\t+ Processing calibration config") + preprocess_class = get_class(self.config.calibration_config.pop("preprocess_class")) + self.config.calibration_config["preprocess_function"] = preprocess_class(model_name_or_path=self.model) + LOGGER.info("\t+ Loading calibration dataset") + calibration_dataset = quantizer.get_calibration_dataset(**self.config.calibration_config) + LOGGER.info("\t+ Creating calibration config") + calibration_config = AutoCalibrationConfig.minmax(calibration_dataset) + LOGGER.info("\t+ Fitting calibration tensors range") + calibration_tensors_range = quantizer.fit( + dataset=calibration_dataset, + calibration_config=calibration_config, + operators_to_quantize=quantization_config.operators_to_quantize, + use_gpu=self.device.type == "cuda", + # TODO: add support for these + batch_size=1, + use_external_data_format=False, + force_symmetric_range=False, + ) + else: + calibration_tensors_range = None + LOGGER.info("\t+ Quantizing model") + quantizer.quantize( + save_dir=quantized_model_path, + quantization_config=quantization_config, + calibration_tensors_range=calibration_tensors_range, + # TODO: add support for these + use_external_data_format=False, + preprocessor=None, + ) + else: + for onnx_file_name_to_quantize in self.onnx_files_names_to_quantize: + LOGGER.info(f"\t+ Creating quantizer for {onnx_file_name_to_quantize}") + quantizer = ORTQuantizer.from_pretrained(self.model, file_name=onnx_file_name_to_quantize) + LOGGER.info(f"\t+ Quantizing {onnx_file_name_to_quantize}") + quantizer.quantize( + save_dir=quantized_model_path, + quantization_config=quantization_config, + calibration_tensors_range=None, + file_suffix="", + # TODO: add support for these + 
use_external_data_format=False, + preprocessor=None, + ) + self.model = quantized_model_path + + def prepare_for_profiling(self, input_names: List[str]) -> None: + LOGGER.info("Preparing model for profiling") + LOGGER.info("\t+ Wrapping model inside profiler") + self.pretrained_model = ORTProfilingWrapper(self.pretrained_model) + + def train( + self, + training_dataset: "Dataset", + training_data_collator: Callable, + training_arguments: Dict[str, Any], + training_callbacks: List["TrainerCallback"], + ) -> "TrainerState": + LOGGER.info("\t+ Setting dataset format to `torch`") + training_dataset.set_format(type="torch", columns=list(training_dataset.features.keys())) + LOGGER.info("\t+ Wrapping training arguments with optimum.onnxruntime.ORTTrainingArguments") + training_arguments = ORTTrainingArguments(**training_arguments) + LOGGER.info("\t+ Wrapping model with optimum.onnxruntime.ORTTrainer") + trainer = ORTTrainer( + model=self.pretrained_model, + feature=self.task, + args=training_arguments, + data_collator=training_data_collator, + train_dataset=training_dataset, + callbacks=training_callbacks, + # TODO: add support for optimizers + optimizers=(None, None), + ) + LOGGER.info("\t+ Launching training") + trainer.train() + LOGGER.info("\t+ Training finished successfully") + trainer_state = trainer.state + + return trainer_state + + def clean(self) -> None: + super().clean() + if hasattr(self, "tmpdir"): + self.tmpdir.cleanup() diff --git a/optimum_benchmark/backends/onnxruntime/config.py b/optimum_benchmark/backends/onnxruntime/config.py new file mode 100644 index 000000000..decf5d482 --- /dev/null +++ b/optimum_benchmark/backends/onnxruntime/config.py @@ -0,0 +1,185 @@ +import importlib.metadata +import importlib.util +from dataclasses import dataclass, field +from typing import Any, Dict, Optional + +from omegaconf import OmegaConf + +from ..base import BackendConfig +from .utils import infer_device_id + + +def onnxruntime_version(): + try: + return "ort:" + importlib.metadata.version("onnxruntime") + except importlib.metadata.PackageNotFoundError: + try: + return "ort-gpu:" + importlib.metadata.version("onnxruntime-gpu") + except importlib.metadata.PackageNotFoundError: + return "ort:unknown" + + +OmegaConf.register_new_resolver( + "is_gpu", + lambda device: "cuda" in device.lower(), +) +OmegaConf.register_new_resolver( + "is_profiling", + lambda benchmark_name: benchmark_name == "profiling", +) +OmegaConf.register_new_resolver( + "infer_provider", + lambda device: "CPUExecutionProvider" if device == "cpu" else "CUDAExecutionProvider", +) +OmegaConf.register_new_resolver( + "infer_device_id", + lambda device: infer_device_id(device), +) +OmegaConf.register_new_resolver( + "onnxruntime_version", + lambda: onnxruntime_version(), +) + +OPTIMIZATION_CONFIG = { + "optimization_level": 1, # 0, 1, 2, 99 + "optimize_for_gpu": "${is_gpu:${device}}", + "fp16": False, + "enable_transformers_specific_optimizations": True, + "enable_gelu_approximation": False, + "disable_gelu_fusion": False, + "disable_layer_norm_fusion": False, + "disable_attention_fusion": False, + "disable_skip_layer_norm_fusion": True, + "disable_bias_skip_layer_norm_fusion": False, + "disable_bias_gelu_fusion": False, + "use_mask_index": False, + "no_attention_mask": False, + "disable_embed_layer_norm_fusion": True, + "disable_shape_inference": False, + "use_multi_head_attention": False, + "enable_gemm_fast_gelu_fusion": False, + "use_raw_attention_mask": False, + "disable_group_norm_fusion": True, + "disable_packed_kv": 
True, +} + +AUTO_OPTIMIZATION_CONFIG = { + "for_gpu": "${is_gpu:${device}}", + # full auto optimization config depends on the level so we keep it minimal +} + +QUANTIZATION_CONFIG = { + "is_static": False, + "format": "QOperator", # QOperator, QDQ + "mode": "IntegerOps", # QLinearOps, IntegerOps + "activations_dtype": "QUInt8", # QInt8, QUInt8 + "activations_symmetric": False, + "weights_dtype": "QInt8", # QInt8, QUInt8 + "weights_symmetric": True, + "per_channel": False, + "reduce_range": False, + "operators_to_quantize": [ + "MatMul", + "Add", + ], +} + +AUTO_QUANTIZATION_CONFIG = { + "is_static": False, + # full auto quantization config depends on the strategy so we keep it minimal +} + +CALIBRATION_CONFIG = { + "dataset_name": "glue", + "num_samples": 300, + "dataset_config_name": "sst2", + "dataset_split": "train", + "preprocess_batch": True, + "preprocess_class": "optimum_benchmark.preprocessors.glue.GluePreprocessor", +} +PROVIDER_OPTIONS = {"device_id": "${infer_device_id:${device}}"} +SESSION_OPTIONS = {"enable_profiling": "${is_profiling:${benchmark.name}}"} + + +@dataclass +class ORTConfig(BackendConfig): + name: str = "onnxruntime" + version: str = "${onnxruntime_version:}" + _target_: str = "optimum_benchmark.backends.onnxruntime.backend.ORTBackend" + + no_weights: bool = False + + # export options + export: bool = True + use_cache: bool = True + use_merged: bool = False + torch_dtype: Optional[str] = None + + # provider options + provider: str = "${infer_provider:${device}}" + device_id: Optional[int] = "${oc.deprecated:backend.provider_options.device_id}" + provider_options: Dict[str, Any] = field(default_factory=lambda: PROVIDER_OPTIONS) + + # inference options + use_io_binding: bool = "${is_gpu:${device}}" + enable_profiling: bool = "${oc.deprecated:backend.session_options.enable_profiling}" + session_options: Dict[str, Any] = field(default_factory=lambda: SESSION_OPTIONS) + + # optimization options + optimization: bool = False + optimization_config: Dict[str, Any] = field(default_factory=dict) + + # quantization options + quantization: bool = False + quantization_config: Dict[str, Any] = field(default_factory=dict) + + # calibration options + calibration: bool = False + calibration_config: Dict[str, Any] = field(default_factory=dict) + + # null, O1, O2, O3, O4 + auto_optimization: Optional[str] = None + auto_optimization_config: Dict[str, Any] = field(default_factory=dict) + + # null, arm64, avx2, avx512, avx512_vnni, tensorrt + auto_quantization: Optional[str] = None + auto_quantization_config: Dict[str, Any] = field(default_factory=dict) + + # ort-training is basically a different package so we might need to seperate these two backends in the future + use_inference_session: bool = "${is_inference:${benchmark.name}}" + + def __post_init__(self): + if not self.no_weights and not self.export and self.torch_dtype is not None: + raise NotImplementedError("Can't convert an exported model's weights to a different dtype.") + + if self.optimization: + self.optimization_config = OmegaConf.to_container( + OmegaConf.merge(OPTIMIZATION_CONFIG, self.optimization_config) + ) + if self.quantization: + self.quantization_config = OmegaConf.to_container( + OmegaConf.merge(QUANTIZATION_CONFIG, self.quantization_config) + ) + # raise ValueError if the quantization is static but calibration is not enabled + if self.quantization_config["is_static"] and not self.calibration: + raise ValueError( + "Quantization is static but calibration is not enabled. 
Please enable calibration or disable static quantization." + ) + + if self.auto_optimization is not None: + self.auto_optimization_config = OmegaConf.to_container( + OmegaConf.merge(AUTO_OPTIMIZATION_CONFIG, self.auto_optimization_config) + ) + if self.auto_quantization is not None: + self.auto_quantization_config = OmegaConf.to_container( + OmegaConf.merge(AUTO_QUANTIZATION_CONFIG, self.auto_quantization_config) + ) + if self.auto_quantization_config["is_static"] and not self.calibration: + raise ValueError( + "Quantization is static but calibration is not enabled. Please enable calibration or disable static quantization." + ) + + if self.calibration: + self.calibration_config = OmegaConf.to_container( + OmegaConf.merge(CALIBRATION_CONFIG, self.calibration_config) + ) diff --git a/optimum_benchmark/backends/onnxruntime/utils.py b/optimum_benchmark/backends/onnxruntime/utils.py new file mode 100644 index 000000000..be63fef8a --- /dev/null +++ b/optimum_benchmark/backends/onnxruntime/utils.py @@ -0,0 +1,40 @@ +from typing import Any, Dict + +from onnxruntime.quantization import QuantFormat, QuantizationMode, QuantType +from optimum.pipelines import ORT_SUPPORTED_TASKS + +TASKS_TO_ORTSD = { + "stable-diffusion": "optimum.onnxruntime.ORTStableDiffusionPipeline", + "stable-diffusion-xl": "optimum.onnxruntime.ORTStableDiffusionXLPipeline", +} + +TASKS_TO_ORTMODELS = {task: task_dict["class"][0] for task, task_dict in ORT_SUPPORTED_TASKS.items()} + + +def infer_device_id(device: str) -> int: + """Infer the device id from the given device string.""" + if device == "cuda": + # torch.cuda.current_device() will always return 0 + # unless torch.cuda.set_device() is called somewhere + return 0 + elif "cuda" in device: + return int(device.split(":")[1]) + elif device == "cpu": + return -1 + else: + raise ValueError(f"Unknown device: {device}") + + +def format_quantization_config(quantization_config: Dict[str, Any]) -> Dict[str, Any]: + """Format the quantization dictionary for onnxruntime.""" + # the conditionals are here because some quantization strategies don't have all the options + if quantization_config.get("format", None) is not None: + quantization_config["format"] = QuantFormat.from_string(quantization_config["format"]) + if quantization_config.get("mode", None) is not None: + quantization_config["mode"] = QuantizationMode.from_string(quantization_config["mode"]) + if quantization_config.get("activations_dtype", None) is not None: + quantization_config["activations_dtype"] = QuantType.from_string(quantization_config["activations_dtype"]) + if quantization_config.get("weights_dtype", None) is not None: + quantization_config["weights_dtype"] = QuantType.from_string(quantization_config["weights_dtype"]) + + return quantization_config diff --git a/optimum_benchmark/backends/openvino.py b/optimum_benchmark/backends/openvino.py deleted file mode 100644 index 6e83ed756..000000000 --- a/optimum_benchmark/backends/openvino.py +++ /dev/null @@ -1,190 +0,0 @@ -from typing import Dict, Optional, Any, TYPE_CHECKING -from tempfile import TemporaryDirectory -from dataclasses import dataclass -from logging import getLogger - - -import torch -import inspect -from torch import Tensor -from omegaconf import OmegaConf -from hydra.utils import get_class -from optimum.intel.openvino.utils import _HEAD_TO_AUTOMODELS -from openvino.runtime import __version__ as openvino_version -from optimum.intel import OVConfig as OVQuantizationConfig, OVQuantizer - -if TYPE_CHECKING: - from transformers.modeling_outputs import ModelOutput -
- -from .base import Backend, BackendConfig -from .utils.openvino_utils import ( - DEFAULT_QUANTIZATION_CONFIG, - DEFAULT_CALIBRATION_CONFIG, -) - - -LOGGER = getLogger("openvino") - - -@dataclass -class OVConfig(BackendConfig): - name: str = "openvino" - version: str = openvino_version - _target_: str = "optimum_benchmark.backends.openvino.OVBackend" - - # export options - export: bool = True - no_weights: bool = False - use_merged: Optional[bool] = None - torch_dtype: Optional[str] = None - - # compiling options - reshape: bool = False - half: bool = False - - # quantization options - quantization: bool = False - quantization_config: Optional[Dict[str, Any]] = None - - # calibration options - calibration: bool = True - calibration_config: Optional[Dict[str, Any]] = None - - def __post_init__(self): - assert self.torch_dtype is None or self.torch_dtype == "float32", ( - "Only float32 is supported for torch_dtype in openvino backend. " - f"Got {self.torch_dtype}" - ) - - if self.quantization: - self.quantization_config = OmegaConf.merge( - self.quantization_config or {}, - DEFAULT_QUANTIZATION_CONFIG, - ) - - if self.calibration: - self.calibration_config = OmegaConf.merge( - self.calibration_config or {}, - DEFAULT_CALIBRATION_CONFIG, - ) - - -class OVBackend(Backend): - name: str = "openvino" - config: OVConfig - - def __init__( - self, model: str, task: str, device: str, hub_kwargs: Dict[str, Any] - ) -> None: - super().__init__(model, task, device, hub_kwargs) - self.device = torch.device(device) - - self.ovmodel_class = get_class( - f"optimum.intel.openvino.{_HEAD_TO_AUTOMODELS[self.task]}" - ) - - LOGGER.info( - f"\t+ Infered OVModel class {self.ovmodel_class.__name__} " - f"for task {self.task} and model_type {self.model_type}" - ) - - def configure(self, config: OVConfig) -> None: - super().configure(config) - - # Set torch dtype - self.config.torch_dtype = ( - getattr(torch, self.config.torch_dtype) - if self.config.torch_dtype is not None - else None - ) - - if self.config.quantization: - self.config.quantization_config = OVQuantizationConfig( - **self.config.quantization_config, - ) - - with TemporaryDirectory() as tmpdirname: - if self.config.no_weights: - raise NotImplementedError( - "no_weights is not supported for openvino backend" - ) - else: - self.load_model_from_pretrained() - - if self.config.quantization: - self.quantize(tmpdirname) - - def load_model_from_pretrained(self) -> None: - self.pretrained_model = self.ovmodel_class.from_pretrained( - model_id=self.model, - use_merged=self.config.use_merged, - export=self.config.export, - **self.hub_kwargs, - ) - - def quantize(self, tmpdirname: str) -> None: - LOGGER.info("\t+ Attempting quantization") - - model = self.automodel_class.from_pretrained(self.model, **self.hub_kwargs) - quantizer = OVQuantizer.from_pretrained(model) - - preprocess_class = get_class(self.config.calibration_config.preprocess_class) - preprocess_function = preprocess_class(model_name_or_path=self.model) - - calibration_dataset = quantizer.get_calibration_dataset( - dataset_name=self.config.calibration_config.dataset_name, - num_samples=self.config.calibration_config.num_samples, - dataset_config_name=self.config.calibration_config.dataset_config_name, - dataset_split=self.config.calibration_config.dataset_split, - preprocess_function=preprocess_function, - ) - - quantizer.quantize( - calibration_dataset=calibration_dataset, - save_directory=f"{tmpdirname}/quantized", - quantization_config=self.config.quantization_config, - # defaults - 
batch_size=1, - data_collator=None, - remove_unused_columns=True, - weights_only=False, - ) - self.delete_pretrained_model() - - LOGGER.info("\t+ Loading quantized model") - self.pretrained_model = self.ovmodel_class.from_pretrained( - model_id=f"{tmpdirname}/quantized", - use_merged=self.config.use_merged, - ) - - def prepare_for_inference(self, input_shapes: Dict[str, int]) -> None: - if self.config.reshape: - static_shapes = { - key: value - for key, value in input_shapes.items() - if key in inspect.getfullargspec(self.pretrained_model.reshape).args - } - LOGGER.info(f"\t+ Reshaping model with static shapes: {static_shapes}") - self.pretrained_model.reshape(**static_shapes) - - if self.config.half: - LOGGER.info("\t+ Converting model to half precision") - self.pretrained_model.half() - - if self.config.reshape or self.config.half: - LOGGER.info("\t+ Compiling model") - self.pretrained_model.compile() - - def forward(self, input: Dict[str, Tensor], **kwargs) -> "ModelOutput": - output = self.pretrained_model(**input, **kwargs) - - return output - - def generate(self, input: Dict[str, Tensor], **kwargs) -> "ModelOutput": - output = self.pretrained_model.generate(**input, **kwargs) - - return output - - def train(self, **kwargs) -> None: - pass diff --git a/optimum_benchmark/backends/openvino/__init__.py b/optimum_benchmark/backends/openvino/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/optimum_benchmark/backends/openvino/backend.py b/optimum_benchmark/backends/openvino/backend.py new file mode 100644 index 000000000..28b354c54 --- /dev/null +++ b/optimum_benchmark/backends/openvino/backend.py @@ -0,0 +1,119 @@ +import inspect +from logging import getLogger +from tempfile import TemporaryDirectory +from typing import Any, Dict + +from hydra.utils import get_class +from optimum.intel.openvino import OVConfig as OVQuantizationConfig # naming conflict +from optimum.intel.openvino import OVQuantizer + +from ..base import Backend +from .config import OVConfig +from .utils import TASKS_TO_OVMODEL + +LOGGER = getLogger("openvino") + + +class OVBackend(Backend[OVConfig]): + NAME: str = "openvino" + + def __init__(self, model: str, task: str, device: str, hub_kwargs: Dict[str, Any]) -> None: + super().__init__(model, task, device, hub_kwargs) + self.validate_device() + self.validate_task() + + self.ovmodel_class = get_class(TASKS_TO_OVMODEL[self.task]) + ortmodel_name = self.ovmodel_class.__name__ + LOGGER.info(f"\t+ Infered OVModel class {ortmodel_name} for task {self.task} and model_type {self.model_type}") + + def validate_task(self) -> None: + if self.task not in TASKS_TO_OVMODEL: + raise NotImplementedError(f"OVBackend does not support task {self.task}") + + def validate_device(self) -> None: + if self.device.type != "cpu": + raise ValueError(f"OVBackend only supports CPU devices, got {self.device.type}") + + def configure(self, config: OVConfig) -> None: + super().configure(config) + + self.tmpdir = TemporaryDirectory() + + if self.config.quantization: + self.load_automodel() + self.quantize_automodel() + self.delete_pretrained_model() # deletes automodel + self.export = False # quantized model is already exported + else: + self.export = self.config.export # to not change the config's values + + self.load_ovmodel() + self.tmpdir.cleanup() + + def load_automodel(self) -> None: + self.pretrained_model = self.automodel_class.from_pretrained(self.model, **self.hub_kwargs) + + @property + def ovmodel_kwargs(self) -> Dict[str, Any]: + if self.is_text_generation_model(): + 
return {"use_cache": self.config.use_cache, "use_merged": self.config.use_merged} + else: + return {} + + def load_ovmodel(self) -> None: + self.pretrained_model = self.ovmodel_class.from_pretrained( + self.model, + export=self.export, + **self.ovmodel_kwargs, + **self.hub_kwargs, + ) + + def quantize_automodel(self) -> None: + LOGGER.info("\t+ Attempting quantization") + quantized_model_path = f"{self.tmpdir.name}/quantized" + LOGGER.info("\t+ Processing quantization config") + quantization_config = OVQuantizationConfig(**self.config.quantization_config) + LOGGER.info("\t+ Creating quantizer") + quantizer = OVQuantizer.from_pretrained(self.pretrained_model, task=self.task, seed=self.config.seed) + LOGGER.info("\t+ Processing calibration config") + calibration_config = self.config.calibration_config.copy() + preprocess_class = get_class(calibration_config.pop("preprocess_class")) + calibration_config["preprocess_function"] = preprocess_class(model_name_or_path=self.model) + LOGGER.info("\t+ Loading calibration dataset") + calibration_dataset = quantizer.get_calibration_dataset(**calibration_config) + LOGGER.info("\t+ Quantizing model") + quantizer.quantize( + quantization_config=quantization_config, + save_directory=quantized_model_path, + calibration_dataset=calibration_dataset, + # TODO: add support for these + remove_unused_columns=True, + data_collator=None, + weights_only=False, + file_name=None, + batch_size=1, + ) + self.model = quantized_model_path + + def prepare_for_inference(self, input_shapes: Dict[str, int]) -> None: + if self.config.reshape: + static_shapes = { + key: value + for key, value in input_shapes.items() + if key in inspect.getfullargspec(self.pretrained_model.reshape).args + } + LOGGER.info(f"\t+ Reshaping model with static shapes: {static_shapes}") + self.pretrained_model.reshape(**static_shapes) + + if self.config.half: + LOGGER.info("\t+ Converting model to half precision") + self.pretrained_model.half() + + if self.config.reshape or self.config.half: + LOGGER.info("\t+ Compiling model") + self.pretrained_model.compile() + + def clean(self) -> None: + super().clean() + if hasattr(self, "tmpdir"): + self.tmpdir.cleanup() diff --git a/optimum_benchmark/backends/openvino/config.py b/optimum_benchmark/backends/openvino/config.py new file mode 100644 index 000000000..e54c2aefd --- /dev/null +++ b/optimum_benchmark/backends/openvino/config.py @@ -0,0 +1,64 @@ +import importlib.metadata +from dataclasses import dataclass, field +from typing import Any, Dict + +from omegaconf import OmegaConf + +from ..base import BackendConfig + +OmegaConf.register_new_resolver( + "openvino_version", + lambda: importlib.metadata.version("openvino"), +) + +# https://github.com/huggingface/optimum-intel/blob/main/optimum/intel/openvino/configuration.py#L81 +QUANTIZATION_CONFIG = { + "compression": None, + "input_info": None, + "save_onnx_model": False, +} + +CALIBRATION_CONFIG = { + "dataset_name": "glue", + "num_samples": 300, + "dataset_config_name": "sst2", + "dataset_split": "train", + "preprocess_batch": True, + "preprocess_class": "optimum_benchmark.preprocessors.glue.GluePreprocessor", +} + + +@dataclass +class OVConfig(BackendConfig): + name: str = "openvino" + version: str = "${openvino_version:}" + _target_: str = "optimum_benchmark.backends.openvino.backend.OVBackend" + + # export options + export: bool = True + use_cache: bool = True + use_merged: bool = False + + # compiling options + reshape: bool = False + half: bool = False + + # quantization options + quantization: bool 
= False + quantization_config: Dict[str, Any] = field(default_factory=dict) + + # calibration options + calibration: bool = False + calibration_config: Dict[str, Any] = field(default_factory=dict) + + def __post_init__(self): + if self.quantization: + self.quantization_config = OmegaConf.to_container( + OmegaConf.merge(QUANTIZATION_CONFIG, self.quantization_config) + ) + if not self.calibration: + raise ValueError("OpenVINO quantization requires enabling calibration.") + else: + self.calibration_config = OmegaConf.to_container( + OmegaConf.merge(CALIBRATION_CONFIG, self.calibration_config) + ) diff --git a/optimum_benchmark/backends/openvino/utils.py b/optimum_benchmark/backends/openvino/utils.py new file mode 100644 index 000000000..4c13891e5 --- /dev/null +++ b/optimum_benchmark/backends/openvino/utils.py @@ -0,0 +1,3 @@ +from optimum.intel.openvino.utils import _HEAD_TO_AUTOMODELS + +TASKS_TO_OVMODEL = {task: f"optimum.intel.openvino.{ovmodel}" for task, ovmodel in _HEAD_TO_AUTOMODELS.items()} diff --git a/optimum_benchmark/backends/utils/optimum_utils.py b/optimum_benchmark/backends/optimum_utils.py similarity index 67% rename from optimum_benchmark/backends/utils/optimum_utils.py rename to optimum_benchmark/backends/optimum_utils.py index a558f1659..a064cba08 100644 --- a/optimum_benchmark/backends/utils/optimum_utils.py +++ b/optimum_benchmark/backends/optimum_utils.py @@ -1,31 +1,29 @@ -from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Union -from pathlib import Path import os +from pathlib import Path +from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Union import torch from optimum.exporters.onnx.__main__ import ( - logger, - TasksManager, - OnnxConfigWithPast, - _get_submodels_and_onnx_configs, - maybe_save_preprocessors, - validate_models_outputs, - is_torch_available, - export_models, - AutoTokenizer, DEFAULT_DUMMY_SHAPES, ONNX_WEIGHTS_NAME, - UNPICKABLE_ARCHS, + # UNPICKABLE_ARCHS, + # AtolError, + AutoTokenizer, + OnnxConfigWithPast, + # OutputMatchError, RequestsConnectionError, - OutputMatchError, - ShapeError, - AtolError, + # ShapeError, + TasksManager, + _get_submodels_and_onnx_configs, + export_models, + is_torch_available, + logger, + maybe_save_preprocessors, ) - if TYPE_CHECKING: - from transformers import PreTrainedModel from optimum.exporters.onnx import OnnxConfig + from transformers import PreTrainedModel # rewrite of the main_export function from optimum.exporters.onnx.__main__ @@ -39,7 +37,7 @@ def main_export( fp16: Optional[bool] = False, optimize: Optional[str] = None, monolith: bool = False, - no_post_process: bool = False, + # no_post_process: bool = False, framework: Optional[str] = None, atol: Optional[float] = None, cache_dir: Optional[str] = None, @@ -51,11 +49,11 @@ def main_export( local_files_only: bool = False, use_auth_token: Optional[Union[bool, str]] = None, for_ort: bool = False, - do_validation: bool = True, + # do_validation: bool = True, model_kwargs: Optional[Dict[str, Any]] = None, custom_onnx_configs: Optional[Dict[str, "OnnxConfig"]] = None, fn_get_submodels: Optional[Callable] = None, - use_subprocess: bool = False, + # use_subprocess: bool = False, ######################################## model: Optional["PreTrainedModel"] = None, ######################################## @@ -88,17 +86,13 @@ def main_export( original_task = task task = TasksManager.map_from_synonym(task) - framework = TasksManager.determine_framework( - model_name_or_path, subfolder=subfolder, framework=framework - ) + framework = 
TasksManager.determine_framework(model_name_or_path, subfolder=subfolder, framework=framework) # get the shapes to be used to generate dummy inputs input_shapes = {} for input_name in DEFAULT_DUMMY_SHAPES.keys(): input_shapes[input_name] = ( - kwargs_shapes[input_name] - if input_name in kwargs_shapes - else DEFAULT_DUMMY_SHAPES[input_name] + kwargs_shapes[input_name] if input_name in kwargs_shapes else DEFAULT_DUMMY_SHAPES[input_name] ) torch_dtype = None if fp16 is False else torch.float16 @@ -133,11 +127,7 @@ def main_export( custom_architecture = False is_stable_diffusion = "stable-diffusion" in task - model_type = ( - "stable-diffusion" - if is_stable_diffusion - else model.config.model_type.replace("_", "-") - ) + model_type = "stable-diffusion" if is_stable_diffusion else model.config.model_type.replace("_", "-") if not is_stable_diffusion: if model_type in TasksManager._UNSUPPORTED_CLI_MODEL_TYPE: @@ -145,9 +135,9 @@ def main_export( f"{model_type} is not supported yet. Only {TasksManager._SUPPORTED_CLI_MODEL_TYPE} are supported. " f"If you want to support {model_type} please propose a PR or open up an issue." ) - if model.config.model_type.replace( - "-", "_" - ) not in TasksManager.get_supported_model_type_for_task(task, exporter="onnx"): + if model.config.model_type.replace("-", "_") not in TasksManager.get_supported_model_type_for_task( + task, exporter="onnx" + ): custom_architecture = True # TODO: support onnx_config.py in the model repo @@ -164,12 +154,9 @@ def main_export( if ( not custom_architecture and not is_stable_diffusion - and task + "-with-past" - in TasksManager.get_supported_tasks_for_model_type(model_type, "onnx") + and task + "-with-past" in TasksManager.get_supported_tasks_for_model_type(model_type, "onnx") ): - if ( - original_task == "auto" - ): # Make -with-past the default if --task was not explicitely specified + if original_task == "auto": # Make -with-past the default if --task was not explicitely specified task = task + "-with-past" else: logger.info( @@ -197,9 +184,7 @@ def main_export( model=model, task=task, monolith=monolith, - custom_onnx_configs=custom_onnx_configs - if custom_onnx_configs is not None - else {}, + custom_onnx_configs=custom_onnx_configs if custom_onnx_configs is not None else {}, custom_architecture=custom_architecture, fn_get_submodels=fn_get_submodels, ) @@ -257,15 +242,10 @@ def main_export( subcomponent = models_and_onnx_configs[model_name][0] if hasattr(subcomponent, "save_config"): subcomponent.save_config(output / model_name) - elif hasattr(subcomponent, "config") and hasattr( - subcomponent.config, "save_pretrained" - ): + elif hasattr(subcomponent, "config") and hasattr(subcomponent.config, "save_pretrained"): subcomponent.config.save_pretrained(output / model_name) - onnx_files_subpaths = [ - os.path.join(name_dir, ONNX_WEIGHTS_NAME) - for name_dir in models_and_onnx_configs - ] + onnx_files_subpaths = [os.path.join(name_dir, ONNX_WEIGHTS_NAME) for name_dir in models_and_onnx_configs] # Saving the additional components needed to perform inference. 
model.scheduler.save_pretrained(output.joinpath("scheduler")) @@ -294,77 +274,83 @@ def main_export( dtype="fp16" if fp16 is True else None, model_kwargs=model_kwargs, ) - - if optimize is not None: - from optimum.onnxruntime.configuration import AutoOptimizationConfig - from optimum.onnxruntime import ORTOptimizer - - if onnx_files_subpaths is None: - onnx_files_subpaths = [ - key + ".onnx" for key in models_and_onnx_configs.keys() - ] - optimizer = ORTOptimizer.from_pretrained(output, file_names=onnx_files_subpaths) - - optimization_config = AutoOptimizationConfig.with_optimization_level( - optimization_level=optimize - ) - - optimization_config.disable_shape_inference = True - optimizer.optimize( - save_dir=output, optimization_config=optimization_config, file_suffix="" - ) - - # Optionally post process the obtained ONNX file(s), for example to merge the decoder / decoder with past if any - # TODO: treating stable diffusion separately is quite ugly - if not no_post_process and not is_stable_diffusion: - try: - logger.info("Post-processing the exported models...") - ( - models_and_onnx_configs, - onnx_files_subpaths, - ) = onnx_config.post_process_exported_models( - output, models_and_onnx_configs, onnx_files_subpaths - ) - except Exception as e: - raise Exception( - f"The post-processing of the ONNX export failed. The export can still be performed by passing the option --no-post-process. Detailed error: {e}" - ) - - if is_stable_diffusion: - use_subprocess = False # TODO: fix Can't pickle local object 'get_stable_diffusion_models_for_export..' - elif model.config.model_type in UNPICKABLE_ARCHS: - # Pickling is bugged for nn.utils.weight_norm: https://github.com/pytorch/pytorch/issues/102983 - # TODO: fix "Cowardly refusing to serialize non-leaf tensor" error for wav2vec2-conformer - use_subprocess = False - - if do_validation is True: - try: - validate_models_outputs( - models_and_onnx_configs=models_and_onnx_configs, - onnx_named_outputs=onnx_outputs, - atol=atol, - output_dir=output, - onnx_files_subpaths=onnx_files_subpaths, - input_shapes=input_shapes, - device=device, - dtype=torch_dtype, - use_subprocess=use_subprocess, - model_kwargs=model_kwargs, - ) - logger.info( - f"The ONNX export succeeded and the exported model was saved at: {output.as_posix()}" - ) - except ShapeError as e: - raise e - except AtolError as e: - logger.warning( - f"The ONNX export succeeded with the warning: {e}.\n The exported model was saved at: {output.as_posix()}" - ) - except OutputMatchError as e: - logger.warning( - f"The ONNX export succeeded with the warning: {e}.\n The exported model was saved at: {output.as_posix()}" - ) - except Exception as e: - raise Exception( - f"An error occured during validation, but the model was saved nonetheless at {output.as_posix()}. Detailed error: {e}." 
- ) + # for the post processing later we don't wanna keep models + if len(models_and_onnx_configs) == 2: + models_and_onnx_configs = { + "decoder_model": ("dummy_decoder_model_object", models_and_onnx_configs["decoder_model"][1]), + "decoder_with_past_model": ( + "dummy_decoder_with_past_model_object", + models_and_onnx_configs["decoder_with_past_model"][1], + ), + } + else: + models_and_onnx_configs = { + "model": ("dummy_model", models_and_onnx_configs["model"][1]), + } + + return onnx_config, models_and_onnx_configs + + # if optimize is not None: + # from optimum.onnxruntime import ORTOptimizer + # from optimum.onnxruntime.configuration import AutoOptimizationConfig + + # if onnx_files_subpaths is None: + # onnx_files_subpaths = [key + ".onnx" for key in models_and_onnx_configs.keys()] + # optimizer = ORTOptimizer.from_pretrained(output, file_names=onnx_files_subpaths) + + # optimization_config = AutoOptimizationConfig.with_optimization_level(optimization_level=optimize) + + # optimization_config.disable_shape_inference = True + # optimizer.optimize(save_dir=output, optimization_config=optimization_config, file_suffix="") + + # # Optionally post process the obtained ONNX file(s), for example to merge the decoder / decoder with past if any + # # TODO: treating stable diffusion separately is quite ugly + # if not no_post_process and not is_stable_diffusion: + # try: + # logger.info("Post-processing the exported models...") + # (models_and_onnx_configs, onnx_files_subpaths) = onnx_config.post_process_exported_models( + # output, models_and_onnx_configs, onnx_files_subpaths + # ) + # except Exception as e: + # raise Exception( + # f"The post-processing of the ONNX export failed. The export can still be performed by passing the option --no-post-process. Detailed error: {e}" + # ) + + # if is_stable_diffusion: + # use_subprocess = ( + # False # TODO: fix Can't pickle local object 'get_stable_diffusion_models_for_export..' + # ) + # elif model.config.model_type in UNPICKABLE_ARCHS: + # # Pickling is bugged for nn.utils.weight_norm: https://github.com/pytorch/pytorch/issues/102983 + # # TODO: fix "Cowardly refusing to serialize non-leaf tensor" error for wav2vec2-conformer + # use_subprocess = False + + # if do_validation is True: + # try: + # validate_models_outputs( + # models_and_onnx_configs=models_and_onnx_configs, + # onnx_named_outputs=onnx_outputs, + # atol=atol, + # output_dir=output, + # onnx_files_subpaths=onnx_files_subpaths, + # input_shapes=input_shapes, + # device=device, + # dtype=torch_dtype, + # use_subprocess=use_subprocess, + # model_kwargs=model_kwargs, + # ) + # logger.info(f"The ONNX export succeeded and the exported model was saved at: {output.as_posix()}") + # except ShapeError as e: + # raise e + # except AtolError as e: + # logger.warning( + # f"The ONNX export succeeded with the warning: {e}.\n The exported model was saved at: {output.as_posix()}" + # ) + # except OutputMatchError as e: + # logger.warning( + # f"The ONNX export succeeded with the warning: {e}.\n The exported model was saved at: {output.as_posix()}" + # ) + # except Exception as e: + # raise Exception( + # f"An error occured during validation, but the model was saved nonetheless at {output.as_posix()}. Detailed error: {e}." 
+ # ) diff --git a/optimum_benchmark/backends/pytorch.py b/optimum_benchmark/backends/pytorch.py deleted file mode 100644 index b6c84f181..000000000 --- a/optimum_benchmark/backends/pytorch.py +++ /dev/null @@ -1,451 +0,0 @@ -from typing import Any, Callable, Dict, List, Optional, TYPE_CHECKING -from dataclasses import dataclass -from logging import getLogger -import os -import gc - - -import torch -from torch import Tensor -from accelerate import init_empty_weights -from omegaconf import DictConfig, OmegaConf -from torch import __version__ as torch_version -from transformers.utils.fx import symbolic_trace -from transformers import Trainer, TrainingArguments -from optimum.bettertransformer import BetterTransformer -from transformers import BitsAndBytesConfig, GPTQConfig -from torch.distributed.elastic.multiprocessing.errors import record -from torch.distributed.launcher.api import elastic_launch, LaunchConfig - - -if TYPE_CHECKING: - from datasets import Dataset - from transformers.utils import ModelOutput - from transformers import TrainerState, TrainerCallback - - -from .base import Backend, BackendConfig -from ..profilers.fx_profiler import FXProfilingWrapper -from .utils.pytorch_utils import ( - DEFAULT_COMPILE_CONFIG, - DEFAULT_DDP_CONFIG, - randomize_weights, - get_worker_logger, -) - - -# bachend logger -LOGGER = getLogger("pytorch") - -# backend resolvers -OmegaConf.register_new_resolver( - "is_inference", lambda benchmark_name: benchmark_name == "inference" -) - - -@dataclass -class PyTorchConfig(BackendConfig): - name: str = "pytorch" - version: str = torch_version - _target_: str = "optimum_benchmark.backends.pytorch.PyTorchBackend" - - # load options - no_weights: bool = False - device_map: Optional[str] = None - torch_dtype: Optional[str] = None - - # quantization options - quantization_strategy: Optional[str] = None - quantization_config: Optional[Dict[str, Any]] = None - - # optimization options - bettertransformer: bool = False - - # compilation options - torch_compile: bool = False - torch_compile_kwargs: Optional[Dict] = None - - # amp options - amp_autocast: bool = False - amp_dtype: Optional[str] = None - - # inference options - disable_grad: bool = "${is_inference:${benchmark.name}}" # type: ignore - eval_mode: bool = "${is_inference:${benchmark.name}}" # type: ignore - - # training options - use_ddp: bool = False - ddp_config: Optional[Dict[str, Any]] = None - - def __post_init__(self): - """ - Here we perform checks and transformations on the config. - But we never modify the types of the config values. - """ - - CUDA_VISIBLE_DEVICES = os.environ.get("CUDA_VISIBLE_DEVICES", None) - - if self.torch_compile: - self.torch_compile_kwargs = OmegaConf.merge( - self.torch_compile_kwargs - if self.torch_compile_kwargs is not None - else {}, - DEFAULT_COMPILE_CONFIG, - ) - - if self.device_map is not None: - assert self.device_map in ["auto", "sequential"], ( - "`device_map` must be one of ['auto', 'sequential']. " - "are supported in Optimum-Bnechmark. " - f"Got {type(self.device_map)} instead." - ) - assert ( - CUDA_VISIBLE_DEVICES is not None - ), "`device_map` can only be used when CUDA_VISIBLE_DEVICES is set." - - if self.torch_dtype is not None: - assert self.torch_dtype in ["bfloat16", "float16", "float32", "auto"], ( - "`torch_dtype` must be one of ['bfloat16', 'float16', 'float32', " - f"'auto']. Got {self.torch_dtype} instead." 
- ) - - if self.amp_dtype is not None: - assert self.amp_dtype in ["bfloat16", "float16", "float32"], ( - "`amp_dtype` must be one of ['bfloat16', 'float16', 'float32']. " - f"Got {self.amp_dtype} instead." - ) - - if self.quantization_strategy is not None: - assert self.quantization_strategy in ["bnb", "gptq"], ( - "`quantization_strategy` must be one of ['bnb', 'gptq']. " - f"Got {self.quantization_strategy} instead." - ) - if self.quantization_strategy == "gptq": - bits = self.quantization_config.get("bits", None) - assert bits is not None, ( - "`quantization_config.bits` must be provided " - "when using 'gptq' quantization strategy." - ) - else: - self.quantization_config = None - - if self.use_ddp: - self.ddp_config = OmegaConf.merge( - self.ddp_config if self.ddp_config is not None else {}, - DEFAULT_DDP_CONFIG, - ) - - # TODO: support multi-node training. - assert self.ddp_config.max_nodes == 1, ( - "Currently, PyTorch DDP training benchmark " - "only supports training on a single node." - ) - - assert ( - CUDA_VISIBLE_DEVICES is not None - ), "Pytorch DDP training benchmark requires CUDA_VISIBLE_DEVICES to be set." - else: - self.ddp_config = None - - -class PyTorchBackend(Backend): - name: str = "pytorch" - config: PyTorchConfig - - def __init__(self, model: str, task: str, device: str, hub_kwargs: DictConfig): - super().__init__(model, task, device, hub_kwargs) - self.device = torch.device(device) - - LOGGER.info( - f"\t+ Infered AutoModel class {self.automodel_class.__name__} " - f"for task {self.task} and model_type {self.model_type}" - ) - - def configure(self, config: PyTorchConfig) -> None: - super().configure(config) - - # environment options - if self.config.inter_op_num_threads is not None: - LOGGER.info( - "\t+ Setting pytorch " - f"inter_op_num_threads({self.config.inter_op_num_threads}))" - ) - torch.set_num_threads(self.config.inter_op_num_threads) - if self.config.intra_op_num_threads is not None: - LOGGER.info( - "\t+ Setting pytorch " - f"intra_op_num_threads({self.config.intra_op_num_threads}))" - ) - torch.set_num_interop_threads(self.config.intra_op_num_threads) - - # Load config - if self.config.torch_dtype is not None: - if hasattr(torch, self.config.torch_dtype): - self.config.torch_dtype = getattr(torch, self.config.torch_dtype) - - # Inference config - if self.config.disable_grad: - LOGGER.info("\t+ Disabling gradients") - # everything that comes after this will have its gradients disabled - torch.set_grad_enabled(False) - if self.config.amp_dtype is not None: - if hasattr(torch, self.config.amp_dtype): - self.config.amp_dtype = getattr(torch, self.config.amp_dtype) - - # Quantization config - if self.config.quantization_strategy is not None: - if self.config.quantization_strategy == "gptq": - self.config.quantization_config = GPTQConfig( - **self.config.quantization_config - ) - elif self.config.quantization_strategy == "bnb": - self.config.quantization_config = BitsAndBytesConfig( - **self.config.quantization_config - ) - - # Load model - if self.config.no_weights: - self.load_model_from_config() - else: - self.load_model_from_pretrained() - - # Turn on eval mode - if not self.is_diffusion_pipeline() and self.config.eval_mode: - LOGGER.info("\t+ Turning on eval mode") - self.pretrained_model.eval() - - # Turn on BetterTransformer optimizations - if self.config.bettertransformer: - LOGGER.info("\t+ Using optimum.bettertransformer") - self.pretrained_model = BetterTransformer.transform( - self.pretrained_model, - keep_original_model=False, - ) - - # Compile 
model - if self.config.torch_compile: - if self.is_diffusion_pipeline(): - LOGGER.info() - self.pretrained_model.unet = torch.compile( - self.pretrained_model.unet, - **self.config.torch_compile_kwargs, - ) - else: - LOGGER.info("\t+ Using torch.compile on forward pass") - self.pretrained_model.forward = torch.compile( - self.pretrained_model.forward, - **self.config.torch_compile_kwargs, - ) - - # DDP config - if self.config.use_ddp: - self.config.ddp_config = LaunchConfig(**self.config.ddp_config) - - def load_model_from_pretrained(self) -> None: - LOGGER.info(f"\t+ Loading pretrained model weights on device: {self.device}") - if self.is_diffusion_pipeline(): - self.pretrained_model = self.automodel_class.from_pretrained( - pretrained_model_name_or_path=self.model, - torch_dtype=self.config.torch_dtype, - device_map=self.config.device_map, - **self.hub_kwargs, - ) - if self.config.device_map is None: - # Diffusers does not support device_map being a torch.device, - # thus if not provided we move to device here. - self.pretrained_model.to(self.device) - else: - if self.config.device_map is not None: - self.pretrained_model = self.automodel_class.from_pretrained( - pretrained_model_name_or_path=self.model, - quantization_config=self.config.quantization_config, - torch_dtype=self.config.torch_dtype, - device_map=self.config.device_map, - **self.hub_kwargs, - ) - else: - with self.device: - self.pretrained_model = self.automodel_class.from_pretrained( - pretrained_model_name_or_path=self.model, - quantization_config=self.config.quantization_config, - torch_dtype=self.config.torch_dtype, - **self.hub_kwargs, - ) - - def load_model_from_config(self) -> None: - # TODO: create no_weights tests - - LOGGER.info("\t+ Initializing empty weights model on device: meta") - with init_empty_weights(): - self.pretrained_model = self.automodel_class.from_config( - config=self.pretrained_config, - torch_dtype=self.config.torch_dtype, - trust_remote_code=self.hub_kwargs.get("trust_remote_code", False), - ) - - if self.config.quantization_strategy is None: - LOGGER.info(f"\t+ Materializing model on device: {self.device}") - self.pretrained_model.to_empty(device=self.device) - - LOGGER.info("\t+ Randomizing model weights") - randomize_weights(self.pretrained_model) - self.pretrained_model.tie_weights() - else: - LOGGER.info("\t+ Materializing model on device: cpu") - self.pretrained_model.to_empty(device="cpu") - - LOGGER.info("\t+ Randomizing model weights while on device: cpu") - randomize_weights(self.pretrained_model) - self.pretrained_model.tie_weights() - - if self.config.quantization_strategy == "bnb": - quantization_config = BitsAndBytesConfig(**self.quantization_config) - elif self.config.quantization_strategy == "gptq": - raise NotImplementedError( - "GPTQ requires a pretrained model to be loaded. " - "`no_weights` option is not supported with GPTQ." 
- ) - - from accelerate.utils import BnbQuantizationConfig, load_and_quantize_model - - # translating transformers bnb config to accelerate bnb config - bnb_quantization_config = BnbQuantizationConfig( - load_in_4bit=quantization_config.load_in_4bit, - load_in_8bit=quantization_config.load_in_8bit, - # with dummy_weights, we set this to 0 for reproducibility - llm_int8_threshold=0, - torch_dtype=self.config.torch_dtype, - keep_in_fp32_modules=self.pretrained_model.keep_in_fp32_modules - if hasattr(self.pretrained_model, "keep_in_fp32_modules") - else None, - ) - - LOGGER.info("\t+ Quantizing model while on cpu and dispatching to device") - self.pretrained_model = load_and_quantize_model( - model=self.pretrained_model, - bnb_quantization_config=bnb_quantization_config, - device_map=self.config.device_map - if self.config.device_map is not None - else self.device, - ) - - def prepare_for_profiling(self, input_names: List[str]) -> None: - LOGGER.info("Preparing model for profiling") - LOGGER.info("\t+ Symbolicly tracing model") - self.pretrained_model = symbolic_trace( - model=self.pretrained_model, - input_names=input_names, - ) - - LOGGER.info("\t+ Wrapping model with FXProfilingWrapper") - self.pretrained_model = FXProfilingWrapper(self.pretrained_model) - - def forward(self, input: Dict[str, Tensor], **kwargs) -> "ModelOutput": - with torch.autocast( - enabled=self.config.amp_autocast, - device_type=self.device.type, - dtype=self.config.amp_dtype, - ): - output = self.pretrained_model(**input, **kwargs) - - return output - - def generate(self, input: Dict[str, Tensor], **kwargs) -> "ModelOutput": - with torch.autocast( - enabled=self.config.amp_autocast, - device_type=self.device.type, - dtype=self.config.amp_dtype, - ): - output = self.pretrained_model.generate(**input, **kwargs) - - return output - - @record - def train( - self, - training_dataset: "Dataset", - training_arguments: Dict[str, Any], - training_callbacks: List["TrainerCallback"], - training_data_collator: Callable, - ) -> "TrainerState": - args = ( - self.config.use_ddp, - self.pretrained_model, - training_dataset, - training_arguments, - training_callbacks, - training_data_collator, - ) - - if self.config.use_ddp: - # For DDP, we log only the stats from the first rank as transformers does. - # It could make sense to log for all ranks. - results = elastic_launch( - config=self.config.ddp_config, - entrypoint=training_worker, - )(args)[0] - else: - # For DP, we can still use training_worker, - # simply not wrapped by the elastic_launch class. 
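The removed backend above wraps both forward and generate in torch.autocast, driven by the amp_autocast and amp_dtype options. A standalone sketch of that wrapping with a toy CPU module (the model, input, and dtype here are placeholders, not the benchmark's):

import torch

model = torch.nn.Linear(8, 8)
inputs = {"input": torch.randn(2, 8)}

# mirrors the device_type/dtype/enabled arguments the backend passes to torch.autocast
with torch.autocast(device_type="cpu", dtype=torch.bfloat16, enabled=True):
    output = model(inputs["input"])

print(output.dtype)  # typically bfloat16 inside the autocast region on CPU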
- results = training_worker(args) - - return results - - def clean(self) -> None: - super().clean() - - if self.device.type == "cuda": - torch.cuda.empty_cache() - gc.collect() - - -def training_worker(args) -> "TrainerState": - use_ddp = args[0] - pretrained_model = args[1] - training_dataset = args[2] - training_arguments = args[3] - training_callbacks = args[4] - training_data_collator = args[5] - - if use_ddp: - LOGGER_WORKER = get_worker_logger("pytorch-ddp-worker", log_all=False) - - env_variables = [ - "RANK", - "WORLD_SIZE", - "MASTER_ADDR", - "MASTER_PORT", - "TORCHELASTIC_MAX_RESTARTS", - ] - - LOGGER_WORKER.info("Initializing DDP worker") - for env_var in env_variables: - LOGGER_WORKER.info(f"{env_var}: {os.environ.get(env_var)}") - else: - LOGGER_WORKER = LOGGER - - LOGGER_WORKER.info("\t+ Setting dataset format to `torch`.") - training_dataset.set_format( - type="torch", columns=list(training_dataset.features.keys()) - ) - - LOGGER_WORKER.info( - "\t+ Wrapping training arguments with transformers.TrainingArguments" - ) - training_arguments = TrainingArguments(**training_arguments) - - LOGGER_WORKER.info("\t+ Wrapping model with transformers.Trainer") - trainer = Trainer( - model=pretrained_model, - args=training_arguments, - callbacks=training_callbacks, - train_dataset=training_dataset, - data_collator=training_data_collator, - ) - - LOGGER_WORKER.info("\t+ Starting training") - trainer.train() - LOGGER_WORKER.info("\t+ Training finished successfully") - trainer_state = trainer.state - - return trainer_state diff --git a/optimum_benchmark/backends/pytorch/__init__.py b/optimum_benchmark/backends/pytorch/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/optimum_benchmark/backends/pytorch/backned.py b/optimum_benchmark/backends/pytorch/backned.py new file mode 100644 index 000000000..a83482da0 --- /dev/null +++ b/optimum_benchmark/backends/pytorch/backned.py @@ -0,0 +1,265 @@ +import gc +import os +from logging import getLogger +from typing import TYPE_CHECKING, Any, Callable, Dict, List + +import torch +from accelerate import init_empty_weights +from accelerate.utils import BnbQuantizationConfig, load_and_quantize_model +from optimum.bettertransformer import BetterTransformer +from torch.distributed.elastic.multiprocessing.errors import record +from torch.distributed.launcher.api import LaunchConfig, elastic_launch +from transformers import BitsAndBytesConfig, GPTQConfig, Trainer, TrainingArguments +from transformers.utils.fx import symbolic_trace + +if TYPE_CHECKING: + from datasets import Dataset + from transformers import TrainerCallback, TrainerState + from transformers.utils import ModelOutput + +from ...profilers.fx_profiler import FXProfilingWrapper +from ..base import Backend +from .config import PyTorchConfig +from .utils import get_worker_logger, randomize_weights + +# bachend logger +LOGGER = getLogger("pytorch") + + +class PyTorchBackend(Backend[PyTorchConfig]): + NAME: str = "pytorch" + + def __init__(self, model: str, task: str, device: str, hub_kwargs: Dict[str, Any]): + super().__init__(model, task, device, hub_kwargs) + + automodel = self.automodel_class.__name__ + LOGGER.info(f"\t+ Infered AutoModel class {automodel} for task {self.task} and model_type {self.model_type}") + + def configure(self, config: PyTorchConfig) -> None: + super().configure(config) + + # Gradients options + if self.config.disable_grad: + LOGGER.info("\t+ Disabling gradients") + torch.set_grad_enabled(False) + + # Threading options + if 
self.config.inter_op_num_threads is not None: + LOGGER.info(f"\t+ Setting pytorch inter_op_num_threads({self.config.inter_op_num_threads}))") + torch.set_num_threads(self.config.inter_op_num_threads) + if self.config.intra_op_num_threads is not None: + LOGGER.info(f"\t+ Setting pytorch intra_op_num_threads({self.config.intra_op_num_threads}))") + torch.set_num_interop_threads(self.config.intra_op_num_threads) + + # Dtypes options + self.torch_dtype = getattr(torch, self.config.torch_dtype) if self.config.torch_dtype is not None else None + self.amp_dtype = getattr(torch, self.config.amp_dtype) if self.config.amp_dtype is not None else None + + # Load model + if self.config.no_weights: + self.load_model_from_config() + else: + self.load_model_from_pretrained() + + # Eval mode + if self.config.eval_mode: + if self.is_diffusion_pipeline(): + LOGGER.info("\t+ Diffusion pipeline are always in eval mode") + else: + LOGGER.info("\t+ Turning on model's eval mode") + self.pretrained_model.eval() + + # BetterTransformer + if self.config.bettertransformer: + LOGGER.info("\t+ Using optimum.bettertransformer") + self.pretrained_model = BetterTransformer.transform( + self.pretrained_model, + keep_original_model=False, + ) + + # Compile model + if self.config.torch_compile: + if self.is_diffusion_pipeline(): + LOGGER.info("\t+ Using torch.compile on unet forward pass") + # TODO: should we compile vae and/or clip as well ? + self.pretrained_model.unet.forward = torch.compile( + self.pretrained_model.unet.forward, + **self.config.torch_compile_kwargs, + ) + else: + LOGGER.info("\t+ Using torch.compile on forward pass") + self.pretrained_model.forward = torch.compile( + self.pretrained_model.forward, + **self.config.torch_compile_kwargs, + ) + + def load_model_from_pretrained(self) -> None: + if self.config.quantization_strategy == "gptq": + LOGGER.info("\t+ Processing GPTQ config") + quantization_config = GPTQConfig(**self.config.quantization_config) + elif self.config.quantization_strategy == "bnb": + LOGGER.info("\t+ Processing BnB config") + quantization_config = BitsAndBytesConfig(**self.config.quantization_config) + else: + quantization_config = None + + if self.is_diffusion_pipeline(): + LOGGER.info("\t+ Loading diffusion pipeline") + self.pretrained_model = self.automodel_class.from_pretrained( + self.model, + torch_dtype=self.torch_dtype, + device_map=self.config.device_map, + **self.hub_kwargs, + ) + if self.config.device_map is None: + LOGGER.info(f"\t+ Moving diffusion pipeline to device: {self.device}") + # Diffusers does not support loading with torch.device context manager + self.pretrained_model.to(self.device) + else: + if self.config.device_map is not None: + LOGGER.info(f"\t+ Loading model on visible cuda devices with device_map: {self.config.device_map}") + self.pretrained_model = self.automodel_class.from_pretrained( + self.model, + torch_dtype=self.torch_dtype, + device_map=self.config.device_map, + quantization_config=quantization_config, + **self.hub_kwargs, + ) + else: + LOGGER.info(f"\t+ Loading model on device: {self.device}") + with self.device: + self.pretrained_model = self.automodel_class.from_pretrained( + self.model, + torch_dtype=self.torch_dtype, + quantization_config=quantization_config, + **self.hub_kwargs, + ) + + def load_model_from_config(self) -> None: + # TODO: create no_weights tests + LOGGER.info("\t+ Initializing empty weights model on device: meta") + with init_empty_weights(): + self.pretrained_model = self.automodel_class.from_config( + 
config=self.pretrained_config, + torch_dtype=self.config.torch_dtype, + trust_remote_code=self.hub_kwargs.get("trust_remote_code", False), + ) + + if self.config.quantization_strategy is not None: + LOGGER.info("\t+ Materializing model on cpu for quantization to not OOM") + self.pretrained_model.to_empty(device="cpu") + LOGGER.info("\t+ Randomizing model weights") + randomize_weights(self.pretrained_model) + LOGGER.info("\t+ Processing BnB config") + bnb_quantization_config = BnbQuantizationConfig( + **self.config.quantization_config, + torch_dtype=self.config.torch_dtype, + keep_in_fp32_modules=self.pretrained_model.keep_in_fp32_modules + if hasattr(self.pretrained_model, "keep_in_fp32_modules") + else None, + ) + LOGGER.info("\t+ Quantizing model while on cpu and dispatching to device") + self.pretrained_model = load_and_quantize_model( + self.pretrained_model, bnb_quantization_config, device_map=self.config.device_map or self.device + ) + else: + LOGGER.info(f"\t+ Materializing model on device: {self.device}") + self.pretrained_model.to_empty(device=self.device) + LOGGER.info("\t+ Randomizing model weights") + randomize_weights(self.pretrained_model) + + LOGGER.info("\t+ Tying weights") + self.pretrained_model.tie_weights() + + def prepare_for_profiling(self, input_names: List[str]) -> None: + LOGGER.info("Preparing model for profiling") + LOGGER.info("\t+ Symbolicly tracing model") + self.pretrained_model = symbolic_trace(self.pretrained_model, input_names=input_names) + LOGGER.info("\t+ Wrapping model with FXProfilingWrapper") + self.pretrained_model = FXProfilingWrapper(self.pretrained_model) + + def forward(self, input: Dict[str, torch.Tensor], **kwargs) -> "ModelOutput": + if self.is_diffusion_pipeline(): + return super().forward(input, **kwargs) + else: + # TODO: autocast as whole can be managed by one config/kwargs + with torch.autocast(device_type=self.device.type, dtype=self.amp_dtype, enabled=self.config.amp_autocast): + return super().forward(input, **kwargs) + + def generate(self, input: Dict[str, torch.Tensor], **kwargs) -> "ModelOutput": + if self.is_diffusion_pipeline(): + return super().generate(input, **kwargs) + else: + # TODO: autocast as whole can be managed by one config/kwargs + with torch.autocast(device_type=self.device.type, dtype=self.amp_dtype, enabled=self.config.amp_autocast): + return super().generate(input, **kwargs) + + @record + def train( + self, + training_dataset: "Dataset", + training_arguments: Dict[str, Any], + training_callbacks: List["TrainerCallback"], + training_data_collator: Callable, + ) -> "TrainerState": + args = ( + self.config.use_ddp, + self.pretrained_model, + training_dataset, + training_arguments, + training_callbacks, + training_data_collator, + ) + + if self.config.use_ddp: + # For DDP, we log only the state of the first rank as transformers does. + # since the batch size used in measuring the throughput is the one of world size. + ddp_config = LaunchConfig(**self.config.ddp_config) + results = elastic_launch(config=ddp_config, entrypoint=training_worker)(args)[0] + else: + # For DP, we can still use training_worker, simply not wrapped by the elastic_launch class. 
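The train() method above hands the worker function to torchelastic when use_ddp is set. A minimal sketch of that dispatch with a toy entrypoint; the LaunchConfig values mirror the DDP_CONFIG defaults introduced later in this patch, and the worker itself is a stand-in:

import os
from torch.distributed.launcher.api import LaunchConfig, elastic_launch

def toy_worker(args):
    # a real worker builds a transformers.Trainer here; this one just echoes its rank
    return f"rank {os.environ['RANK']} got {args}"

if __name__ == "__main__":
    launch_config = LaunchConfig(
        min_nodes=1,
        max_nodes=1,
        nproc_per_node=2,
        run_id="none",
        rdzv_backend="static",
        rdzv_endpoint="127.0.0.1:29500",
        rdzv_configs={"rank": 0, "timeout": 900},
        max_restarts=0,
        start_method="spawn",
    )
    # elastic_launch returns a dict mapping local rank to the worker's return value;
    # like the backend, only rank 0's result is kept
    results = elastic_launch(config=launch_config, entrypoint=toy_worker)(("toy", "args"))
    print(results[0])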
+ results = training_worker(args) + + return results + + def clean(self) -> None: + super().clean() + + if self.device.type == "cuda": + torch.cuda.empty_cache() + gc.collect() + + +def training_worker(args) -> "TrainerState": + use_ddp = args[0] + pretrained_model = args[1] + training_dataset = args[2] + training_arguments = args[3] + training_callbacks = args[4] + training_data_collator = args[5] + + if use_ddp: + LOGGER_WORKER = get_worker_logger("pytorch-ddp-worker", log_all=False) + env_variables = ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "TORCHELASTIC_MAX_RESTARTS"] + LOGGER_WORKER.info("Initializing DDP worker") + for env_var in env_variables: + LOGGER_WORKER.info(f"{env_var}: {os.environ.get(env_var)}") + else: + LOGGER_WORKER = LOGGER + + LOGGER_WORKER.info("\t+ Setting dataset format to `torch`.") + training_dataset.set_format(type="torch", columns=list(training_dataset.features.keys())) + LOGGER_WORKER.info("\t+ Wrapping training arguments with transformers.TrainingArguments") + training_arguments = TrainingArguments(**training_arguments) + LOGGER_WORKER.info("\t+ Wrapping model with transformers.Trainer") + trainer = Trainer( + model=pretrained_model, + args=training_arguments, + callbacks=training_callbacks, + train_dataset=training_dataset, + data_collator=training_data_collator, + ) + LOGGER_WORKER.info("\t+ Starting training") + trainer.train() + LOGGER_WORKER.info("\t+ Training finished successfully") + return trainer.state diff --git a/optimum_benchmark/backends/pytorch/config.py b/optimum_benchmark/backends/pytorch/config.py new file mode 100644 index 000000000..ab2cc8fa9 --- /dev/null +++ b/optimum_benchmark/backends/pytorch/config.py @@ -0,0 +1,143 @@ +import importlib.metadata +import os +from dataclasses import dataclass, field +from typing import Any, Dict, Optional + +from omegaconf import OmegaConf + +from ..base import BackendConfig + +OmegaConf.register_new_resolver( + "device_count", + lambda: len(os.environ.get("CUDA_VISIBLE_DEVICES", "").split(",")), +) +OmegaConf.register_new_resolver( + "is_inference", + lambda benchmark_name: benchmark_name == "inference", +) +OmegaConf.register_new_resolver( + "pytorch_version", + lambda: importlib.metadata.version("torch"), +) + +DEVICE_MAPS = ["auto", "sequential"] +AMP_DTYPES = ["bfloat16", "float16"] +TORCH_DTYPES = ["bfloat16", "float16", "float32", "auto"] + +GPTQ_CONFIG = { + "bits": 4, +} +BNB_CONFIG = { + "load_in_8bit": False, + "load_in_4bit": False, + "llm_int8_threshold": 0.0, +} +QUANTIZATION_CONFIGS = { + "gptq": GPTQ_CONFIG, + "bnb": BNB_CONFIG, +} +COMPILE_CONFIG = { + "fullgraph": False, + "dynamic": False, + "backend": "inductor", + "mode": None, + "options": None, + "disable": False, +} +# from launchConfig in https://github.com/pytorch/pytorch/blob/v2.0.0/torch/distributed/launcher/api.py#L29 adjusted +# to defaults of torch.distributed.run in https://github.com/pytorch/pytorch/blob/v2.0.0/torch/distributed/run.py#L770 +DDP_CONFIG = { + "min_nodes": 1, + "max_nodes": 1, + "run_id": "none", + "nproc_per_node": "${device_count:}", + "role": "default", + "rdzv_endpoint": "127.0.0.1:29500", + "rdzv_backend": "static", + "rdzv_configs": { + "timeout": 900, + "rank": 0, + }, + "max_restarts": 0, + "monitor_interval": 5, + "start_method": "spawn", + "log_dir": None, + "metrics_cfg": {}, + "local_addr": None, +} + + +@dataclass +class PyTorchConfig(BackendConfig): + name: str = "pytorch" + version: str = "${pytorch_version:}" + _target_: str = 
"optimum_benchmark.backends.pytorch.backned.PyTorchBackend" + + # load options + no_weights: bool = False + device_map: Optional[str] = None + torch_dtype: Optional[str] = None + + # inference options + disable_grad: bool = "${is_inference:${benchmark.name}}" + eval_mode: bool = "${is_inference:${benchmark.name}}" + + # automatic mixed precision options + amp_autocast: bool = False + amp_dtype: Optional[str] = None + + # compilation options + torch_compile: bool = False + torch_compile_config: Dict[str, Any] = field(default_factory=dict) + + # optimization options + bettertransformer: bool = False + + # quantization options + quantization_strategy: Optional[str] = None + quantization_config: Dict[str, Any] = field(default_factory=dict) + + # training options + use_ddp: bool = False + ddp_config: Dict[str, Any] = field(default_factory=dict) + + def __post_init__(self): + CUDA_VISIBLE_DEVICES = os.environ.get("CUDA_VISIBLE_DEVICES", None) + + if self.torch_compile: + self.torch_compile_config = OmegaConf.to_container( + OmegaConf.merge(COMPILE_CONFIG, self.torch_compile_config) + ) + + if self.device_map is not None: + assert CUDA_VISIBLE_DEVICES is not None, "`device_map` can only be used when CUDA_VISIBLE_DEVICES is set." + + if self.device_map not in DEVICE_MAPS: + raise ValueError(f"`device_map` must be one of {DEVICE_MAPS}. Got {self.device_map} instead.") + + if self.torch_dtype is not None: + if self.torch_dtype not in TORCH_DTYPES: + raise ValueError(f"`torch_dtype` must be one of {TORCH_DTYPES}. Got {self.torch_dtype} instead.") + + if self.amp_dtype is not None: + if self.amp_dtype not in AMP_DTYPES: + raise ValueError(f"`amp_dtype` must be one of {AMP_DTYPES}. Got {self.amp_dtype} instead.") + + if self.quantization_strategy is not None: + if self.quantization_strategy not in QUANTIZATION_CONFIGS: + raise ValueError( + f"`quantization_strategy` must be one of {list(QUANTIZATION_CONFIGS.keys())}. Got {self.quantization_strategy} instead." 
+ ) + QUANTIZATION_CONFIG = QUANTIZATION_CONFIGS[self.quantization_strategy] + self.quantization_config = OmegaConf.to_container( + OmegaConf.merge(QUANTIZATION_CONFIG, self.quantization_config) + ) + + if self.use_ddp: + if CUDA_VISIBLE_DEVICES is None: + raise ValueError("`use_ddp` can only be used when CUDA_VISIBLE_DEVICES is set.") + + self.ddp_config = OmegaConf.to_container(OmegaConf.merge(DDP_CONFIG, self.ddp_config), resolve=True) + # TODO: check if it's not possible to use DDP with multiple nodes + if self.ddp_config["max_nodes"] > 1 or self.ddp_config["min_nodes"] > 1: + raise NotImplementedError("Currently, PyTorch DDP benchmark only supports training on a single node.") diff --git a/optimum_benchmark/backends/pytorch/utils.py b/optimum_benchmark/backends/pytorch/utils.py new file mode 100644 index 000000000..38cecdf5e --- /dev/null +++ b/optimum_benchmark/backends/pytorch/utils.py @@ -0,0 +1,35 @@ +import logging.config +import os +from logging import getLogger +from typing import Optional + +import torch +from omegaconf import OmegaConf + + +def randomize_weights(model): + for param in model.parameters(): + if torch.cuda.is_available() and param.device.type == "cpu": + # we take advantage of the fact that a cuda device + # is available to use cuda kernels for randomization + # this is slower than asynchronous randomization while + # model is fully on gpu (because of data transfer) but + # faster than randomization while model is on cpu + param.data.cuda().normal_(mean=0.0, std=0.2).cpu() + else: + param.data.normal_(mean=0.0, std=0.2) + + +def get_worker_logger( + name: Optional[str] = None, + log_all: bool = False, +) -> logging.Logger: + """PyTorch DDP subprocesses do not inherit from Hydra logger. + Thus, we need to reconfigure the logger for the workers. 
+ """ + if os.environ["RANK"] == "0" or log_all: + # TODO: also configure logging for other ranks + hydra_conf = OmegaConf.load(".hydra/hydra.yaml") + logging.config.dictConfig(OmegaConf.to_container(hydra_conf.hydra.job_logging, resolve=True)) + + return getLogger(name) diff --git a/optimum_benchmark/backends/utils.py b/optimum_benchmark/backends/utils.py new file mode 100644 index 000000000..38df49a93 --- /dev/null +++ b/optimum_benchmark/backends/utils.py @@ -0,0 +1,176 @@ +import os +import signal +import subprocess +import time +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union + +if TYPE_CHECKING: + from transformers import ( + FeatureExtractionMixin, + ImageProcessingMixin, + Pipeline, + PretrainedConfig, + PreTrainedTokenizer, + ProcessorMixin, + ) + + PreTrainedProcessor = Union[ + PreTrainedTokenizer, + ImageProcessingMixin, + FeatureExtractionMixin, + ProcessorMixin, + ] + + +def extract_shapes_from_diffusion_pipeline(pipeline: "Pipeline") -> Dict[str, Any]: + # this is the only way I found to extract a diffusion pipeline's "input" shapes + shapes = {} + if hasattr(pipeline, "vae_encoder") and hasattr(pipeline.vae_encoder, "config"): + shapes["num_channels"] = pipeline.vae_encoder.config["out_channels"] + shapes["height"] = pipeline.vae_encoder.config["sample_size"] + shapes["width"] = pipeline.vae_encoder.config["sample_size"] + elif hasattr(pipeline, "vae") and hasattr(pipeline.vae, "config"): + shapes["num_channels"] = pipeline.vae.config.out_channels + shapes["height"] = pipeline.vae.config.sample_size + shapes["width"] = pipeline.vae.config.sample_size + else: + shapes["num_channels"] = -1 + shapes["height"] = -1 + shapes["width"] = -1 + + return shapes + + +def extract_shapes_from_model_artifacts( + config: "PretrainedConfig", processor: Optional["PreTrainedProcessor"] = None +) -> Dict[str, Any]: + shapes = {} + artifacts_dict = {} + + config_dict = {k: v for k, v in config.to_dict().items() if v is not None} + artifacts_dict.update(config_dict) + + if processor is not None and hasattr(processor, "to_dict"): + processor_dict = {k: v for k, v in processor.to_dict().items() if v is not None} + artifacts_dict.update(processor_dict) + + # text input + shapes["vocab_size"] = artifacts_dict.get("vocab_size", 2) + shapes["type_vocab_size"] = artifacts_dict.get("type_vocab_size", 2) + + # image input + shapes["num_channels"] = artifacts_dict.get("num_channels", None) + + image_size = artifacts_dict.get("image_size", None) + if image_size is None: + # processors have different names for the image size + image_size = artifacts_dict.get("size", None) + + if isinstance(image_size, (int, float)): + shapes["height"] = image_size + shapes["width"] = image_size + elif isinstance(image_size, (list, tuple)): + shapes["height"] = image_size[0] + shapes["width"] = image_size[0] + elif isinstance(image_size, dict) and len(image_size) == 2: + shapes["height"] = list(image_size.values())[0] + shapes["width"] = list(image_size.values())[1] + elif isinstance(image_size, dict) and len(image_size) == 1: + shapes["height"] = list(image_size.values())[0] + shapes["width"] = list(image_size.values())[0] + else: + shapes["height"] = None + shapes["width"] = None + + # classification labels (default to 2) + shapes["num_labels"] = len(artifacts_dict.get("id2label", {"0": "LABEL_0", "1": "LABEL_1"})) + + # object detection labels (default to 2) + shapes["num_queries"] = artifacts_dict.get("num_queries", 2) + + return shapes + + +def check_no_process_is_running_on_cuda_device(device_ids: 
List[int]) -> None: + """Raises a RuntimeError if any process is running on the given cuda device.""" + for device_id in device_ids: + # get list of all PIDs running on nvidia devices + pids = [ + int(pid) + for pid in subprocess.check_output(["nvidia-smi", "--query-compute-apps=pid", "--format=csv,noheader"]) + .decode() + .strip() + .split("\n") + if pid != "" + ] + + # get list of PIDs running on cuda device_id + pids_on_device_id = { + pid + for pid in pids + if subprocess.check_output( + [ + "nvidia-smi", + "--query-compute-apps=pid,used_memory", + "--format=csv,noheader,nounits", + f"--id={device_id}", + ] + ) + .decode() + .startswith(f"{pid},") + } + + # TODO: It would be safer to run each run of a sweep in a subprocess. + # Although we can trust PyTorch to clear GPU memory when asked, + # it is not a safe assumption to make for all backends. + if len(pids_on_device_id) > 1 or (len(pids_on_device_id) == 1 and os.getpid() not in pids_on_device_id): + raise RuntimeError( + f"Expected no processes on device {device_id}, " + f"found {len(pids_on_device_id)} processes " + f"with PIDs {pids_on_device_id}." + ) + + +def check_only_this_process_is_running_on_cuda_device(device_ids: List[int], pid) -> None: + """Raises a RuntimeError if at any point in time, there is a process running + on the given cuda device that is not the current process. + """ + while True: + # get list of all PIDs running on nvidia devices + pids = [ + int(pid) + for pid in subprocess.check_output(["nvidia-smi", "--query-compute-apps=pid", "--format=csv,noheader"]) + .decode() + .strip() + .split("\n") + if pid != "" + ] + + for device_id in device_ids: + # get list of PIDs running on cuda device_id + pids_on_device_id = { + pid + for pid in pids + if subprocess.check_output( + [ + "nvidia-smi", + "--query-compute-apps=pid,used_memory", + "--format=csv,noheader,nounits", + f"--id={device_id}", + ] + ) + .decode() + .startswith(f"{pid},") + } + + # check if there is a process running on device_id that is not the current process + if len(pids_on_device_id) > 1: + os.kill(pid, signal.SIGTERM) + raise RuntimeError( + f"Expected only process {pid} on device {device_id}, " + f"found {len(pids_on_device_id)} processes " + f"with PIDs {pids_on_device_id}." 
+ ) + + # sleep for 1 second + time.sleep(1) diff --git a/optimum_benchmark/backends/utils/base_utils.py b/optimum_benchmark/backends/utils/base_utils.py deleted file mode 100644 index 7f357be9d..000000000 --- a/optimum_benchmark/backends/utils/base_utils.py +++ /dev/null @@ -1,92 +0,0 @@ -from typing import Any, Dict, Optional, Union - -from diffusers import DiffusionPipeline -from transformers import ( - ProcessorMixin, - PretrainedConfig, - PreTrainedTokenizer, - ImageProcessingMixin, - FeatureExtractionMixin, -) - - -PreTrainedProcessor = Union[ - PreTrainedTokenizer, - ImageProcessingMixin, - FeatureExtractionMixin, - ProcessorMixin, -] - - -def extract_shapes_from_diffusion_pipeline( - pipeline: DiffusionPipeline, -) -> Dict[str, Any]: - # this is the only way I found to extract a diffusion pipeline's "input" shapes - shapes = {} - if hasattr(pipeline, "vae_encoder") and hasattr(pipeline.vae_encoder, "config"): - shapes["num_channels"] = pipeline.vae_encoder.config["out_channels"] - shapes["height"] = pipeline.vae_encoder.config["sample_size"] - shapes["width"] = pipeline.vae_encoder.config["sample_size"] - elif hasattr(pipeline, "vae") and hasattr(pipeline.vae, "config"): - shapes["num_channels"] = pipeline.vae.config.out_channels - shapes["height"] = pipeline.vae.config.sample_size - shapes["width"] = pipeline.vae.config.sample_size - else: - shapes["num_channels"] = -1 - shapes["height"] = -1 - shapes["width"] = -1 - - return shapes - - -def extract_shapes_from_model_artifacts( - config: PretrainedConfig, - processor: Optional[PreTrainedProcessor] = None, -) -> Dict[str, Any]: - shapes = {} - artifacts_dict = {} - - config_dict = {k: v for k, v in config.to_dict().items() if v is not None} - artifacts_dict.update(config_dict) - - if processor is not None and hasattr(processor, "to_dict"): - processor_dict = {k: v for k, v in processor.to_dict().items() if v is not None} - artifacts_dict.update(processor_dict) - - # text input - shapes["vocab_size"] = artifacts_dict.get("vocab_size", 2) - shapes["type_vocab_size"] = artifacts_dict.get("type_vocab_size", 2) - - # image input - shapes["num_channels"] = artifacts_dict.get("num_channels", None) - - image_size = artifacts_dict.get("image_size", None) - if image_size is None: - # processors have different names for the image size - image_size = artifacts_dict.get("size", None) - - if isinstance(image_size, (int, float)): - shapes["height"] = image_size - shapes["width"] = image_size - elif isinstance(image_size, (list, tuple)): - shapes["height"] = image_size[0] - shapes["width"] = image_size[0] - elif isinstance(image_size, dict) and len(image_size) == 2: - shapes["height"] = list(image_size.values())[0] - shapes["width"] = list(image_size.values())[1] - elif isinstance(image_size, dict) and len(image_size) == 1: - shapes["height"] = list(image_size.values())[0] - shapes["width"] = list(image_size.values())[0] - else: - shapes["height"] = None - shapes["width"] = None - - # classification labels (default to 2) - shapes["num_labels"] = len( - artifacts_dict.get("id2label", {"0": "LABEL_0", "1": "LABEL_1"}) - ) - - # object detection labels (default to 2) - shapes["num_queries"] = artifacts_dict.get("num_queries", 2) - - return shapes diff --git a/optimum_benchmark/backends/utils/neural_compressor_utils.py b/optimum_benchmark/backends/utils/neural_compressor_utils.py deleted file mode 100644 index 96632df48..000000000 --- a/optimum_benchmark/backends/utils/neural_compressor_utils.py +++ /dev/null @@ -1,39 +0,0 @@ 
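The DEFAULT_* dictionaries deleted below are replaced by module-level defaults that each new config class merges with user overrides via OmegaConf in its __post_init__. A minimal sketch of that merge pattern (the keys and values are illustrative):

from omegaconf import OmegaConf

defaults = {"bits": 4, "approach": "static"}
user_overrides = {"approach": "dynamic"}

# user values win; the result is converted back to a plain dict, as in the new __post_init__ methods
merged = OmegaConf.to_container(OmegaConf.merge(defaults, user_overrides))
assert merged == {"bits": 4, "approach": "dynamic"}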
-DEFAULT_QUANTIZATION_CONFIG = { - "device": "cpu", - "backend": "default", - "domain": "auto", - "recipes": {}, - "quant_format": "default", - "inputs": [], - "outputs": [], - "approach": "static", - "calibration_sampling_size": [100], - "op_type_dict": None, - "op_name_dict": None, - "reduce_range": None, - "example_inputs": None, - "excluded_precisions": [], - "quant_level": "auto", - "accuracy_criterion": { - "higher_is_better": True, - "criterion": "relative", - "tolerable_loss": 0.01, - }, - "tuning_criterion": { - "strategy": "basic", - "strategy_kwargs": None, - "timeout": 0, - "max_trials": 100, - "objective": "performance", - }, - "diagnosis": False, -} - -DEFAULT_CALIBRATION_CONFIG = { - "dataset_name": "glue", - "num_samples": 300, - "dataset_config_name": "sst2", - "dataset_split": "train", - "preprocess_batch": True, - "preprocess_class": "optimum_benchmark.preprocessors.glue.GluePreprocessor", -} diff --git a/optimum_benchmark/backends/utils/onnxruntime_utils.py b/optimum_benchmark/backends/utils/onnxruntime_utils.py deleted file mode 100644 index 65568458a..000000000 --- a/optimum_benchmark/backends/utils/onnxruntime_utils.py +++ /dev/null @@ -1,94 +0,0 @@ -from typing import Any, Dict - - -DEFAULT_OPTIMIZATION_CONFIG = { - "optimization_level": 1, # 0, 1, 2, 99 - "optimize_for_gpu": "${is_gpu:${device}}", - "fp16": False, - "enable_transformers_specific_optimizations": True, - "enable_gelu_approximation": False, - "disable_gelu_fusion": False, - "disable_layer_norm_fusion": False, - "disable_attention_fusion": False, - "disable_skip_layer_norm_fusion": True, - "disable_bias_skip_layer_norm_fusion": False, - "disable_bias_gelu_fusion": False, - "use_mask_index": False, - "no_attention_mask": False, - "disable_embed_layer_norm_fusion": True, - "disable_shape_inference": False, - "use_multi_head_attention": False, - "enable_gemm_fast_gelu_fusion": False, - "use_raw_attention_mask": False, - "disable_group_norm_fusion": True, - "disable_packed_kv": True, -} - -DEFAULT_QUANTIZATION_CONFIG = { - "is_static": False, - "format": "QOperator", # QOperator, QDQ - "mode": "IntegerOps", # QLinearOps, IntegerOps - "activations_dtype": "QUInt8", # QInt8, QUInt8 - "activations_symmetric": False, - "weights_dtype": "QInt8", # QInt8, QUInt8 - "weights_symmetric": True, - "per_channel": False, - "reduce_range": False, - "operators_to_quantize": [ - "MatMul", - "Add", - ], -} - -DEFAULT_CALIBRATION_CONFIG = { - "dataset_name": "glue", - "num_samples": 300, - "dataset_config_name": "sst2", - "dataset_split": "train", - "preprocess_batch": True, - "preprocess_class": "optimum_benchmark.preprocessors.glue.GluePreprocessor", -} - - -def infer_device_id(device: str) -> int: - """ - Infer the device id from the given device string. - """ - - import torch - - if device == "cuda": - return torch.cuda.current_device() - elif torch.device(device).type == "cuda": - return torch.device(device).index - elif torch.device(device).type == "cpu": - return -1 - else: - raise ValueError(f"Unknown device '{device}'") - - -def format_ort_quantization_dict(quantization_dict: Dict[str, Any]) -> None: - """ - Format the quantization dictionary for onnxruntime. 
- """ - - from onnxruntime.quantization import QuantFormat, QuantizationMode, QuantType - - if quantization_dict.get("format", None) is not None: - quantization_dict["format"] = QuantFormat.from_string( - quantization_dict["format"] - ) - if quantization_dict.get("mode", None) is not None: - quantization_dict["mode"] = QuantizationMode.from_string( - quantization_dict["mode"] - ) - if quantization_dict.get("activations_dtype", None) is not None: - quantization_dict["activations_dtype"] = QuantType.from_string( - quantization_dict["activations_dtype"] - ) - if quantization_dict.get("weights_dtype", None) is not None: - quantization_dict["weights_dtype"] = QuantType.from_string( - quantization_dict["weights_dtype"] - ) - - return quantization_dict diff --git a/optimum_benchmark/backends/utils/openvino_utils.py b/optimum_benchmark/backends/utils/openvino_utils.py deleted file mode 100644 index 0f1037b77..000000000 --- a/optimum_benchmark/backends/utils/openvino_utils.py +++ /dev/null @@ -1,14 +0,0 @@ -DEFAULT_QUANTIZATION_CONFIG = { - "compression": None, - "input_info": None, - "save_onnx_model": False, -} - -DEFAULT_CALIBRATION_CONFIG = { - "dataset_name": "glue", - "num_samples": 300, - "dataset_config_name": "sst2", - "dataset_split": "train", - "preprocess_batch": True, - "preprocess_class": "optimum_benchmark.preprocessors.glue.GluePreprocessor", -} diff --git a/optimum_benchmark/backends/utils/pytorch_utils.py b/optimum_benchmark/backends/utils/pytorch_utils.py deleted file mode 100644 index 04a2dbecb..000000000 --- a/optimum_benchmark/backends/utils/pytorch_utils.py +++ /dev/null @@ -1,78 +0,0 @@ -from logging import getLogger -from typing import Optional -import logging.config -import os - -import torch -from omegaconf import OmegaConf -from torch.distributed.elastic.multiprocessing import Std - -OmegaConf.register_new_resolver("device_count", lambda: torch.cuda.device_count()) - - -DEFAULT_COMPILE_CONFIG = { - "fullgraph": False, - "dynamic": False, - "backend": "inductor", - "mode": None, - "options": None, - "disable": False, -} - -# from https://github.com/pytorch/pytorch/blob/v2.0.0/torch/distributed/launcher/api.py#L29 -# adjusted to the defaults of torch.distributed.run -# defined in https://github.com/pytorch/pytorch/blob/v2.0.0/torch/distributed/run.py#L770 -# TODO: decide wrther to use torch.distributed.run arguments or the ones from -# torch.distributed.launcher.api -DEFAULT_DDP_CONFIG = { - "min_nodes": 1, - "max_nodes": 1, - "run_id": "none", - "nproc_per_node": "${device_count:}", - "role": "default", - "rdzv_endpoint": "127.0.0.1:29500", - "rdzv_backend": "static", - "rdzv_configs": { - "timeout": 900, - "rank": 0, - }, - "max_restarts": 0, - "monitor_interval": 5, - "start_method": "spawn", - "log_dir": None, - "metrics_cfg": {}, - "local_addr": None, - "redirects": Std.NONE, - "tee": Std.NONE, -} - - -def randomize_weights(model): - for param in model.parameters(): - if torch.cuda.is_available() and param.device.type == "cpu": - # we take advantage of the fact that a cuda device - # is available to use cuda kernels for randomization - # this is slower than asynchronous randomization while - # model is fully on gpu (because of data transfer) but - # faster than randomization while model is on cpu - param.data.cuda().normal_(mean=0.0, std=0.2).cpu() - else: - param.data.normal_(mean=0.0, std=0.2) - - -def get_worker_logger( - name: Optional[str] = None, - log_all: bool = False, -) -> logging.Logger: - """ - PyTorch DDP subprocesses do not inherit from Hydra logger. 
- Thus, we need to reconfigure the logger for the workers. - """ - if os.environ["RANK"] == "0" or log_all: - # TODO: also configure logging for other ranks - hydra_conf = OmegaConf.load(".hydra/hydra.yaml") - logging.config.dictConfig( - OmegaConf.to_container(hydra_conf.hydra.job_logging, resolve=True) - ) - - return getLogger(name) diff --git a/optimum_benchmark/benchmarks/base.py b/optimum_benchmark/benchmarks/base.py index da2721e5d..24cc27961 100644 --- a/optimum_benchmark/benchmarks/base.py +++ b/optimum_benchmark/benchmarks/base.py @@ -1,10 +1,10 @@ +from abc import ABC from dataclasses import dataclass from logging import getLogger -from abc import ABC +from typing import ClassVar, Generic, TypeVar from optimum_benchmark.backends.base import Backend - LOGGER = getLogger("benchmark") @@ -14,15 +14,19 @@ class BenchmarkConfig(ABC): _target_: str -class Benchmark(ABC): - name: str - config: BenchmarkConfig +BenchmarkConfigT = TypeVar("BenchmarkConfigT", bound=BenchmarkConfig) + + +class Benchmark(Generic[BenchmarkConfigT], ABC): + NAME: ClassVar[str] + + config: BenchmarkConfigT def __init__(self) -> None: pass - def configure(self, config: BenchmarkConfig) -> None: - LOGGER.info(f"Configuring {self.name} benchmark") + def configure(self, config: BenchmarkConfigT) -> None: + LOGGER.info(f"Configuring {self.NAME} benchmark") self.config = config def run(self, backend: Backend) -> None: diff --git a/optimum_benchmark/benchmarks/inference.py b/optimum_benchmark/benchmarks/inference.py index afded80c4..eadbc61c5 100644 --- a/optimum_benchmark/benchmarks/inference.py +++ b/optimum_benchmark/benchmarks/inference.py @@ -1,26 +1,18 @@ +import statistics from dataclasses import dataclass, field -from typing import List, Dict, Optional from logging import getLogger -from omegaconf import OmegaConf - +from typing import Any, Dict, List, Optional +from omegaconf import OmegaConf from pandas import DataFrame -import statistics - from ..backends.base import Backend -from .base import Benchmark, BenchmarkConfig from ..generators.input_generator import InputGenerator -from ..utils import TEXT_GENERATION_TASKS, DIFFUSION_TASKS -from ..trackers.memory import memory_tracker_class_for_backend +from ..task_utils import DIFFUSION_TASKS, TEXT_GENERATION_TASKS from ..trackers.latency import latency_tracker_class_for_backend -from .inference_utils import ( - three_sig_figs, - DEFAULT_INPUT_SHAPES, - DEFAULT_GENERATE_KWARGS, - DEFAULT_DIFUSION_KWARGS, -) - +from ..trackers.memory import memory_tracker_class_for_backend +from .base import Benchmark, BenchmarkConfig +from .utils import three_significant_digits_wrapper LOGGER = getLogger("inference") @@ -33,6 +25,19 @@ lambda task: task in DIFFUSION_TASKS, ) +GENERATE_CONFIG = { + "max_new_tokens": 100, + "min_new_tokens": 100, + "do_sample": False, + "use_cache": True, + "pad_token_id": 0, + "num_beams": 1, +} + +DIFUSION_CONFIG = { + "num_images_per_prompt": 1, +} + @dataclass class InferenceConfig(BenchmarkConfig): @@ -41,14 +46,25 @@ class InferenceConfig(BenchmarkConfig): # benchmark options memory: bool = False - warmup_runs: int = 10 duration: int = 10 - # TODO: deprecate this and use `benchmark.duration` + warmup_runs: int = 10 benchmark_duration: Optional[int] = None # input options input_shapes: Dict = field( - default_factory=lambda: DEFAULT_INPUT_SHAPES, + default_factory=lambda: { + # used with all tasks + "batch_size": 2, + # used with text input tasks + "sequence_length": 16, + # used with multiple choice tasks where input + # is of shape 
(batch_size, num_choices, sequence_length) + "num_choices": 1, + # used with audio input tasks + "feature_size": 80, + "nb_max_frames": 3000, + "audio_sequence_length": 16000, + }, ) # TODO: deprecate this and use `benchamrk.generate_kwargs` @@ -56,54 +72,40 @@ class InferenceConfig(BenchmarkConfig): # forward options can_diffuse: bool = "${can_diffuse:${task}}" - forward_kwargs: Optional[Dict] = None + forward_kwargs: Dict[str, Any] = field(default_factory=dict) # generation options can_generate: bool = "${can_generate:${task}}" - generate_kwargs: Optional[Dict] = None + generate_kwargs: Dict[str, Any] = field(default_factory=dict) def __post_init__(self): + if self.can_diffuse: + self.forward_kwargs = OmegaConf.to_container(OmegaConf.merge(self.forward_kwargs, DIFUSION_CONFIG)) + if self.can_generate: - self.generate_kwargs = OmegaConf.merge( - self.generate_kwargs or {}, - DEFAULT_GENERATE_KWARGS, - ) + self.generate_kwargs = OmegaConf.to_container(OmegaConf.merge(self.generate_kwargs, GENERATE_CONFIG)) - if self.can_diffuse: - self.forward_kwargs = OmegaConf.merge( - self.forward_kwargs or {}, - DEFAULT_DIFUSION_KWARGS, - ) + if self.generate_kwargs["max_new_tokens"] != self.generate_kwargs["min_new_tokens"]: + raise ValueError("`max_new_tokens` and `min_new_tokens` must be equal for fixed length output.") if self.new_tokens is not None: LOGGER.warning( - "The `new_tokens` option is deprecated, please use `generate_kwargs` " - "instead. `max_new_tokens` and `min_new_tokens` will be set to the " - "value of `new_tokens`." + "The `new_tokens` option is deprecated, please use `generate_kwargs` instead. " + "`generate_kwargs.max_new_tokens` and `generate_kwargs.min_new_tokens` will be set to the value of `new_tokens`." ) self.generate_kwargs["max_new_tokens"] = self.new_tokens self.generate_kwargs["min_new_tokens"] = self.new_tokens - if self.generate_kwargs is not None: - assert ( - self.generate_kwargs["max_new_tokens"] - == self.generate_kwargs["min_new_tokens"] - ), ( - "`max_new_tokens` and `min_new_tokens` " - "must be equal for fixed length output" - ) - - if self.benchmark_duration is not None: + if self.benchmark_duration: LOGGER.warning( - "The `benchmark_duration` option is deprecated, please use `duration` " - "instead. `duration` will be set to the value of `benchmark_duration`." + "The `benchmark_duration` option is deprecated, please use `duration` instead. " + "`duration` will be set to the value of `benchmark_duration`." 
) self.duration = self.benchmark_duration -class InferenceBenchmark(Benchmark): - name: str = "inference" - config: InferenceConfig +class InferenceBenchmark(Benchmark[InferenceConfig]): + NAME = "inference" def __init__(self): # initialize inference results @@ -114,12 +116,6 @@ def __init__(self): def configure(self, config: InferenceConfig): super().configure(config) - if self.config.forward_kwargs is None: - self.config.forward_kwargs = {} - - if self.config.generate_kwargs is None: - self.config.generate_kwargs = {} - def run(self, backend: Backend) -> None: LOGGER.info("Running inference benchmark") self.config.input_shapes.update(backend.model_shapes) @@ -130,10 +126,6 @@ def run(self, backend: Backend) -> None: pretrained_config=backend.pretrained_config, ) - if self.config.memory: - # if requested, run memory tracking - self.run_memory_tracking(backend) - # run forward pass tracking self.run_forward_tracking(backend) @@ -141,32 +133,12 @@ def run(self, backend: Backend) -> None: # if possible, run generation pass tracking self.run_generate_tracking(backend) - def run_memory_tracking(self, backend: Backend) -> None: - memory_input = self.input_generator.generate( - mode="forward", - ) - - for key, value in memory_input.items(): - if key == "prompt": - continue - memory_input[key] = value.to(backend.device) - - # for backends that require compilation with static shapes - backend.prepare_for_inference(input_shapes=self.config.input_shapes) - - LOGGER.info("\t+ Tracking forward pass peak memory") - memory_tracker = memory_tracker_class_for_backend[backend.config.name](backend) - with memory_tracker.track(interval=self.config.duration // 100): - _ = backend.forward(memory_input) - - self.forward_peak_memory = memory_tracker.get_peak_memory() - LOGGER.info(f"\t+ Forward pass peak memory: {self.forward_peak_memory} (MB)") - def run_forward_tracking(self, backend: Backend) -> None: forward_input = self.input_generator.generate( mode="forward", ) + # TODO: can be handled by the backend later for key, value in forward_input.items(): if key == "prompt": continue @@ -180,24 +152,30 @@ def run_forward_tracking(self, backend: Backend) -> None: _ = backend.forward(forward_input, **self.config.forward_kwargs) LOGGER.info("\t+ Tracking forward pass latency and throughput") - latency_tracker = latency_tracker_class_for_backend[backend.config.name]( - backend - ) + latency_tracker = latency_tracker_class_for_backend[backend.config.name](backend) while sum(self.forward_latencies) < self.config.duration: with latency_tracker.track(): _ = backend.forward(forward_input, **self.config.forward_kwargs) self.forward_latencies = latency_tracker.get_latencies() LOGGER.info(f"\t+ Forward pass latency: {self.forward_latency:.2e} (s)") - LOGGER.info( - f"\t+ Forward pass throughput: {self.forward_throughput:.2f} (samples/s)" - ) + LOGGER.info(f"\t+ Forward pass throughput: {self.forward_throughput:.2f} (samples/s)") + + if self.config.memory: + LOGGER.info("\t+ Tracking forward pass peak memory") + memory_tracker = memory_tracker_class_for_backend[backend.config.name](backend) + with memory_tracker.track(interval=self.config.duration // 100): + _ = backend.forward(forward_input) + + self.forward_peak_memory = memory_tracker.get_peak_memory() + LOGGER.info(f"\t+ Forward pass peak memory: {self.forward_peak_memory} (MB)") def run_generate_tracking(self, backend: Backend) -> None: generate_input = self.input_generator.generate( - mode="forward", + mode="generate", ) + # TODO: can be handled by the backend later for 
key, value in generate_input.items(): if key == "prompt": continue @@ -210,9 +188,7 @@ def run_generate_tracking(self, backend: Backend) -> None: ) LOGGER.info("\t+ Tracking generation latency and throughput") - latency_tracker = latency_tracker_class_for_backend[backend.config.name]( - backend - ) + latency_tracker = latency_tracker_class_for_backend[backend.config.name](backend) while sum(self.generate_latencies) < self.config.duration: with latency_tracker.track(): _ = backend.generate( @@ -222,35 +198,33 @@ def run_generate_tracking(self, backend: Backend) -> None: self.generate_latencies = latency_tracker.get_latencies() LOGGER.info(f"\t+ Generation pass latency: {self.generate_latency:.2e} (s)") - - LOGGER.info( - f"\t+ Generation pass throughput: {self.generate_throughput:.2f} (tokens/s)" - ) + LOGGER.info(f"\t+ Generation pass throughput: {self.generate_throughput:.2f} (tokens/s)") # Metrics @property - @three_sig_figs + @three_significant_digits_wrapper def forward_latency(self) -> float: return statistics.mean(self.forward_latencies) @property - @three_sig_figs + @three_significant_digits_wrapper def forward_throughput(self) -> float: - return ( - self.config.input_shapes["batch_size"] - * self.config.forward_kwargs["num_images_per_prompt"] - / self.forward_latency - if self.config.can_diffuse - else self.config.input_shapes["batch_size"] / self.forward_latency - ) + if self.config.can_diffuse: + return ( + self.config.input_shapes["batch_size"] + * self.config.forward_kwargs["num_images_per_prompt"] + / self.forward_latency + ) + else: + return self.config.input_shapes["batch_size"] / self.forward_latency @property - @three_sig_figs + @three_significant_digits_wrapper def generate_latency(self) -> float: return statistics.mean(self.generate_latencies) @property - @three_sig_figs + @three_significant_digits_wrapper def generate_throughput(self) -> float: return ( self.config.generate_kwargs["min_new_tokens"] @@ -259,14 +233,18 @@ def generate_throughput(self) -> float: ) def get_results_df(self) -> DataFrame: - results_dict = dict() + results_dict = {} + + results_dict["forward.latency(s)"] = self.forward_latency + + if self.config.can_diffuse: + results_dict["forward.throughput(images/s)"] = self.forward_throughput + else: + results_dict["forward.throughput(samples/s)"] = self.forward_throughput if self.config.memory: results_dict["forward.peak_memory(MB)"] = self.forward_peak_memory - results_dict["forward.latency(s)"] = self.forward_latency - results_dict["forward.throughput(samples/s)"] = self.forward_throughput - if self.config.can_generate: results_dict["generate.latency(s)"] = self.generate_latency results_dict["generate.throughput(tokens/s)"] = self.generate_throughput diff --git a/optimum_benchmark/benchmarks/inference_utils.py b/optimum_benchmark/benchmarks/inference_utils.py deleted file mode 100644 index b2280cdc3..000000000 --- a/optimum_benchmark/benchmarks/inference_utils.py +++ /dev/null @@ -1,37 +0,0 @@ -DEFAULT_GENERATE_KWARGS = { - "max_new_tokens": 100, - "min_new_tokens": 100, - "do_sample": False, - "use_cache": True, - "pad_token_id": 0, - "num_beams": 1, -} - -DEFAULT_DIFUSION_KWARGS = { - "num_images_per_prompt": 1, -} - -DEFAULT_INPUT_SHAPES = { - # used with all tasks - "batch_size": 2, - # used with text input tasks - "sequence_length": 16, - # used with multiple choice tasks where input - # is of shape (batch_size, num_choices, sequence_length) - "num_choices": 1, - # used with audio input tasks - "feature_size": 80, - "nb_max_frames": 3000, - 
"audio_sequence_length": 16000, -} - - -def format_float(x: float) -> float: - return float(f"{x:.3g}") - - -def three_sig_figs(func): - def wrapper(*args, **kwargs): - return format_float(func(*args, **kwargs)) - - return wrapper diff --git a/optimum_benchmark/benchmarks/training.py b/optimum_benchmark/benchmarks/training.py index 6ba1ab20b..84b0be949 100644 --- a/optimum_benchmark/benchmarks/training.py +++ b/optimum_benchmark/benchmarks/training.py @@ -1,15 +1,14 @@ -from typing import Any, Dict from dataclasses import dataclass, field from logging import getLogger +from typing import Any, Dict from omegaconf import OmegaConf from pandas import DataFrame from ..backends.base import Backend -from .base import Benchmark, BenchmarkConfig from ..generators.dataset_generator import DatasetGenerator -from .training_utils import MeasurementCallback, get_data_collator - +from .base import Benchmark, BenchmarkConfig +from .utils import MeasurementCallback, get_data_collator LOGGER = getLogger("training") @@ -23,7 +22,7 @@ class TrainingConfig(BenchmarkConfig): _target_: str = "optimum_benchmark.benchmarks.training.TrainingBenchmark" # training options - warmup_steps: int = 2 + warmup_steps: int = 10 # dataset options dataset_shapes: Dict = field( @@ -47,7 +46,8 @@ class TrainingConfig(BenchmarkConfig): default_factory=lambda: { # these are arguments that we set by default # but can be overwritten by the user - "skip_memory_metrics": False, + "skip_memory_metrics": True, + # memory metrics are wrong when using multiple processes "output_dir": "./trainer_output", "use_cpu": "${is_cpu:${device}}", "ddp_find_unused_parameters": False, @@ -58,9 +58,8 @@ class TrainingConfig(BenchmarkConfig): ) -class TrainingBenchmark(Benchmark): - name: str = "training" - config: TrainingConfig +class TrainingBenchmark(Benchmark[TrainingConfig]): + NAME = "training" def __init__(self): # initialize training results @@ -88,14 +87,14 @@ def run(self, backend: "Backend") -> None: self.training_metrics = { # warmup metrics - "warmup_runtime": trainer_state.warmup_runtime, - "warmup_throughput()": trainer_state.warmup_samples_per_second, + "warmup.runtime(s)": trainer_state.warmup_runtime, + "warmup.throughput(samples/s)": trainer_state.warmup_samples_per_second, # training metrics - "train_runtime": trainer_state.train_runtime, - "training_throughput": trainer_state.train_samples_per_second, + "training.runtime(s)": trainer_state.training_runtime, + "training.throughput(samples/s)": trainer_state.training_samples_per_second, # overall training metrics - "overall_train_runtime": trainer_state.overall_train_runtime, - "overall_training_throughput": trainer_state.overall_train_samples_per_second, + "overall_training.runtime(s)": trainer_state.overall_training_runtime, + "overall_training.throughput(samles/s)": (trainer_state.overall_training_samples_per_second), } def get_results_df(self) -> DataFrame: diff --git a/optimum_benchmark/benchmarks/training_utils.py b/optimum_benchmark/benchmarks/training_utils.py deleted file mode 100644 index 097e06c22..000000000 --- a/optimum_benchmark/benchmarks/training_utils.py +++ /dev/null @@ -1,103 +0,0 @@ -from typing import Any, Dict, TYPE_CHECKING -from dataclasses import dataclass -import time - -from transformers import default_data_collator -from transformers import TrainerCallback - -if TYPE_CHECKING: - from transformers import TrainerState, TrainingArguments, TrainerControl - - -@dataclass -class MeasurementCallback(TrainerCallback): - warmup_steps: int - - def on_train_begin( 
- self, - args: "TrainingArguments", - state: "TrainerState", - control: "TrainerControl", - **kwargs, - ): - if state.max_steps <= self.warmup_steps: - # This check is here because max_steps is set only once the training - # is launched, thus we can not check before calling trainer.train(). - raise ValueError( - f"Total training steps {state.max_steps} is smaller " - "than the number of warmup steps {self.warmup_steps}. " - "Please increase the total number of steps (for example by " - "increasing the dataset size)." - ) - - state.warmup_start = time.time_ns() * 1e-9 - state.overall_train_start = time.time_ns() * 1e-9 - - def on_step_begin( - self, - args: "TrainingArguments", - state: "TrainerState", - control: "TrainerControl", - **kwargs, - ): - if state.global_step == self.warmup_steps: - state.warmup_end = time.time_ns() * 1e-9 - state.training_start = time.time_ns() * 1e-9 - elif state.global_step > state.max_steps - 1: - raise ValueError("global_step > state.max_steps - 1") - - def on_train_end( - self, - args: "TrainingArguments", - state: "TrainerState", - control: "TrainerControl", - **kwargs, - ): - state.training_end = time.time_ns() * 1e-9 - state.overall_train_end = time.time_ns() * 1e-9 - - state.total_train_batch_size = ( - args.train_batch_size * args.gradient_accumulation_steps * args.world_size - ) - - # warmup metrics - state.warmup_runtime = state.warmup_end - state.warmup_start - state.num_warmup_samples = self.warmup_steps * state.total_train_batch_size - state.warmup_samples_per_second = ( - state.num_warmup_samples / state.warmup_runtime - ) - # state.warmup_steps_per_second = self.warmup_steps / state.warmup_runtime - - # training metrics - state.train_runtime = state.training_end - state.training_start - state.num_train_steps = state.max_steps - self.warmup_steps - state.num_train_samples = state.num_train_steps * state.total_train_batch_size - state.train_samples_per_second = state.num_train_samples / state.train_runtime - # state.train_steps_per_second = state.num_train_steps / state.train_runtime - - # overall training metrics - state.overall_train_runtime = state.training_end - state.warmup_start - state.overall_train_samples_per_second = ( - state.num_train_samples / state.overall_train_runtime - ) - # state.overall_train_steps_per_second = ( - # state.num_train_steps / state.overall_train_runtime - # ) - - -def get_data_collator(task: str) -> callable: - if task == "object-detection": - return object_detection_data_collator - else: - return default_data_collator - - -def object_detection_data_collator(batch) -> Dict[str, Any]: - import torch - - pixel_values = torch.stack([example["pixel_values"] for example in batch]) - labels = [example["labels"] for example in batch] - return { - "pixel_values": pixel_values, - "labels": labels, - } diff --git a/optimum_benchmark/benchmarks/utils.py b/optimum_benchmark/benchmarks/utils.py new file mode 100644 index 000000000..973274303 --- /dev/null +++ b/optimum_benchmark/benchmarks/utils.py @@ -0,0 +1,87 @@ +import time +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Callable, Dict + +from transformers import TrainerCallback, default_data_collator + +if TYPE_CHECKING: + from transformers import TrainerControl, TrainerState, TrainingArguments + + +def extract_three_significant_digits(x: float) -> float: + return float(f"{x:.3g}") + + +def three_significant_digits_wrapper(func: Callable[..., float]) -> Callable[..., float]: + def wrapper(*args, **kwargs): + return 
extract_three_significant_digits(func(*args, **kwargs)) + + return wrapper + + +@dataclass +class MeasurementCallback(TrainerCallback): + warmup_steps: int + + def on_train_begin(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): + if state.max_steps <= self.warmup_steps: + # This check is here because max_steps is set only once the training + # is launched, thus we can not check before calling trainer.train(). + raise ValueError( + f"Total training steps {state.max_steps} is smaller " + f"than the number of warmup steps {self.warmup_steps}. " + "Please increase the total number of steps (for example by " + "increasing the dataset size)." + ) + + state.warmup_start = time.time_ns() * 1e-9 + state.overall_training_start = time.time_ns() * 1e-9 + + def on_step_begin(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): + if state.global_step == self.warmup_steps: + state.warmup_end = time.time_ns() * 1e-9 + state.training_start = time.time_ns() * 1e-9 + elif state.global_step > state.max_steps - 1: + raise ValueError("global_step > state.max_steps - 1") + + def on_train_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): + state.training_end = time.time_ns() * 1e-9 + state.overall_training_end = time.time_ns() * 1e-9 + + state.total_training_batch_size = args.train_batch_size * args.gradient_accumulation_steps * args.world_size + + # warmup metrics + state.warmup_runtime = state.warmup_end - state.warmup_start + state.num_warmup_samples = self.warmup_steps * state.total_training_batch_size + state.warmup_samples_per_second = state.num_warmup_samples / state.warmup_runtime + state.warmup_steps_per_second = self.warmup_steps / state.warmup_runtime + + # training metrics + state.training_runtime = state.training_end - state.training_start + state.num_training_steps = state.max_steps - self.warmup_steps + state.num_training_samples = state.num_training_steps * state.total_training_batch_size + state.training_samples_per_second = state.num_training_samples / state.training_runtime + state.training_steps_per_second = state.num_training_steps / state.training_runtime + + # overall training metrics + state.overall_training_runtime = state.training_end - state.warmup_start + state.overall_training_samples_per_second = state.num_training_samples / state.overall_training_runtime + state.overall_training_steps_per_second = state.num_training_steps / state.overall_training_runtime + + +def get_data_collator(task: str) -> callable: + if task == "object-detection": + return object_detection_data_collator + else: + return default_data_collator + + +def object_detection_data_collator(batch) -> Dict[str, Any]: + import torch + + pixel_values = torch.stack([example["pixel_values"] for example in batch]) + labels = [example["labels"] for example in batch] + return { + "pixel_values": pixel_values, + "labels": labels, + } diff --git a/optimum_benchmark/env_utils.py b/optimum_benchmark/env_utils.py new file mode 100644 index 000000000..dd496fb49 --- /dev/null +++ b/optimum_benchmark/env_utils.py @@ -0,0 +1,38 @@ +import platform +import re +import subprocess +from logging import getLogger +from typing import Optional + +import psutil + +LOGGER = getLogger("utils") + + +def bytes_to_mega_bytes(bytes: int) -> int: + # Reference: https://en.wikipedia.org/wiki/Byte#Multiple-byte_units + return int(bytes * 1e-6) + + +def get_cpu() -> Optional[str]: + if platform.system() == "Windows": +
return platform.processor() + + elif platform.system() == "Darwin": + command = "sysctl -n machdep.cpu.brand_string" + return str(subprocess.check_output(command, shell=True).strip()) + + elif platform.system() == "Linux": + command = "cat /proc/cpuinfo" + all_info = subprocess.check_output(command, shell=True).decode().strip() + for line in all_info.split("\n"): + if "model name" in line: + return re.sub(".*model name.*:", "", line, 1) + return "Could not find device name" + + else: + raise ValueError(f"Unknown system '{platform.system()}'") + + +def get_cpu_ram_mb(): + return bytes_to_mega_bytes(psutil.virtual_memory().total) diff --git a/optimum_benchmark/experiment.py b/optimum_benchmark/experiment.py index a33f1026b..85253f80d 100644 --- a/optimum_benchmark/experiment.py +++ b/optimum_benchmark/experiment.py @@ -1,31 +1,28 @@ import os import platform -from typing import Any, Type, Dict +from dataclasses import dataclass, field from logging import getLogger -from dataclasses import dataclass, MISSING, field +from typing import Any, Dict, Type import hydra +from accelerate import __version__ as accelerate_version +from diffusers import __version__ as diffusers_version +from hydra.core.config_store import ConfigStore from hydra.utils import get_class +from omegaconf import DictConfig, OmegaConf, SCMode from optimum.exporters import TasksManager -from omegaconf import DictConfig, OmegaConf -from hydra.core.config_store import ConfigStore -from diffusers import __version__ as diffusers_version -from accelerate import __version__ as accelerate_version from optimum.version import __version__ as optimum_version from transformers import __version__ as transformers_version -from .import_utils import ( - is_torch_available, - is_onnxruntime_available, - is_openvino_available, - is_neural_compressor_available, -) from .backends.base import Backend +from .backends.neural_compressor.config import INCConfig +from .backends.onnxruntime.config import ORTConfig +from .backends.openvino.config import OVConfig +from .backends.pytorch.config import PyTorchConfig from .benchmarks.base import Benchmark -from .utils import get_cpu, get_cpu_ram_mb -from .benchmarks.training import TrainingConfig from .benchmarks.inference import InferenceConfig - +from .benchmarks.training import TrainingConfig +from .env_utils import get_cpu, get_cpu_ram_mb LOGGER = getLogger("experiment") @@ -49,13 +46,13 @@ class ExperimentConfig: benchmark: Any # https://github.com/facebookresearch/hydra/issues/1722#issuecomment-883568386 # EXPERIMENT CONFIGURATION - experiment_name: str = MISSING + experiment_name: str # Model name or path (bert-base-uncased, google/vit-base-patch16-224, ...) - model: str = MISSING + model: str # Device name or path (cpu, cuda, cuda:0, ...) - device: str = MISSING + device: str # Task name (text-classification, image-classification, ...)
- task: str = "${infer_task:${model}, ${hub_kwargs.revision}}" + task: str = "${infer_task:${model},${hub_kwargs.revision}}" # ADDITIONAL MODEL CONFIGURATION: Model revision, use_auth_token, trust_remote_code hub_kwargs: Dict = field( @@ -68,6 +65,7 @@ class ExperimentConfig: ) # ENVIRONMENT CONFIGURATION + # TODO: add gpu info when available environment: Dict = field( default_factory=lambda: { "optimum_version": optimum_version, @@ -86,38 +84,17 @@ class ExperimentConfig: # Register configurations cs = ConfigStore.instance() cs.store(name="experiment", node=ExperimentConfig) - -if is_torch_available(): - from optimum_benchmark.backends.pytorch import PyTorchConfig - - cs.store(group="backend", name="pytorch", node=PyTorchConfig) - -if is_onnxruntime_available(): - from optimum_benchmark.backends.onnxruntime import ORTConfig - - cs.store(group="backend", name="onnxruntime", node=ORTConfig) - -if is_openvino_available(): - from optimum_benchmark.backends.openvino import OVConfig - - cs.store(group="backend", name="openvino", node=OVConfig) - -if is_neural_compressor_available(): - from optimum_benchmark.backends.neural_compressor import INCConfig - - cs.store(group="backend", name="neural_compressor", node=INCConfig) - +cs.store(group="backend", name="pytorch", node=PyTorchConfig) +cs.store(group="backend", name="onnxruntime", node=ORTConfig) +cs.store(group="backend", name="openvino", node=OVConfig) +cs.store(group="backend", name="neural_compressor", node=INCConfig) cs.store(group="benchmark", name="inference", node=InferenceConfig) cs.store(group="benchmark", name="training", node=TrainingConfig) @hydra.main(version_base=None) def run_experiment(experiment: DictConfig) -> None: - from omegaconf import SCMode - - experiment = OmegaConf.to_container( - experiment, structured_config_mode=SCMode.INSTANTIATE - ) + experiment = OmegaConf.to_container(experiment, structured_config_mode=SCMode.INSTANTIATE, resolve=True) # Save the config OmegaConf.save(experiment, "hydra_config.yaml", resolve=True) @@ -130,21 +107,23 @@ def run_experiment(experiment: DictConfig) -> None: # Allocate requested backend backend_factory: Type[Backend] = get_class(experiment.backend._target_) backend: Backend = backend_factory( - experiment.model, - experiment.task, - experiment.device, - experiment.hub_kwargs, + task=experiment.task, + model=experiment.model, + device=experiment.device, + hub_kwargs=experiment.hub_kwargs, ) try: + # Configure the backend backend.configure(experiment.backend) - + # Run the benchmark benchmark.run(backend) # Save the benchmark results benchmark.save() - + # Clean up the backend backend.clean() + except Exception as e: - LOGGER.error("Error during benchmarking: %s", e) + LOGGER.error("Error during experiment: %s", e) backend.clean() raise e diff --git a/optimum_benchmark/generators/dataset_generator.py b/optimum_benchmark/generators/dataset_generator.py index 0d5f00e68..e6a9df36f 100644 --- a/optimum_benchmark/generators/dataset_generator.py +++ b/optimum_benchmark/generators/dataset_generator.py @@ -8,18 +8,13 @@ TaskGenerator, ) - LOGGER = getLogger("dataset_generator") class DatasetGenerator: task_generator: TaskGenerator - def __init__( - self, - task: str, - dataset_shapes: Dict[str, int], - ): + def __init__(self, task: str, dataset_shapes: Dict[str, int]): dataset_shapes["batch_size"] = dataset_shapes.pop("dataset_size") if task in TASKS_TO_GENERATORS: @@ -40,7 +35,7 @@ def generate(self) -> Dataset: task_dataset = self.task_generator.generate() task_dataset = 
Dataset.from_dict(task_dataset) task_dataset.set_format( - type="numpy", + type="torch", # for now we're using pytorch tensors columns=list(task_dataset.features.keys()), ) diff --git a/optimum_benchmark/generators/input_generator.py b/optimum_benchmark/generators/input_generator.py index f384abb23..f9858dac3 100644 --- a/optimum_benchmark/generators/input_generator.py +++ b/optimum_benchmark/generators/input_generator.py @@ -1,9 +1,9 @@ -from typing import Dict, List, Union, Optional, TYPE_CHECKING from logging import getLogger +from typing import TYPE_CHECKING, Dict, List, Optional, Union if TYPE_CHECKING: - from transformers import PretrainedConfig import torch + from transformers import PretrainedConfig from optimum_benchmark.generators.model_type_generator import ( SUPPURTED_MODEL_TYPES, @@ -14,32 +14,28 @@ TaskGenerator, ) - LOGGER = getLogger("input_generator") class InputGenerator: - model_type_generator: Optional[ModelTypeGenerator] = None - task_generator: Optional[TaskGenerator] = None + model_type_generator: Optional[ModelTypeGenerator] + task_generator: Optional[TaskGenerator] def __init__( - self, - task: str, - input_shapes: Dict[str, int], - # for model_type_generator - pretrained_config: Optional["PretrainedConfig"] = None, + self, task: str, input_shapes: Dict[str, int], pretrained_config: Optional["PretrainedConfig"] = None ): - if pretrained_config is not None: + if pretrained_config is not None and pretrained_config.model_type in SUPPURTED_MODEL_TYPES: + self.used_generator = "model_type" model_type = pretrained_config.model_type - if ModelTypeGenerator.check_model_type_support(model_type): - LOGGER.info(f"Using {model_type} model type generator") - self.model_type_generator = ModelTypeGenerator( - task=task, - model_type=model_type, - shapes=input_shapes, - pretrained_config=pretrained_config, - ) + LOGGER.info(f"Using {model_type} model type generator") + self.model_type_generator = ModelTypeGenerator( + task=task, + model_type=model_type, + shapes=input_shapes, + pretrained_config=pretrained_config, + ) elif task in TASKS_TO_GENERATORS: + self.used_generator = "task" LOGGER.info(f"Using {task} task generator") self.task_generator = TASKS_TO_GENERATORS[task]( shapes=input_shapes, @@ -59,18 +55,13 @@ def __init__( # TODO: we can drop the torch dependency here by returning a dict of numpy arrays # and then converting them to torch tensors in backend.prepare_for_inference def generate(self, mode: str) -> Dict[str, Union["torch.Tensor", List[str]]]: - if self.model_type_generator is not None: + if self.used_generator == "model_type": dummy_input = self.model_type_generator.generate() - elif self.task_generator is not None: + elif self.used_generator == "task": dummy_input = self.task_generator.generate() if mode == "generate": - if "input_ids" in dummy_input: - # text input - dummy_input = { - "input_ids": dummy_input["input_ids"], - } - elif "pixel_values" in dummy_input: + if "pixel_values" in dummy_input: # image input dummy_input = { "pixel_values": dummy_input["pixel_values"], @@ -85,5 +76,10 @@ def generate(self, mode: str) -> Dict[str, Union["torch.Tensor", List[str]]]: dummy_input = { "input_features": dummy_input["input_features"], } + elif "input_ids" in dummy_input: + # text input + dummy_input = { + "input_ids": dummy_input["input_ids"], + } return dummy_input diff --git a/optimum_benchmark/generators/model_type_generator.py b/optimum_benchmark/generators/model_type_generator.py index 8ca800ac9..d06b512d7 100644 --- 
a/optimum_benchmark/generators/model_type_generator.py +++ b/optimum_benchmark/generators/model_type_generator.py @@ -1,9 +1,8 @@ -from typing import Dict, List from logging import getLogger +from typing import Dict, List -from transformers import PretrainedConfig from optimum.exporters.tasks import TasksManager - +from transformers import PretrainedConfig LOGGER = getLogger("model_type_generator") @@ -11,8 +10,8 @@ class ModelTypeGenerator: - """ - A wrapper around optimum's TasksManager to generate dummy inputs for a given model type. + """A wrapper around optimum's TasksManager to generate dummy inputs + for a given model type. """ def __init__( @@ -30,29 +29,5 @@ def __init__( model_type=model_type, )(pretrained_config) - @staticmethod - def check_model_type_support(model_type: str) -> bool: - return model_type in SUPPURTED_MODEL_TYPES - def generate(self) -> Dict[str, int]: return self.onnx_config.generate_dummy_inputs(framework="pt", **self.shapes) - - -if __name__ == "__main__": - from transformers import AutoConfig - - pretrained_config = AutoConfig.from_pretrained("gpt2") - - assert ModelTypeGenerator.check_model_type_support("gpt2") - - model_input_generator = ModelTypeGenerator( - task="text-generation", - model_type="gpt2", - shapes={ - "batch_size": 1, - "sequence_length": 100, - }, - pretrained_config=pretrained_config, - ) - - print(model_input_generator.generate()) diff --git a/optimum_benchmark/generators/task_generator.py b/optimum_benchmark/generators/task_generator.py index ec6ad9f62..c63ab565f 100644 --- a/optimum_benchmark/generators/task_generator.py +++ b/optimum_benchmark/generators/task_generator.py @@ -1,10 +1,9 @@ +from abc import ABC from logging import getLogger from typing import Tuple -from abc import ABC import torch - LOGGER = getLogger("task_generator") @@ -377,27 +376,3 @@ def generate(self): "stable-diffusion": PromptGenerator, "stable-diffusion-xl": PromptGenerator, } - - -if __name__ == "__main__": - all_shapes = { - "batch_size": 1, - "sequence_length": 16, - "num_choices": 2, - "feature_size": 80, - "nb_max_frames": 3000, - "audio_sequence_length": 16000, - "height": 224, - "width": 224, - "num_labels": 2, - "num_queries": 2, - "vocab_size": 100, - "type_vocab_size": 2, - "num_channels": 3, - } - - for task in TASKS_TO_GENERATORS: - task_input_generator = TASKS_TO_GENERATORS[task]( - shapes=all_shapes, with_labels=True - ) - print(task_input_generator.generate()) diff --git a/optimum_benchmark/import_utils.py b/optimum_benchmark/import_utils.py index cc97c5397..13dee6171 100644 --- a/optimum_benchmark/import_utils.py +++ b/optimum_benchmark/import_utils.py @@ -3,9 +3,7 @@ _torch_available = importlib.util.find_spec("torch") is not None _onnxruntime_available = importlib.util.find_spec("onnxruntime") is not None _is_openvino_available = importlib.util.find_spec("openvino") is not None -_is_neural_compressor_available = ( - importlib.util.find_spec("neural_compressor") is not None -) +_is_neural_compressor_available = importlib.util.find_spec("neural_compressor") is not None def is_torch_available(): diff --git a/optimum_benchmark/profilers/fx_profiler.py b/optimum_benchmark/profilers/fx_profiler.py index 4d972abd4..8a72f2eba 100644 --- a/optimum_benchmark/profilers/fx_profiler.py +++ b/optimum_benchmark/profilers/fx_profiler.py @@ -1,14 +1,12 @@ -from typing import Any, List, Tuple -from logging import getLogger -import torch import time +from logging import getLogger +from typing import Any, List, Tuple - -from torch.fx.graph_module import 
GraphModule +import torch from torch.fx import Interpreter +from torch.fx.graph_module import GraphModule from torch.fx.node import Node - LOGGER = getLogger("fx_profiler") @@ -18,8 +16,7 @@ def __init__(self, module: GraphModule): self.profiling_records: List[Tuple[str, str, float]] = [] def run(self, *args) -> Any: - return_val = super().run(*args) - return return_val + return super().run(*args) def run_node(self, node: Node) -> Any: if self.module.device.type == "cuda": diff --git a/optimum_benchmark/profilers/ort_profiler.py b/optimum_benchmark/profilers/ort_profiler.py index d7f555e26..030dda323 100644 --- a/optimum_benchmark/profilers/ort_profiler.py +++ b/optimum_benchmark/profilers/ort_profiler.py @@ -1,12 +1,10 @@ -from typing import List, Tuple -from logging import getLogger -import pandas as pd import json +from logging import getLogger +from typing import List, Tuple - +import pandas as pd from optimum.onnxruntime import ORTModel - LOGGER = getLogger("ort_profiler") @@ -26,9 +24,7 @@ def get_profiling_records(self) -> List[Tuple[str, str, float]]: profiling_data = profiling_data["traceEvents"] profiling_records = extract_last_run_records(profiling_data) - profiling_records = normalize_records(profiling_records) - - return profiling_records + return normalize_records(profiling_records) def normalize_records(data) -> List[Tuple[str, str, float]]: diff --git a/optimum_benchmark/report.py b/optimum_benchmark/report.py index 9e12d299e..20a2ac286 100644 --- a/optimum_benchmark/report.py +++ b/optimum_benchmark/report.py @@ -1,30 +1,27 @@ -import pandas as pd -import seaborn as sns +from argparse import ArgumentParser from pathlib import Path -from pandas import DataFrame + import matplotlib.pyplot as plt -from omegaconf import OmegaConf +import pandas as pd +import seaborn as sns from flatten_dict import flatten -from argparse import ArgumentParser - -from rich.table import Table +from omegaconf import OmegaConf +from pandas import DataFrame from rich.console import Console +from rich.table import Table from rich.terminal_theme import MONOKAI def gather_inference_report(root_folder: Path) -> DataFrame: # key is path to inference file as string, value is dataframe inference_dfs = { - f.parent.absolute().as_posix(): pd.read_csv(f) - for f in root_folder.glob("**/inference_results.csv") + f.parent.absolute().as_posix(): pd.read_csv(f) for f in root_folder.glob("**/inference_results.csv") } # key is path to config file as string, value is flattened dict config_dfs = { f.parent.absolute() - .as_posix(): pd.DataFrame.from_dict( - flatten(OmegaConf.load(f), reducer="dot"), orient="index" - ) + .as_posix(): pd.DataFrame.from_dict(flatten(OmegaConf.load(f), reducer="dot"), orient="index") .T for f in root_folder.glob("**/hydra_config.yaml") if f.parent.absolute().as_posix() in inference_dfs.keys() @@ -35,8 +32,7 @@ def gather_inference_report(root_folder: Path) -> DataFrame: # Merge inference and config dataframes inference_reports = [ - config_dfs[name].merge(inference_dfs[name], left_index=True, right_index=True) - for name in inference_dfs.keys() + config_dfs[name].merge(inference_dfs[name], left_index=True, right_index=True) for name in inference_dfs.keys() ] # Concatenate all reports @@ -82,9 +78,7 @@ def format_row(row, style=""): return formated_row -def get_inference_rich_table( - inference_report, with_baseline=False, with_generate=False, title="" -): +def get_inference_rich_table(inference_report, with_baseline=False, with_generate=False, title=""): perf_columns = [ 
"forward.latency(s)", "forward.throughput(samples/s)", @@ -107,17 +101,12 @@ def get_inference_rich_table( additional_columns = [ col for col in inference_report.columns - if inference_report[col].nunique() > 1 - and "backend" in col - and "_target_" not in col - and "version" not in col + if inference_report[col].nunique() > 1 and "backend" in col and "_target_" not in col and "version" not in col ] # display interesting columns in multilevel hierarchy display_report = inference_report[additional_columns + perf_columns] - display_report.columns = pd.MultiIndex.from_tuples( - [tuple(col.split(".")) for col in display_report.columns] - ) + display_report.columns = pd.MultiIndex.from_tuples([tuple(col.split(".")) for col in display_report.columns]) # create rich table rich_table = Table(show_header=True, title=title, show_lines=True) @@ -177,9 +166,7 @@ def get_inference_plots(report, with_baseline=False, with_generate=False, subtit ax=ax2, width=0.5, ) - ax2.set_xticklabels( - ax2.get_xticklabels(), rotation=45, horizontalalignment="right" - ) + ax2.set_xticklabels(ax2.get_xticklabels(), rotation=45, horizontalalignment="right") ax2.set_xlabel("Experiment") ax2.set_ylabel("Generate Throughput (tokens/s)") ax2.set_title("Generate Throughput by Experiment" + "\n" + subtitle) @@ -199,9 +186,7 @@ def get_inference_plots(report, with_baseline=False, with_generate=False, subtit if with_generate: # add speedup text on top of each bar - baseline_generate_throughput = report["generate.throughput(tokens/s)"].iloc[ - -1 - ] + baseline_generate_throughput = report["generate.throughput(tokens/s)"].iloc[-1] for p in ax2.patches: speedup = (p.get_height() / baseline_generate_throughput - 1) * 100 ax2.annotate( @@ -210,9 +195,7 @@ def get_inference_plots(report, with_baseline=False, with_generate=False, subtit ha="center", va="center", ) - ax2.set_title( - "Generate Throughput and Speedup by Experiment" + "\n" + subtitle - ) + ax2.set_title("Generate Throughput and Speedup by Experiment" + "\n" + subtitle) return fig1, fig2 @@ -220,16 +203,12 @@ def get_inference_plots(report, with_baseline=False, with_generate=False, subtit def compute_speedup(report, with_generate=False): # compute speedup for each experiment compared to baseline report["forward.speedup(%)"] = ( - report["forward.throughput(samples/s)"] - / report["forward.throughput(samples/s)"].iloc[-1] - - 1 + report["forward.throughput(samples/s)"] / report["forward.throughput(samples/s)"].iloc[-1] - 1 ) * 100 if with_generate: report["generate.speedup(%)"] = ( - report["generate.throughput(tokens/s)"] - / report["generate.throughput(tokens/s)"].iloc[-1] - - 1 + report["generate.throughput(tokens/s)"] / report["generate.throughput(tokens/s)"].iloc[-1] - 1 ) * 100 return report @@ -267,15 +246,11 @@ def generate_report(): report_name = args.report_name # gather experiments reports - inference_experiments = [ - gather_inference_report(experiment) for experiment in experiments_folders - ] + inference_experiments = [gather_inference_report(experiment) for experiment in experiments_folders] inference_report = pd.concat(inference_experiments, axis=0) # sort by forward throughput - inference_report.sort_values( - by="forward.throughput(samples/s)", ascending=False, inplace=True - ) + inference_report.sort_values(by="forward.throughput(samples/s)", ascending=False, inplace=True) # some flags with_baseline = baseline_folder is not None @@ -284,9 +259,7 @@ def generate_report(): if with_baseline: # gather baseline report inference_baseline = 
gather_inference_report(baseline_folder) - assert ( - inference_baseline.shape[0] == 1 - ), "baseline folder should contain only one experiment" + assert inference_baseline.shape[0] == 1, "baseline folder should contain only one experiment" # add baseline to experiment inference_report = pd.concat([inference_report, inference_baseline], axis=0) # compute speedup compared to baseline @@ -302,17 +275,13 @@ def generate_report(): Path(reporting_directory).mkdir(exist_ok=True, parents=True) # rich table - rich_table = get_inference_rich_table( - inference_report, with_baseline, with_generate, report_name - ) + rich_table = get_inference_rich_table(inference_report, with_baseline, with_generate, report_name) console = Console(record=True) console.print(rich_table, justify="left", no_wrap=True) console.save_svg(f"{reporting_directory}/rich_table.svg", theme=MONOKAI) # plots - forward_fig, generate_fig = get_inference_plots( - inference_report, with_baseline, with_generate, report_name - ) + forward_fig, generate_fig = get_inference_plots(inference_report, with_baseline, with_generate, report_name) forward_fig.tight_layout() forward_fig.savefig(f"{reporting_directory}/forward_throughput.png") diff --git a/optimum_benchmark/task_utils.py b/optimum_benchmark/task_utils.py new file mode 100644 index 000000000..de2c95f45 --- /dev/null +++ b/optimum_benchmark/task_utils.py @@ -0,0 +1,39 @@ +DIFFUSION_TASKS = [ + "stable-diffusion", + "stable-diffusion-xl", +] + +TEXT_GENERATION_TASKS = [ + "image-to-text", + "text-generation", + "text2text-generation", + "automatic-speech-recognition", +] + +# let's leave this here for now, it's a good list of tasks supported by transformers +ALL_TASKS = [ + "conversational", + "feature-extraction", + "fill-mask", + "text-generation", + "text2text-generation", + "text-classification", + "token-classification", + "multiple-choice", + "object-detection", + "question-answering", + "image-classification", + "image-segmentation", + "mask-generation", + "masked-im", + "semantic-segmentation", + "automatic-speech-recognition", + "audio-classification", + "audio-frame-classification", + "audio-xvector", + "image-to-text", + "stable-diffusion", + "stable-diffusion-xl", + "zero-shot-image-classification", + "zero-shot-object-detection", +] diff --git a/optimum_benchmark/trackers/latency.py b/optimum_benchmark/trackers/latency.py index 98dc93067..d742a98e1 100644 --- a/optimum_benchmark/trackers/latency.py +++ b/optimum_benchmark/trackers/latency.py @@ -1,9 +1,9 @@ +import time from contextlib import contextmanager from logging import getLogger from typing import List -import torch -import time +import torch LOGGER = getLogger("latency_tracker") @@ -59,9 +59,7 @@ def __init__(self, backend): self.hf_device_map = None self.end_device = self.device if self.device.type == "cuda": - self.device_indexes = { - self.device.index if self.device.index is not None else 0 - } + self.device_indexes = {self.device.index if self.device.index is not None else 0} def _cuda_latency(self): start_event = torch.cuda.Event(enable_timing=True) diff --git a/optimum_benchmark/trackers/memory.py b/optimum_benchmark/trackers/memory.py index c126321a6..b8aa1a199 100644 --- a/optimum_benchmark/trackers/memory.py +++ b/optimum_benchmark/trackers/memory.py @@ -1,13 +1,13 @@ -from multiprocessing.connection import Connection -from multiprocessing import Pipe, Process +import os from contextlib import contextmanager from logging import getLogger +from multiprocessing import Pipe, Process +from 
multiprocessing.connection import Connection + import psutil import torch -import os - -from optimum_benchmark.utils import bytes_to_mega_bytes +from ..env_utils import bytes_to_mega_bytes LOGGER = getLogger("memory_tracker") @@ -32,15 +32,13 @@ def _track_cuda_peak_memory(self): nvml.nvmlInit() handle = nvml.nvmlDeviceGetHandleByIndex( - self.device.index - if self.device.index is not None - else torch.cuda.current_device() + self.device.index if self.device.index is not None else torch.cuda.current_device() ) yield meminfo = nvml.nvmlDeviceGetMemoryInfo(handle) nvml.nvmlShutdown() - # At least for PyTorch, relying on meminfo.used is fine + # At least for PyTorch, relying on meminfo.used is fine # here as PyTorch does not deallocate its cache after running forward. self.peak_memory = max(self.peak_memory, meminfo.used) LOGGER.debug(f"Peak memory usage: {self.get_peak_memory()} MB") @@ -48,9 +46,7 @@ def _track_cuda_peak_memory(self): def _track_cpu_peak_memory(self, interval: float): child_connection, parent_connection = Pipe() # instantiate process - mem_process: Process = PeakMemoryMeasureProcess( - os.getpid(), child_connection, interval - ) + mem_process: Process = PeakMemoryMeasureProcess(os.getpid(), child_connection, interval) mem_process.start() # wait until we get memory parent_connection.recv() @@ -76,9 +72,7 @@ def run(self): while True: process = psutil.Process(self.process_id) - meminfo_attr = ( - "memory_info" if hasattr(process, "memory_info") else "get_memory_info" - ) + meminfo_attr = "memory_info" if hasattr(process, "memory_info") else "get_memory_info" memory = getattr(process, meminfo_attr)()[0] self.mem_usage = max(self.mem_usage, memory) @@ -99,9 +93,7 @@ def __init__(self, backend): self.hf_device_map = backend.pretrained_model.hf_device_map self.device_indexes = set(self.hf_device_map.values()) else: - self.device_indexes = { - self.device.index if self.device.index is not None else 0 - } + self.device_indexes = {self.device.index if self.device.index is not None else 0} # This variable is used only when CUDA device is used. 
self.peak_per_device = [0 for _ in range(len(self.device_indexes))] diff --git a/optimum_benchmark/utils.py b/optimum_benchmark/utils.py deleted file mode 100644 index 001c2f38f..000000000 --- a/optimum_benchmark/utils.py +++ /dev/null @@ -1,195 +0,0 @@ -from typing import Optional, List -from logging import getLogger -import subprocess -import platform -import random -import signal -import time -import re -import os - -import numpy as np -import psutil - -LOGGER = getLogger("utils") - - -def set_seed(seed: int) -> None: - random.seed(seed) - np.random.seed(seed) - os.environ["PYTHONHASHSEED"] = str(seed) - - -def bytes_to_mega_bytes(bytes: int) -> int: - # Reference: https://en.wikipedia.org/wiki/Byte#Multiple-byte_units - return int(bytes * 1e-6) - - -def get_cpu() -> Optional[str]: - if platform.system() == "Windows": - return platform.processor() - - elif platform.system() == "Darwin": - os.environ["PATH"] = os.environ["PATH"] + os.pathsep + "/usr/sbin" - command = "sysctl -n machdep.cpu.brand_string" - return str(subprocess.check_output(command).strip()) - - elif platform.system() == "Linux": - command = "cat /proc/cpuinfo" - all_info = subprocess.check_output(command, shell=True).decode().strip() - for line in all_info.split("\n"): - if "model name" in line: - return re.sub(".*model name.*:", "", line, 1) - return "Could not find device name" - - else: - raise ValueError(f"Unknown system '{platform.system()}'") - - -def get_cpu_ram_mb(): - return bytes_to_mega_bytes(psutil.virtual_memory().total) - - -def check_no_process_is_running_on_cuda_device(device_ids: List[int]) -> None: - """ - Raises a RuntimeError if any process is running on the given cuda device. - """ - - for device_id in device_ids: - # get list of all PIDs running on nvidia devices - pids = [ - int(pid) - for pid in subprocess.check_output( - ["nvidia-smi", "--query-compute-apps=pid", "--format=csv,noheader"] - ) - .decode() - .strip() - .split("\n") - if pid != "" - ] - - # get list of PIDs running on cuda device_id - pids_on_device_id = set( - [ - pid - for pid in pids - if subprocess.check_output( - [ - "nvidia-smi", - "--query-compute-apps=pid,used_memory", - "--format=csv,noheader,nounits", - f"--id={device_id}", - ] - ) - .decode() - .startswith(f"{pid},") - ] - ) - - # TODO: It would be safer to run each run of a sweep in a subprocess. - # Although we can trust PyTorch to clear GPU memory when asked, - # it is not a safe assumption to make for all backends. - if len(pids_on_device_id) > 1 or ( - len(pids_on_device_id) == 1 and os.getpid() not in pids_on_device_id - ): - raise RuntimeError( - f"Expected no processes on device {device_id}, " - f"found {len(pids_on_device_id)} processes " - f"with PIDs {pids_on_device_id}." - ) - - -def check_only_this_process_is_running_on_cuda_device( - device_ids: List[int], pid -) -> None: - """ - Raises a RuntimeError if at any point in time, there is a process running - on the given cuda device that is not the current process. 
- """ - - while True: - # get list of all PIDs running on nvidia devices - pids = [ - int(pid) - for pid in subprocess.check_output( - ["nvidia-smi", "--query-compute-apps=pid", "--format=csv,noheader"] - ) - .decode() - .strip() - .split("\n") - if pid != "" - ] - - for device_id in device_ids: - # get list of PIDs running on cuda device_id - pids_on_device_id = set( - [ - pid - for pid in pids - if subprocess.check_output( - [ - "nvidia-smi", - "--query-compute-apps=pid,used_memory", - "--format=csv,noheader,nounits", - f"--id={device_id}", - ] - ) - .decode() - .startswith(f"{pid},") - ] - ) - - # check if there is a process running on - # device_id that is not the current process - if len(pids_on_device_id) > 1: - os.kill(pid, signal.SIGTERM) - raise RuntimeError( - f"Expected only process {pid} on device {device_id}, " - f"found {len(pids_on_device_id)} processes " - f"with PIDs {pids_on_device_id}." - ) - - # sleep for 1 second - time.sleep(1) - - -DIFFUSION_TASKS = [ - "stable-diffusion", - "stable-diffusion-xl", -] - - -TEXT_GENERATION_TASKS = [ - "text-generation", - "text2text-generation", - "image-to-text", - "automatic-speech-recognition", -] - -# let's leave this here for now, it's a good list of tasks supported by transformers -ALL_TASKS = [ - "conversational", - "feature-extraction", - "fill-mask", - "text-generation", - "text2text-generation", - "text-classification", - "token-classification", - "multiple-choice", - "object-detection", - "question-answering", - "image-classification", - "image-segmentation", - "mask-generation", - "masked-im", - "semantic-segmentation", - "automatic-speech-recognition", - "audio-classification", - "audio-frame-classification", - "audio-xvector", - "image-to-text", - "stable-diffusion", - "stable-diffusion-xl", - "zero-shot-image-classification", - "zero-shot-object-detection", -] diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000..5f0be540b --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,22 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +[tool.black] +line-length = 119 +target-version = ['py37'] + +[tool.ruff] +# Never enforce `E501` and `C901` as they are too strict +ignore = ["E501", "C901"] +select = ["C", "E", "F", "I", "W"] diff --git a/requirements.txt b/requirements.txt index 60f2cf9f2..84e842f5b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,6 +11,7 @@ git+https://github.com/huggingface/diffusers.git omegaconf==2.3.0 hydra-core==1.3.2 hydra_colorlog==1.2.0 +hydra-joblib-launcher==1.2.0 # system py3nvml diff --git a/setup.py b/setup.py index ac2095688..cd83b6f79 100644 --- a/setup.py +++ b/setup.py @@ -1,11 +1,9 @@ -from setuptools import setup, find_packages - +from setuptools import find_packages, setup setup( name="optimum-benchmark", version="0.0.1", packages=find_packages(), - # add pytest as for optimum-benchmark[test] extras_require={ "test": ["pytest"], }, diff --git a/tests/configs/base_config.yaml b/tests/configs/base_config.yaml index 517f15c8b..c9806202f 100644 --- a/tests/configs/base_config.yaml +++ b/tests/configs/base_config.yaml @@ -1,4 +1,4 @@ -# This is a base config file that can potentially be used for all experiments +# This is a base config file that can potentially be used for all tests defaults: - backend: pytorch # default backend - benchmark: inference # default benchmark @@ -6,18 +6,29 @@ defaults: - _self_ # for hydra 1.1 compatibility - override hydra/job_logging: colorlog # colorful logging - override hydra/hydra_logging: colorlog # colorful logging + - override hydra/launcher: joblib # joblib launcher # hydra behavior configuration hydra: run: - dir: runs/${experiment_name} # where to save a run's output + # TODO: put the results somewhere after the workflow is done + dir: tests/runs/${experiment_name} # where to save a run's output sweep: - dir: sweeps/${experiment_name} # where to save a sweep's output + dir: tests/sweeps/${experiment_name} # where to save a sweep's output job: # we change the working directory during the run/sweep directory # this is useful for saving outputs in a separate directory chdir: true + launcher: + # we set the number of jobs to 2 since when using 1, joblib reuses the same process + n_jobs: 2 + prefer: processes + backend: multiprocessing + sweeper: + # now we force the sweeper to run one job at a time, achieving sequential isolation + max_batch_size: 1 backend: + # we turn off isolation checks because tests run on shared resources initial_isolation_check: false continous_isolation_check: false diff --git a/tests/configs/distributed_cuda_pytorch_inference_gpt2.yaml b/tests/configs/distributed_cuda_pytorch_inference_gpt2.yaml index c79b910a6..9524d0aec 100644 --- a/tests/configs/distributed_cuda_pytorch_inference_gpt2.yaml +++ b/tests/configs/distributed_cuda_pytorch_inference_gpt2.yaml @@ -4,7 +4,9 @@ defaults: experiment_name: distributed_cuda_pytorch_inference_gpt2 +# tiny-gpt2 fails probably because it's just too small to distribute model: gpt2 +task: text-generation device: cuda backend: @@ -14,6 +16,3 @@ hydra: job: env_set: CUDA_VISIBLE_DEVICES: 0,1 - sweeper: - params: - benchmark.input_shapes.batch_size: 1,2,4 diff --git a/tests/configs/distributed_cuda_pytorch_training_bert_ddp.yaml b/tests/configs/distributed_cuda_pytorch_training_bert_ddp.yaml index 2db9b8661..c2ea41614 100644 --- a/tests/configs/distributed_cuda_pytorch_training_bert_ddp.yaml +++ b/tests/configs/distributed_cuda_pytorch_training_bert_ddp.yaml @@ -4,16 +4,20 @@ defaults: - override benchmark: training experiment_name: distributed_cuda_pytorch_training_bert_ddp + 
+model: hf-internal-testing/tiny-random-bert task: text-classification -model: bert-base-uncased device: cuda backend: use_ddp: true + ddp_config: + # let's not use the default port to avoid network conflicts + rdzv_endpoint: 127.0.0.1:29509 benchmark: dataset_shapes: - dataset_size: 120 + dataset_size: 1200 sequence_length: 256 training_arguments: per_device_train_batch_size: 32 diff --git a/tests/configs/distributed_cuda_pytorch_training_bert_dp.yaml b/tests/configs/distributed_cuda_pytorch_training_bert_dp.yaml index 71bd5b7e9..a1996f235 100644 --- a/tests/configs/distributed_cuda_pytorch_training_bert_dp.yaml +++ b/tests/configs/distributed_cuda_pytorch_training_bert_dp.yaml @@ -5,7 +5,7 @@ defaults: experiment_name: distributed_cuda_pytorch_training_bert_dp -model: bert-base-uncased +model: hf-internal-testing/tiny-random-bert task: text-classification device: cuda diff --git a/tests/test_cli.py b/tests/test_cli.py index f9d4d39f7..c0a288ad0 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,28 +1,15 @@ import os -import pytest import subprocess -from omegaconf import OmegaConf +import pytest SINGLE_DEVICE_RUNS = [ - config - for config in os.listdir("tests/configs") - if config.endswith(".yaml") - and config != "base_config.yaml" - and "distributed" not in config -] - -DISTRIBUTED_RUNS = [ - config - for config in os.listdir("tests/configs") - if config.endswith(".yaml") - and config != "base_config.yaml" - and "distributed" in config + config for config in os.listdir("tests/configs") if config.endswith(".yaml") and config != "base_config.yaml" ] @pytest.mark.parametrize("config_file", SINGLE_DEVICE_RUNS) -def test_single_device_runs(config_file): +def test_configs(config_file): config_name = config_file.split(".")[0] result = subprocess.run( @@ -32,31 +19,10 @@ def test_single_device_runs(config_file): "tests/configs", "--config-name", config_name, + # "--multirun", + # TODO: might be worth removing names from yaml configs and have a list of test models here ], capture_output=True, ) assert result.returncode == 0, result.stderr.decode("utf-8") - - -@pytest.mark.parametrize("config_file", DISTRIBUTED_RUNS) -def test_distributed_runs(config_file): - config_name = config_file.split(".")[0] - - env_set = OmegaConf.load(f"tests/configs/{config_file}")["hydra"]["job"]["env_set"] - my_env = os.environ.copy() - my_env.update(env_set) - - result = subprocess.run( - [ - "optimum-benchmark", - "--config-dir", - "tests/configs", - "--config-name", - config_name, - ], - capture_output=True, - env=my_env, - ) - - assert result.returncode == 0, result.stderr.decode("utf-8") From 0b80878ed3053cfd2ce821538def0cb5cd9b6135 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 28 Aug 2023 05:50:57 +0200 Subject: [PATCH 5/8] use to_object for safety and update main_export function --- .../backends/neural_compressor/config.py | 6 +- .../backends/onnxruntime/backend.py | 8 +- .../backends/onnxruntime/config.py | 16 +-- optimum_benchmark/backends/openvino/config.py | 4 +- optimum_benchmark/backends/optimum_utils.py | 105 +++++++++++++++++- optimum_benchmark/backends/pytorch/config.py | 8 +- optimum_benchmark/benchmarks/inference.py | 4 +- optimum_benchmark/benchmarks/training.py | 2 +- optimum_benchmark/experiment.py | 28 +++-- 9 files changed, 139 insertions(+), 42 deletions(-) diff --git a/optimum_benchmark/backends/neural_compressor/config.py b/optimum_benchmark/backends/neural_compressor/config.py index 1a1fcb845..1108a000c 100644 --- 
a/optimum_benchmark/backends/neural_compressor/config.py +++ b/optimum_benchmark/backends/neural_compressor/config.py @@ -76,13 +76,11 @@ class INCConfig(BackendConfig): def __post_init__(self): if self.ptq_quantization: - self.ptq_quantization_config = OmegaConf.to_container( + self.ptq_quantization_config = OmegaConf.to_object( OmegaConf.merge(PTQ_QUANTIZATION_CONFIG, self.ptq_quantization_config) ) if self.ptq_quantization_config["approach"] == "static" and not self.calibration: raise ValueError("Calibration must be enabled when using static quantization.") if self.calibration: - self.calibration_config = OmegaConf.to_container( - OmegaConf.merge(CALIBRATION_CONFIG, self.calibration_config) - ) + self.calibration_config = OmegaConf.to_object(OmegaConf.merge(CALIBRATION_CONFIG, self.calibration_config)) diff --git a/optimum_benchmark/backends/onnxruntime/backend.py b/optimum_benchmark/backends/onnxruntime/backend.py index a77ad8ad9..79ba0d9fb 100644 --- a/optimum_benchmark/backends/onnxruntime/backend.py +++ b/optimum_benchmark/backends/onnxruntime/backend.py @@ -198,10 +198,14 @@ def optimize_onnx_files(self) -> None: LOGGER.info("\t+ Processing optimization config") if self.config.auto_optimization is not None: optimization_config = AutoOptimizationConfig.with_optimization_level( - optimization_level=self.config.auto_optimization, **self.config.auto_optimization_config + optimization_level=self.config.auto_optimization, + for_gpu=self.device.type == "cuda", + **self.config.auto_optimization_config, ) elif self.config.optimization: - optimization_config = OptimizationConfig(**self.config.optimization_config) + optimization_config = OptimizationConfig( + optimize_for_gpu=self.device.type == "cuda", **self.config.optimization_config + ) LOGGER.info("\t+ Creating optimizer") optimizer = ORTOptimizer.from_pretrained(self.model, file_names=self.onnx_files_names) LOGGER.info("\t+ Optimizing ORTModel") diff --git a/optimum_benchmark/backends/onnxruntime/config.py b/optimum_benchmark/backends/onnxruntime/config.py index decf5d482..3564a0cbf 100644 --- a/optimum_benchmark/backends/onnxruntime/config.py +++ b/optimum_benchmark/backends/onnxruntime/config.py @@ -42,7 +42,6 @@ def onnxruntime_version(): OPTIMIZATION_CONFIG = { "optimization_level": 1, # 0, 1, 2, 99 - "optimize_for_gpu": "${is_gpu:${device}}", "fp16": False, "enable_transformers_specific_optimizations": True, "enable_gelu_approximation": False, @@ -64,8 +63,7 @@ def onnxruntime_version(): } AUTO_OPTIMIZATION_CONFIG = { - "for_gpu": "${is_gpu:${device}}", - # full auto optimization config depends on the level so we keep it minimal + # auto optimization config depends on the level so we keep it minimal } QUANTIZATION_CONFIG = { @@ -153,11 +151,11 @@ def __post_init__(self): raise NotImplementedError("Can't convert an exported model's weights to a different dtype.") if self.optimization: - self.optimization_config = OmegaConf.to_container( + self.optimization_config = OmegaConf.to_object( OmegaConf.merge(OPTIMIZATION_CONFIG, self.optimization_config) ) if self.quantization: - self.quantization_config = OmegaConf.to_container( + self.quantization_config = OmegaConf.to_object( OmegaConf.merge(QUANTIZATION_CONFIG, self.quantization_config) ) # raise ValueError if the quantization is static but calibration is not enabled @@ -167,11 +165,11 @@ def __post_init__(self): ) if self.auto_optimization is not None: - self.auto_optimization_config = OmegaConf.to_container( + self.auto_optimization_config = OmegaConf.to_object( 
OmegaConf.merge(AUTO_OPTIMIZATION_CONFIG, self.auto_optimization_config) ) if self.auto_quantization is not None: - self.auto_quantization_config = OmegaConf.to_container( + self.auto_quantization_config = OmegaConf.to_object( OmegaConf.merge(AUTO_QUANTIZATION_CONFIG, self.auto_quantization_config) ) if self.auto_quantization_config["is_static"] and not self.calibration: @@ -180,6 +178,4 @@ def __post_init__(self): ) if self.calibration: - self.calibration_config = OmegaConf.to_container( - OmegaConf.merge(CALIBRATION_CONFIG, self.calibration_config) - ) + self.calibration_config = OmegaConf.to_object(OmegaConf.merge(CALIBRATION_CONFIG, self.calibration_config)) diff --git a/optimum_benchmark/backends/openvino/config.py b/optimum_benchmark/backends/openvino/config.py index e54c2aefd..1f6a49aea 100644 --- a/optimum_benchmark/backends/openvino/config.py +++ b/optimum_benchmark/backends/openvino/config.py @@ -53,12 +53,12 @@ class OVConfig(BackendConfig): def __post_init__(self): if self.quantization: - self.quantization_config = OmegaConf.to_container( + self.quantization_config = OmegaConf.to_object( OmegaConf.merge(QUANTIZATION_CONFIG, self.quantization_config) ) if not self.calibration: raise ValueError("OpenVINO quantization requires enabling calibration.") else: - self.calibration_config = OmegaConf.to_container( + self.calibration_config = OmegaConf.to_object( OmegaConf.merge(CALIBRATION_CONFIG, self.calibration_config) ) diff --git a/optimum_benchmark/backends/optimum_utils.py b/optimum_benchmark/backends/optimum_utils.py index a064cba08..27cb3da55 100644 --- a/optimum_benchmark/backends/optimum_utils.py +++ b/optimum_benchmark/backends/optimum_utils.py @@ -18,6 +18,7 @@ export_models, is_torch_available, logger, + maybe_load_preprocessors, maybe_save_preprocessors, ) @@ -37,7 +38,7 @@ def main_export( fp16: Optional[bool] = False, optimize: Optional[str] = None, monolith: bool = False, - # no_post_process: bool = False, + no_post_process: bool = False, framework: Optional[str] = None, atol: Optional[float] = None, cache_dir: Optional[str] = None, @@ -49,16 +50,101 @@ def main_export( local_files_only: bool = False, use_auth_token: Optional[Union[bool, str]] = None, for_ort: bool = False, - # do_validation: bool = True, + do_validation: bool = True, model_kwargs: Optional[Dict[str, Any]] = None, custom_onnx_configs: Optional[Dict[str, "OnnxConfig"]] = None, fn_get_submodels: Optional[Callable] = None, - # use_subprocess: bool = False, + use_subprocess: bool = False, + _variant: str = "default", ######################################## model: Optional["PreTrainedModel"] = None, ######################################## **kwargs_shapes, ): + """ + Full-suite ONNX export. + + Args: + > Required parameters + + model_name_or_path (`str`): + Model ID on huggingface.co or path on disk to the model repository to export. + output (`Union[str, Path]`): + Path indicating the directory where to store the generated ONNX model. + + > Optional parameters + + task (`Optional[str]`, defaults to `None`): + The task to export the model for. If not specified, the task will be auto-inferred based on the model. For decoder models, + use `xxx-with-past` to export the model using past key values in the decoder. + opset (`Optional[int]`, defaults to `None`): + If specified, ONNX opset version to export the model with. Otherwise, the default opset for the given model architecture + will be used. + device (`str`, defaults to `"cpu"`): + The device to use to do the export. Defaults to "cpu". 
+ fp16 (`Optional[bool]`, defaults to `"False"`): + Use half precision during the export. PyTorch-only, requires `device="cuda"`. + optimize (`Optional[str]`, defaults to `None`): + Allows to run ONNX Runtime optimizations directly during the export. Some of these optimizations are specific to + ONNX Runtime, and the resulting ONNX will not be usable with other runtime as OpenVINO or TensorRT. + Available options: `"O1", "O2", "O3", "O4"`. Reference: [`~optimum.onnxruntime.AutoOptimizationConfig`] + monolith (`bool`, defaults to `False`): + Forces to export the model as a single ONNX file. + no_post_process (`bool`, defaults to `False`): + Allows to disable any post-processing done by default on the exported ONNX models. + framework (`Optional[str]`, defaults to `None`): + The framework to use for the ONNX export (`"pt"` or `"tf"`). If not provided, will attempt to automatically detect + the framework for the checkpoint. + atol (`Optional[float]`, defaults to `None`): + If specified, the absolute difference tolerance when validating the model. Otherwise, the default atol for the model will be used. + cache_dir (`Optional[str]`, defaults to `None`): + Path indicating where to store cache. The default Hugging Face cache path will be used by default. + trust_remote_code (`bool`, defaults to `False`): + Allows to use custom code for the modeling hosted in the model repository. This option should only be set for repositories + you trust and in which you have read the code, as it will execute on your local machine arbitrary code present in the + model repository. + pad_token_id (`Optional[int]`, defaults to `None`): + This is needed by some models, for some tasks. If not provided, will attempt to use the tokenizer to guess it. + subfolder (`str`, defaults to `""`): + In case the relevant files are located inside a subfolder of the model repo either locally or on huggingface.co, you can + specify the folder name here. + revision (`str`, defaults to `"main"`): + Revision is the specific model version to use. It can be a branch name, a tag name, or a commit id. + force_download (`bool`, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + local_files_only (`Optional[bool]`, defaults to `False`): + Whether or not to only look at local files (i.e., do not try to download the model). + use_auth_token (`Optional[str]`, defaults to `None`): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `transformers-cli login` (stored in `~/.huggingface`). + model_kwargs (`Optional[Dict[str, Any]]`, defaults to `None`): + Experimental usage: keyword arguments to pass to the model during + the export. This argument should be used along the `custom_onnx_configs` argument + in case, for example, the model inputs/outputs are changed (for example, if + `model_kwargs={"output_attentions": True}` is passed). + custom_onnx_configs (`Optional[Dict[str, OnnxConfig]]`, defaults to `None`): + Experimental usage: override the default ONNX config used for the given model. This argument may be useful for advanced users that desire a finer-grained control on the export. An example is available [here](https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model). + fn_get_submodels (`Optional[Callable]`, defaults to `None`): + Experimental usage: Override the default submodels that are used at the export. 
This is + especially useful when exporting a custom architecture that needs to split the ONNX (e.g. encoder-decoder). If unspecified with custom models, optimum will try to use the default submodels used for the given task, with no guarantee of success. + use_subprocess (`bool`): + Do the ONNX exported model validation in subprocesses. This is especially useful when + exporting on CUDA device, where ORT does not release memory at inference session + destruction. When set to `True`, the `main_export` call should be guarded in + `if __name__ == "__main__":` block. + _variant (`str`, defaults to `default`): + Specify the variant of the ONNX export to use. + **kwargs_shapes (`Dict`): + Shapes to use during inference. This argument allows to override the default shapes used during the ONNX export. + + Example usage: + ```python + >>> from optimum.exporters.onnx import main_export + + >>> main_export("gpt2", output="gpt2_onnx/") + ``` + """ if optimize == "O4" and device != "cuda": raise ValueError( "Requested O4 optimization, but this optimization requires to do the export on GPU." @@ -180,6 +266,11 @@ def main_export( possible_synonyms = "" logger.info(f"Automatic task detection to {task}{possible_synonyms}.") + # The preprocessors are loaded as they may be useful to export the model. Notably, some of the static input shapes may be stored in the + # preprocessors config. + preprocessors = maybe_load_preprocessors( + model_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code + ) onnx_config, models_and_onnx_configs = _get_submodels_and_onnx_configs( model=model, task=task, @@ -187,6 +278,8 @@ def main_export( custom_onnx_configs=custom_onnx_configs if custom_onnx_configs is not None else {}, custom_architecture=custom_architecture, fn_get_submodels=fn_get_submodels, + preprocessors=preprocessors, + _variant=_variant, ) if not is_stable_diffusion: @@ -274,6 +367,7 @@ def main_export( dtype="fp16" if fp16 is True else None, model_kwargs=model_kwargs, ) + # for the post processing later we don't wanna keep models if len(models_and_onnx_configs) == 2: models_and_onnx_configs = { @@ -291,8 +385,7 @@ def main_export( return onnx_config, models_and_onnx_configs # if optimize is not None: - # from optimum.onnxruntime import ORTOptimizer - # from optimum.onnxruntime.configuration import AutoOptimizationConfig + # from ...onnxruntime import AutoOptimizationConfig, ORTOptimizer # if onnx_files_subpaths is None: # onnx_files_subpaths = [key + ".onnx" for key in models_and_onnx_configs.keys()] @@ -308,7 +401,7 @@ def main_export( # if not no_post_process and not is_stable_diffusion: # try: # logger.info("Post-processing the exported models...") - # (models_and_onnx_configs, onnx_files_subpaths) = onnx_config.post_process_exported_models( + # models_and_onnx_configs, onnx_files_subpaths = onnx_config.post_process_exported_models( # output, models_and_onnx_configs, onnx_files_subpaths # ) # except Exception as e: diff --git a/optimum_benchmark/backends/pytorch/config.py b/optimum_benchmark/backends/pytorch/config.py index ab2cc8fa9..ad8884c1b 100644 --- a/optimum_benchmark/backends/pytorch/config.py +++ b/optimum_benchmark/backends/pytorch/config.py @@ -105,9 +105,7 @@ def __post_init__(self): CUDA_VISIBLE_DEVICES = os.environ.get("CUDA_VISIBLE_DEVICES", None) if self.torch_compile: - self.torch_compile_config = OmegaConf.to_container( - OmegaConf.merge(COMPILE_CONFIG, self.torch_compile_config) - ) + self.torch_compile_config = OmegaConf.to_object(OmegaConf.merge(COMPILE_CONFIG, 
self.torch_compile_config)) if self.device_map is not None: assert CUDA_VISIBLE_DEVICES is not None, "`device_map` can only be used when CUDA_VISIBLE_DEVICES is set." @@ -129,7 +127,7 @@ def __post_init__(self): f"`quantization_strategy` must be one of {list(QUANTIZATION_CONFIGS.keys())}. Got {self.quantization_strategy} instead." ) QUANTIZATION_CONFIG = QUANTIZATION_CONFIGS[self.quantization_strategy] - self.quantization_config = OmegaConf.to_container( + self.quantization_config = OmegaConf.to_object( OmegaConf.merge(QUANTIZATION_CONFIG, self.quantization_config) ) @@ -137,7 +135,7 @@ def __post_init__(self): if CUDA_VISIBLE_DEVICES is None: raise ValueError("`use_ddp` can only be used when CUDA_VISIBLE_DEVICES is set.") - self.ddp_config = OmegaConf.to_container(OmegaConf.merge(DDP_CONFIG, self.ddp_config), resolve=True) + self.ddp_config = OmegaConf.to_object(OmegaConf.merge(DDP_CONFIG, self.ddp_config)) # TODO: check if it's not possible to use DDP with multiple nodes if self.ddp_config["max_nodes"] > 1 or self.ddp_config["min_nodes"] > 1: raise NotImplementedError("Currently, PyTorch DDP benchmark only supports training on a single node.") diff --git a/optimum_benchmark/benchmarks/inference.py b/optimum_benchmark/benchmarks/inference.py index eadbc61c5..2b603cf10 100644 --- a/optimum_benchmark/benchmarks/inference.py +++ b/optimum_benchmark/benchmarks/inference.py @@ -80,10 +80,10 @@ class InferenceConfig(BenchmarkConfig): def __post_init__(self): if self.can_diffuse: - self.forward_kwargs = OmegaConf.to_container(OmegaConf.merge(self.forward_kwargs, DIFUSION_CONFIG)) + self.forward_kwargs = OmegaConf.to_object(OmegaConf.merge(self.forward_kwargs, DIFUSION_CONFIG)) if self.can_generate: - self.generate_kwargs = OmegaConf.to_container(OmegaConf.merge(self.generate_kwargs, GENERATE_CONFIG)) + self.generate_kwargs = OmegaConf.to_object(OmegaConf.merge(self.generate_kwargs, GENERATE_CONFIG)) if self.generate_kwargs["max_new_tokens"] != self.generate_kwargs["min_new_tokens"]: raise ValueError("`max_new_tokens` and `min_new_tokens` must be equal for fixed length output.") diff --git a/optimum_benchmark/benchmarks/training.py b/optimum_benchmark/benchmarks/training.py index 84b0be949..e2bf17cc3 100644 --- a/optimum_benchmark/benchmarks/training.py +++ b/optimum_benchmark/benchmarks/training.py @@ -22,7 +22,7 @@ class TrainingConfig(BenchmarkConfig): _target_: str = "optimum_benchmark.benchmarks.training.TrainingBenchmark" # training options - warmup_steps: int = 10 + warmup_steps: int = 40 # still thinks this too high # dataset options dataset_shapes: Dict = field( diff --git a/optimum_benchmark/experiment.py b/optimum_benchmark/experiment.py index 85253f80d..3859e0319 100644 --- a/optimum_benchmark/experiment.py +++ b/optimum_benchmark/experiment.py @@ -9,7 +9,7 @@ from diffusers import __version__ as diffusers_version from hydra.core.config_store import ConfigStore from hydra.utils import get_class -from omegaconf import DictConfig, OmegaConf, SCMode +from omegaconf import DictConfig, OmegaConf from optimum.exporters import TasksManager from optimum.version import __version__ as optimum_version from transformers import __version__ as transformers_version @@ -94,16 +94,12 @@ class ExperimentConfig: @hydra.main(version_base=None) def run_experiment(experiment: DictConfig) -> None: - experiment = OmegaConf.to_container(experiment, structured_config_mode=SCMode.INSTANTIATE, resolve=True) + # This is required to trigger __post_init__. 
Reference: https://github.com/omry/omegaconf/issues/377 + experiment: ExperimentConfig = OmegaConf.to_object(experiment) # Save the config OmegaConf.save(experiment, "hydra_config.yaml", resolve=True) - # Allocate requested benchmark - benchmark_factory: Type[Benchmark] = get_class(experiment.benchmark._target_) - benchmark: Benchmark = benchmark_factory() - benchmark.configure(experiment.benchmark) - # Allocate requested backend backend_factory: Type[Backend] = get_class(experiment.backend._target_) backend: Backend = backend_factory( @@ -112,18 +108,30 @@ def run_experiment(experiment: DictConfig) -> None: device=experiment.device, hub_kwargs=experiment.hub_kwargs, ) - try: # Configure the backend backend.configure(experiment.backend) + except Exception as e: + LOGGER.error("Error during backend configuration: %s", e) + raise e + + # Allocate requested benchmark + benchmark_factory: Type[Benchmark] = get_class(experiment.benchmark._target_) + benchmark: Benchmark = benchmark_factory() + try: + benchmark.configure(experiment.benchmark) + except Exception as e: + LOGGER.error("Error during benchmark configuration: %s", e) + raise e + + try: # Run the benchmark benchmark.run(backend) # Save the benchmark results benchmark.save() # Clean up the backend backend.clean() - except Exception as e: - LOGGER.error("Error during experiment: %s", e) + LOGGER.error("Error during benchmark execution: %s", e) backend.clean() raise e From cefbad3ff9b2a9c81422b77611b2c7a2160a1549 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 28 Aug 2023 06:34:20 +0200 Subject: [PATCH 6/8] remove dead code --- optimum_benchmark/backends/onnxruntime/config.py | 12 ++++-------- optimum_benchmark/benchmarks/training.py | 2 +- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/optimum_benchmark/backends/onnxruntime/config.py b/optimum_benchmark/backends/onnxruntime/config.py index 3564a0cbf..c41c4b63d 100644 --- a/optimum_benchmark/backends/onnxruntime/config.py +++ b/optimum_benchmark/backends/onnxruntime/config.py @@ -19,10 +19,6 @@ def onnxruntime_version(): return "ort:unknown" -OmegaConf.register_new_resolver( - "is_gpu", - lambda device: "cuda" in device.lower(), -) OmegaConf.register_new_resolver( "is_profiling", lambda benchmark_name: benchmark_name == "profiling", @@ -95,8 +91,6 @@ def onnxruntime_version(): "preprocess_batch": True, "preprocess_class": "optimum_benchmark.preprocessors.glue.GluePreprocessor", } -PROVIDER_OPTIONS = {"device_id": "${infer_device_id:${device}}"} -SESSION_OPTIONS = {"enable_profiling": "${is_profiling:${benchmark.name}}"} @dataclass @@ -116,12 +110,14 @@ class ORTConfig(BackendConfig): # provider options provider: str = "${infer_provider:${device}}" device_id: Optional[int] = "${oc.deprecated:backend.provider_options.device_id}" - provider_options: Dict[str, Any] = field(default_factory=lambda: PROVIDER_OPTIONS) + provider_options: Dict[str, Any] = field(default_factory=lambda: {"device_id": "${infer_device_id:${device}}"}) # inference options use_io_binding: bool = "${is_gpu:${device}}" enable_profiling: bool = "${oc.deprecated:backend.session_options.enable_profiling}" - session_options: Dict[str, Any] = field(default_factory=lambda: SESSION_OPTIONS) + session_options: Dict[str, Any] = field( + default_factory=lambda: {"enable_profiling": "${is_profiling:${benchmark.name}}"} + ) # optimization options optimization: bool = False diff --git a/optimum_benchmark/benchmarks/training.py b/optimum_benchmark/benchmarks/training.py index e2bf17cc3..fd8d6e5ed 100644 --- 
a/optimum_benchmark/benchmarks/training.py +++ b/optimum_benchmark/benchmarks/training.py @@ -22,7 +22,7 @@ class TrainingConfig(BenchmarkConfig): _target_: str = "optimum_benchmark.benchmarks.training.TrainingBenchmark" # training options - warmup_steps: int = 40 # still thinks this too high + warmup_steps: int = 40 # still thinks this too high # dataset options dataset_shapes: Dict = field( From f914694b636d06706a7e5aa8f0907fa379e40965 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 28 Aug 2023 06:49:26 +0200 Subject: [PATCH 7/8] added error for applying gptq --- optimum_benchmark/backends/pytorch/backned.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/optimum_benchmark/backends/pytorch/backned.py b/optimum_benchmark/backends/pytorch/backned.py index a83482da0..6be3dfec7 100644 --- a/optimum_benchmark/backends/pytorch/backned.py +++ b/optimum_benchmark/backends/pytorch/backned.py @@ -9,7 +9,7 @@ from optimum.bettertransformer import BetterTransformer from torch.distributed.elastic.multiprocessing.errors import record from torch.distributed.launcher.api import LaunchConfig, elastic_launch -from transformers import BitsAndBytesConfig, GPTQConfig, Trainer, TrainingArguments +from transformers import BitsAndBytesConfig, Trainer, TrainingArguments # GPTQConfig from transformers.utils.fx import symbolic_trace if TYPE_CHECKING: @@ -96,7 +96,12 @@ def configure(self, config: PyTorchConfig) -> None: def load_model_from_pretrained(self) -> None: if self.config.quantization_strategy == "gptq": LOGGER.info("\t+ Processing GPTQ config") - quantization_config = GPTQConfig(**self.config.quantization_config) + raise NotImplementedError( + "Applying GPTQ quantization on pretrained models is not supported yet. " + "If the model is already quantized, you don't need to specify the quantization strategy." + ) + # need to process dataset, tokenizer, etc. + # quantization_config = GPTQConfig(**self.config.quantization_config) elif self.config.quantization_strategy == "bnb": LOGGER.info("\t+ Processing BnB config") quantization_config = BitsAndBytesConfig(**self.config.quantization_config) From 8900373ae7c2e7aa39998f267b30fddd19fcbd59 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 28 Aug 2023 06:51:32 +0200 Subject: [PATCH 8/8] fix --- optimum_benchmark/backends/onnxruntime/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/optimum_benchmark/backends/onnxruntime/config.py b/optimum_benchmark/backends/onnxruntime/config.py index c41c4b63d..9ae25e927 100644 --- a/optimum_benchmark/backends/onnxruntime/config.py +++ b/optimum_benchmark/backends/onnxruntime/config.py @@ -19,6 +19,7 @@ def onnxruntime_version(): return "ort:unknown" +OmegaConf.register_new_resolver("is_gpu", lambda device: "cuda" in device) OmegaConf.register_new_resolver( "is_profiling", lambda benchmark_name: benchmark_name == "profiling",
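
Note on the recurring `OmegaConf.to_container` -> `OmegaConf.to_object` swap in the config diffs above: only `to_object` re-instantiates the underlying structured-config dataclass, so `__post_init__` validation actually runs on the merged config. A minimal standalone sketch of the difference (the `DummyConfig` dataclass is hypothetical and not part of the repo):

```python
from dataclasses import dataclass

from omegaconf import OmegaConf


@dataclass
class DummyConfig:
    device: str = "cpu"

    def __post_init__(self):
        # the kind of validation the benchmark configs rely on
        if self.device not in ("cpu", "cuda"):
            raise ValueError(f"unsupported device: {self.device}")


cfg = OmegaConf.merge(OmegaConf.structured(DummyConfig), {"device": "cuda"})

as_dict = OmegaConf.to_container(cfg)  # plain dict, __post_init__ never runs
as_obj = OmegaConf.to_object(cfg)      # DummyConfig instance, __post_init__ runs

print(type(as_dict).__name__, type(as_obj).__name__)  # dict DummyConfig
```

This is also why `run_experiment` can drop the explicit `SCMode.INSTANTIATE` call: `OmegaConf.to_object(experiment)` is documented as equivalent to `to_container(experiment, structured_config_mode=SCMode.INSTANTIATE, resolve=True)`, with interpolations resolved before instantiation.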
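
For the GPTQ branch that [PATCH 7/8] turns into a `NotImplementedError`, the commented-out `GPTQConfig(**self.config.quantization_config)` call is not enough on its own, because on-the-fly GPTQ quantization also needs a calibration dataset and a tokenizer. A rough sketch of what that path would involve, assuming the `transformers` GPTQ integration; the model id and dataset choice are placeholders, not what the benchmark would use:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

model_id = "facebook/opt-125m"  # placeholder model
tokenizer = AutoTokenizer.from_pretrained(model_id)

# GPTQ needs calibration data, hence the "need to process dataset, tokenizer" TODO
quantization_config = GPTQConfig(bits=4, dataset="c4", tokenizer=tokenizer)

# quantization happens while loading the weights (requires optimum and auto-gptq)
model = AutoModelForCausalLM.from_pretrained(
    model_id, quantization_config=quantization_config, device_map="auto"
)
```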
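
The one-line fix in [PATCH 8/8] matters because `ORTConfig` still interpolates through the `is_gpu` resolver (e.g. `use_io_binding: bool = "${is_gpu:${device}}"`), which [PATCH 6/8] had removed as apparent dead code. A minimal sketch of how such a custom resolver is consumed; the config keys here are illustrative, not the full `ORTConfig`:

```python
from omegaconf import OmegaConf

# same resolver the patch re-registers
OmegaConf.register_new_resolver("is_gpu", lambda device: "cuda" in device)

cfg = OmegaConf.create(
    {"device": "cuda:1", "use_io_binding": "${is_gpu:${device}}"}
)

print(cfg.use_io_binding)  # True, resolved lazily through the custom resolver
```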