diff --git a/docs/source/en/pipeline_tutorial.mdx b/docs/source/en/pipeline_tutorial.mdx index 8560d856f39e..00dceeb4f243 100644 --- a/docs/source/en/pipeline_tutorial.mdx +++ b/docs/source/en/pipeline_tutorial.mdx @@ -105,6 +105,8 @@ If the model is too large for a single GPU, you can set `device_map="auto"` to a generator(model="openai/whisper-large", device_map="auto") ``` +Note that if `device_map="auto"` is passed, you should not also pass `device=device` when instantiating your `pipeline`, as combining the two may cause unexpected behavior! + ### Batch size By default, pipelines will not batch inference for reasons explained in detail [here](https://huggingface.co/docs/transformers/main_classes/pipelines#pipeline-batching). The reason is that batching is not necessarily faster, and can actually be quite slower in some cases. @@ -257,4 +259,32 @@ sudo apt install -y tesseract-ocr pip install pytesseract ``` - \ No newline at end of file + + +## Using `pipeline` on large models with 🤗 `accelerate`: + +You can easily run `pipeline` on large models using 🤗 `accelerate`! First make sure you have installed `accelerate` with `pip install accelerate`. + +Then load your model using `device_map="auto"`! We will use `facebook/opt-1.3b` for our example. 
+ +```py +# pip install accelerate +import torch +from transformers import pipeline + +pipe = pipeline(model="facebook/opt-1.3b", torch_dtype=torch.bfloat16, device_map="auto") +output = pipe("This is a cool example!", do_sample=True, top_p=0.95) +``` + +You can also pass 8-bit loaded models if you install `bitsandbytes` and add the argument `load_in_8bit=True`. + +```py +# pip install accelerate bitsandbytes +import torch +from transformers import pipeline + +pipe = pipeline(model="facebook/opt-1.3b", device_map="auto", model_kwargs={"load_in_8bit": True}) +output = pipe("This is a cool example!", do_sample=True, top_p=0.95) +``` + +Note that you can replace the checkpoint with any Hugging Face model that supports large model loading, such as BLOOM! \ No newline at end of file diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py index e14d74457990..3d42d483b75d 100755 --- a/src/transformers/pipelines/__init__.py +++ b/src/transformers/pipelines/__init__.py @@ -741,6 +741,11 @@ def pipeline( 'You cannot use both `pipeline(... device_map=..., model_kwargs={"device_map":...})` as those' " arguments might conflict, use only one.)" ) + if device is not None: + logger.warning( + "Both `device` and `device_map` are specified. `device` will override `device_map`. You" + " will most likely encounter unexpected behavior. Please remove `device` and keep `device_map`." + ) model_kwargs["device_map"] = device_map if torch_dtype is not None: if "torch_dtype" in model_kwargs: diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py index 8c552cbdc307..5075fa6c56e6 100644 --- a/src/transformers/pipelines/automatic_speech_recognition.py +++ b/src/transformers/pipelines/automatic_speech_recognition.py @@ -287,9 +287,9 @@ class AutomaticSpeechRecognitionPipeline(ChunkPipeline): installed. If no framework is specified, will default to the one currently installed. 
If no framework is specified and both frameworks are installed, will default to the framework of the `model`, or to PyTorch if no model is provided. - device (`int`, *optional*, defaults to -1): - Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, a positive will run the model on - the associated CUDA device id. + device (Union[`int`, `torch.device`], *optional*): + Device ordinal for CPU/GPU supports. Setting this to `None` will leverage CPU, a positive will run the + model on the associated CUDA device id. decoder (`pyctcdecode.BeamSearchDecoderCTC`, *optional*): [PyCTCDecode's BeamSearchDecoderCTC](https://github.com/kensho-technologies/pyctcdecode/blob/2fd33dc37c4111417e08d89ccd23d28e9b308d19/pyctcdecode/decoder.py#L180) diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py index 3905d28d26d2..80248173487c 100644 --- a/src/transformers/pipelines/base.py +++ b/src/transformers/pipelines/base.py @@ -749,7 +749,7 @@ def __init__( framework: Optional[str] = None, task: str = "", args_parser: ArgumentHandler = None, - device: Union[int, str, "torch.device"] = -1, + device: Union[int, str, "torch.device"] = None, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, binary_output: bool = False, **kwargs, @@ -764,6 +764,19 @@ def __init__( self.image_processor = image_processor self.modelcard = modelcard self.framework = framework + + if self.framework == "pt" and device is not None: + self.model = self.model.to(device=device) + + if device is None: + # `accelerate` device map + hf_device_map = getattr(self.model, "hf_device_map", None) + if hf_device_map is not None: + # Take the first device used by `accelerate`. 
+ device = next(iter(hf_device_map.values())) + else: + device = -1 + if is_torch_available() and self.framework == "pt": if isinstance(device, torch.device): self.device = device @@ -774,14 +787,10 @@ def __init__( else: self.device = torch.device(f"cuda:{device}") else: - self.device = device + self.device = device if device is not None else -1 self.torch_dtype = torch_dtype self.binary_output = binary_output - # Special handling - if self.framework == "pt" and self.device.type != "cpu": - self.model = self.model.to(self.device) - # Update config with task specific parameters task_specific_params = self.model.config.task_specific_params if task_specific_params is not None and task in task_specific_params: diff --git a/src/transformers/pipelines/question_answering.py b/src/transformers/pipelines/question_answering.py index d4bb7f210290..fad64d71ff71 100644 --- a/src/transformers/pipelines/question_answering.py +++ b/src/transformers/pipelines/question_answering.py @@ -255,7 +255,6 @@ def __init__( tokenizer: PreTrainedTokenizer, modelcard: Optional[ModelCard] = None, framework: Optional[str] = None, - device: int = -1, task: str = "", **kwargs, ): @@ -264,7 +263,6 @@ def __init__( tokenizer=tokenizer, modelcard=modelcard, framework=framework, - device=device, task=task, **kwargs, ) diff --git a/tests/pipelines/test_pipelines_text_generation.py b/tests/pipelines/test_pipelines_text_generation.py index 2e97810e7101..1f329926813f 100644 --- a/tests/pipelines/test_pipelines_text_generation.py +++ b/tests/pipelines/test_pipelines_text_generation.py @@ -312,3 +312,12 @@ def test_small_model_fp16(self): pipe = pipeline(model="hf-internal-testing/tiny-random-bloom", device=0, torch_dtype=torch.float16) pipe("This is a test") + + @require_torch + @require_accelerate + @require_torch_gpu + def test_pipeline_accelerate_top_p(self): + import torch + + pipe = pipeline(model="hf-internal-testing/tiny-random-bloom", device_map="auto", torch_dtype=torch.float16) + pipe("This is a 
test", do_sample=True, top_p=0.5)