Skip to content
32 changes: 31 additions & 1 deletion docs/source/en/pipeline_tutorial.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,8 @@ If the model is too large for a single GPU, you can set `device_map="auto"` to a
generator(model="openai/whisper-large", device_map="auto")
```

Note that if you pass `device_map="auto"`, you should not also pass `device=device` when instantiating your `pipeline`, as combining the two may lead to unexpected behavior!

### Batch size

By default, pipelines will not batch inference for reasons explained in detail [here](https://huggingface.co/docs/transformers/main_classes/pipelines#pipeline-batching). The reason is that batching is not necessarily faster, and can actually be considerably slower in some cases.
Expand Down Expand Up @@ -257,4 +259,32 @@ sudo apt install -y tesseract-ocr
pip install pytesseract
```

</Tip>
</Tip>

## Using `pipeline` on large models with 🤗 `accelerate`:

You can easily run `pipeline` on large models using 🤗 `accelerate`! First make sure you have installed `accelerate` with `pip install accelerate`.

Then load your model using `device_map="auto"`! We will use `facebook/opt-1.3b` for our example.

```py
# pip install accelerate
import torch
from transformers import pipeline

pipe = pipeline(model="facebook/opt-1.3b", torch_dtype=torch.bfloat16, device_map="auto")
output = pipe("This is a cool example!", do_sample=True, top_p=0.95)
```

You can also load the model in 8-bit if you install `bitsandbytes` and pass `load_in_8bit=True` through `model_kwargs`:

```py
# pip install accelerate bitsandbytes
import torch
from transformers import pipeline

pipe = pipeline(model="facebook/opt-1.3b", device_map="auto", model_kwargs={"load_in_8bit": True})
output = pipe("This is a cool example!", do_sample=True, top_p=0.95)
```

Note that you can replace the checkpoint with any Hugging Face model that supports large model loading, such as BLOOM!
5 changes: 5 additions & 0 deletions src/transformers/pipelines/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -741,6 +741,11 @@ def pipeline(
'You cannot use both `pipeline(... device_map=..., model_kwargs={"device_map":...})` as those'
" arguments might conflict, use only one.)"
)
if device is not None:
logger.warning(
"Both `device` and `device_map` are specified. `device` will override `device_map`. You"
" will most likely encounter unexpected behavior. Please remove `device` and keep `device_map`."
)
model_kwargs["device_map"] = device_map
if torch_dtype is not None:
if "torch_dtype" in model_kwargs:
Expand Down
6 changes: 3 additions & 3 deletions src/transformers/pipelines/automatic_speech_recognition.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,9 +287,9 @@ class AutomaticSpeechRecognitionPipeline(ChunkPipeline):
installed. If no framework is specified, will default to the one currently installed. If no framework is
specified and both frameworks are installed, will default to the framework of the `model`, or to PyTorch if
no model is provided.
device (`int`, *optional*, defaults to -1):
Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, a positive will run the model on
the associated CUDA device id.
device (Union[`int`, `torch.device`], *optional*):
Device ordinal for CPU/GPU supports. Setting this to `None` will leverage CPU, a positive will run the
model on the associated CUDA device id.
decoder (`pyctcdecode.BeamSearchDecoderCTC`, *optional*):
[PyCTCDecode's
BeamSearchDecoderCTC](https://github.com/kensho-technologies/pyctcdecode/blob/2fd33dc37c4111417e08d89ccd23d28e9b308d19/pyctcdecode/decoder.py#L180)
Expand Down
21 changes: 15 additions & 6 deletions src/transformers/pipelines/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -749,7 +749,7 @@ def __init__(
framework: Optional[str] = None,
task: str = "",
args_parser: ArgumentHandler = None,
device: Union[int, str, "torch.device"] = -1,
device: Union[int, str, "torch.device"] = None,
torch_dtype: Optional[Union[str, "torch.dtype"]] = None,
binary_output: bool = False,
**kwargs,
Expand All @@ -764,6 +764,19 @@ def __init__(
self.image_processor = image_processor
self.modelcard = modelcard
self.framework = framework

if self.framework == "pt" and device is not None:
self.model = self.model.to(device=device)

if device is None:
# `accelerate` device map
hf_device_map = getattr(self.model, "hf_device_map", None)
if hf_device_map is not None:
# Take the first device used by `accelerate`.
device = next(iter(hf_device_map.values()))
else:
device = -1

if is_torch_available() and self.framework == "pt":
if isinstance(device, torch.device):
self.device = device
Expand All @@ -774,14 +787,10 @@ def __init__(
else:
self.device = torch.device(f"cuda:{device}")
else:
self.device = device
self.device = device if device is not None else -1
self.torch_dtype = torch_dtype
self.binary_output = binary_output

# Special handling
if self.framework == "pt" and self.device.type != "cpu":
self.model = self.model.to(self.device)

# Update config with task specific parameters
task_specific_params = self.model.config.task_specific_params
if task_specific_params is not None and task in task_specific_params:
Expand Down
2 changes: 0 additions & 2 deletions src/transformers/pipelines/question_answering.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,6 @@ def __init__(
tokenizer: PreTrainedTokenizer,
modelcard: Optional[ModelCard] = None,
framework: Optional[str] = None,
device: int = -1,
task: str = "",
**kwargs,
):
Expand All @@ -264,7 +263,6 @@ def __init__(
tokenizer=tokenizer,
modelcard=modelcard,
framework=framework,
device=device,
task=task,
**kwargs,
)
Expand Down
9 changes: 9 additions & 0 deletions tests/pipelines/test_pipelines_text_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,3 +312,12 @@ def test_small_model_fp16(self):

pipe = pipeline(model="hf-internal-testing/tiny-random-bloom", device=0, torch_dtype=torch.float16)
pipe("This is a test")

@require_torch
@require_accelerate
@require_torch_gpu
def test_pipeline_accelerate_top_p(self):
    # Smoke test: build a text-generation pipeline for the tiny random BLOOM
    # checkpoint with `device_map="auto"` and fp16 weights, then run one
    # sampled generation with nucleus sampling (`top_p`). There are no
    # assertions; the test passes as long as neither call raises.
    import torch

    generator = pipeline(
        model="hf-internal-testing/tiny-random-bloom",
        torch_dtype=torch.float16,
        device_map="auto",
    )
    generator("This is a test", do_sample=True, top_p=0.5)