huggingface · younesbelkada · Feb 10, 2023 · Feb 6, 2023 · Feb 8, 2023 · Feb 8, 2023
diff --git a/docs/source/en/pipeline_tutorial.mdx b/docs/source/en/pipeline_tutorial.mdx
@@ -105,6 +105,8 @@ If the model is too large for a single GPU, you can set `device_map="auto"` to a
 generator(model="openai/whisper-large", device_map="auto")
 ```
 
+Note that if `device_map="auto"` is passed, you should not also pass the argument `device=device` when instantiating your `pipeline`, as combining the two may lead to unexpected behavior!
+
 ### Batch size
 
 By default, pipelines will not batch inference for reasons explained in detail [here](https://huggingface.co/docs/transformers/main_classes/pipelines#pipeline-batching). The reason is that batching is not necessarily faster, and can actually be quite slower in some cases.
@@ -257,4 +259,37 @@ sudo apt install -y tesseract-ocr
 pip install pytesseract
 ```
 
-</Tip>
+</Tip>
+
+## Using `pipeline` on large models with 🤗 `accelerate`:
+
+You can easily run `pipeline` on large models using 🤗 `accelerate`! First make sure you have installed `accelerate` with `pip install accelerate`. 
+
+Let's assume you fulfill the hardware requirements to run a large model such as `bloom` (which has 176B parameters, so ~350GB in `bfloat16`). First load your model
+using `device_map="auto"`:
+
+```py
+# pip install accelerate
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+
+model = AutoModelForCausalLM.from_pretrained("bigscience/bloom", torch_dtype=torch.bfloat16, device_map="auto")
+tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom")
+
+pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
+output = pipe("This is a cool example!", do_sample=True, top_p=0.95)
+```
+
+You can also pass 8-bit loaded models if you install `bitsandbytes` and add the argument `load_in_8bit=True`:
+
+```py
+# pip install accelerate bitsandbytes
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+
+model = AutoModelForCausalLM.from_pretrained("bigscience/bloom", device_map="auto", load_in_8bit=True)
+tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom")
+
+pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
+output = pipe("This is a cool example!", do_sample=True, top_p=0.95)
+```
diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py
@@ -749,7 +749,7 @@ def __init__(
         framework: Optional[str] = None,
         task: str = "",
         args_parser: ArgumentHandler = None,
-        device: Union[int, str, "torch.device"] = -1,
+        device: Union[int, str, "torch.device"] = None,
         torch_dtype: Optional[Union[str, "torch.dtype"]] = None,
         binary_output: bool = False,
         **kwargs,
@@ -769,18 +769,41 @@ def __init__(
                 self.device = device
             elif isinstance(device, str):
                 self.device = torch.device(device)
-            elif device < 0:
+            elif device is None or device < 0:
                 self.device = torch.device("cpu")
             else:
                 self.device = torch.device(f"cuda:{device}")
         else:
-            self.device = device
+            self.device = device if device is not None else -1
         self.torch_dtype = torch_dtype
         self.binary_output = binary_output
 
         # Special handling
-        if self.framework == "pt" and self.device.type != "cpu":
-            self.model = self.model.to(self.device)
+        if self.framework == "pt" and device is not None:
+            self.model = self.model.to(device=self.device)
+
+            hf_device_map = getattr(self.model, "hf_device_map", None)
+            if hf_device_map is not None:
+                logger.warning(
+                    "The model has been loaded with `accelerate` using `device_map=xxx` in `from_pretrained`"
+                    " method, you should not pass a device when initializing your pipeline."
+                )
+
+        if device is None and self.framework == "pt":
+            # `accelerate` device map
+            hf_device_map = getattr(self.model, "hf_device_map", None)
+            if hf_device_map is not None:
+                # Take the main device used by `accelerate`.
+                # adapted from: https://github.com/huggingface/transformers/pull/21479#issuecomment-1420833512
+                if set(hf_device_map.values()) == {"cpu"} or set(hf_device_map.values()) == {"cpu", "disk"}:
+                    accelerate_device = torch.device("cpu")
+                else:
+                    main_device = [d for d in hf_device_map.values() if d not in ["cpu", "disk"]][0]
+                    accelerate_device = torch.device(f"cuda:{main_device}")
+
+                self.device = accelerate_device
+            else:
+                self.device = torch.device("cpu")
 
         # Update config with task specific parameters
         task_specific_params = self.model.config.task_specific_params
@@ -1048,8 +1071,8 @@ def __call__(self, inputs, *args, num_workers=None, batch_size=None, **kwargs):
         self.call_count += 1
         if self.call_count > 10 and self.framework == "pt" and self.device.type == "cuda":
             warnings.warn(
-                "You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a"
-                " dataset",
+                "You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please"
+                " use a dataset",
                 UserWarning,
             )
 

diff --git a/tests/pipelines/test_pipelines_text_generation.py b/tests/pipelines/test_pipelines_text_generation.py
@@ -14,7 +14,14 @@
 
 import unittest
 
-from transformers import MODEL_FOR_CAUSAL_LM_MAPPING, TF_MODEL_FOR_CAUSAL_LM_MAPPING, TextGenerationPipeline, pipeline
+from transformers import (
+    MODEL_FOR_CAUSAL_LM_MAPPING,
+    TF_MODEL_FOR_CAUSAL_LM_MAPPING,
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    TextGenerationPipeline,
+    pipeline,
+)
 from transformers.testing_utils import (
     require_accelerate,
     require_tf,
@@ -312,3 +319,17 @@ def test_small_model_fp16(self):
 
         pipe = pipeline(model="hf-internal-testing/tiny-random-bloom", device=0, torch_dtype=torch.float16)
         pipe("This is a test")
+
+    @require_torch
+    @require_accelerate
+    @require_torch_gpu
+    def test_pipeline_accelerate_top_p(self):
+        import torch
+
+        model_id = "hf-internal-testing/tiny-random-bloom"
+
+        model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.float16)
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
+        pipe("This is a test", do_sample=True, top_p=0.5)