From 53052d8dcb71d6136c3c4f93447d8ff3c32696e8 Mon Sep 17 00:00:00 2001
From: younesbelkada <younesbelkada@gmail.com>
Date: Mon, 6 Feb 2023 19:29:05 +0000
Subject: [PATCH 01/15] v1 fix

---
 src/transformers/pipelines/base.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py
index 3905d28d26d2..c20bb144a5cc 100644
--- a/src/transformers/pipelines/base.py
+++ b/src/transformers/pipelines/base.py
@@ -780,7 +780,9 @@ def __init__(
 
         # Special handling
         if self.framework == "pt" and self.device.type != "cpu":
-            self.model = self.model.to(self.device)
+            # there is no need to call `.to` on a model that has been loaded with  `accelerate`
+            if not hasattr(self.model, "hf_device_map"):
+                self.model = self.model.to(self.device)
 
         # Update config with task specific parameters
         task_specific_params = self.model.config.task_specific_params

From bdcfb266b84866b8346df9587eb57d9f3f4be135 Mon Sep 17 00:00:00 2001
From: younesbelkada <younesbelkada@gmail.com>
Date: Wed, 8 Feb 2023 19:52:31 +0000
Subject: [PATCH 02/15] adapt from suggestions

---
 src/transformers/pipelines/base.py | 33 ++++++++++++++++++++++++------
 1 file changed, 27 insertions(+), 6 deletions(-)

diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py
index c20bb144a5cc..bc5470bac296 100644
--- a/src/transformers/pipelines/base.py
+++ b/src/transformers/pipelines/base.py
@@ -749,7 +749,7 @@ def __init__(
         framework: Optional[str] = None,
         task: str = "",
         args_parser: ArgumentHandler = None,
-        device: Union[int, str, "torch.device"] = -1,
+        device: Union[int, str, "torch.device"] = None,
         torch_dtype: Optional[Union[str, "torch.dtype"]] = None,
         binary_output: bool = False,
         **kwargs,
@@ -769,7 +769,7 @@ def __init__(
                 self.device = device
             elif isinstance(device, str):
                 self.device = torch.device(device)
-            elif device < 0:
+            elif device is None or device < 0:
                 self.device = torch.device("cpu")
             else:
                 self.device = torch.device(f"cuda:{device}")
@@ -779,10 +779,31 @@ def __init__(
         self.binary_output = binary_output
 
         # Special handling
-        if self.framework == "pt" and self.device.type != "cpu":
-            # there is no need to call `.to` on a model that has been loaded with  `accelerate`
-            if not hasattr(self.model, "hf_device_map"):
-                self.model = self.model.to(self.device)
+        if self.framework == "pt" and device is not None:
+            self.model = self.model.to(device=device)
+
+            hf_device_map = getattr(self.model, "hf_device_map", None)
+            if hf_device_map is not None:
+                logger.warning(
+                    "The model has been loaded with `accelerate` using `device_map=xxx` in `from_pretrained`"
+                    " method, you should not pass a device when initializing your pipeline."
+                )
+
+        if device is None and self.framework == "pt":
+            # `accelerate` device map
+            hf_device_map = getattr(self.model, "hf_device_map", None)
+            if hf_device_map is not None:
+                # Take the main device used by `accelerate`.
+                # adapted from: https://github.com/huggingface/transformers/pull/21479#issuecomment-1420833512
+                if set(hf_device_map.values()) == {"cpu"} or set(hf_device_map.values()) == {"cpu", "disk"}:
+                    accelerate_device = torch.device("cpu")
+                else:
+                    main_device = [d for d in hf_device_map.values() if d not in ["cpu", "disk"]][0]
+                    accelerate_device = torch.device(f"cuda:{main_device}")
+                
+                self.device = accelerate_device
+            else:
+                self.device = torch.device("cpu")
 
         # Update config with task specific parameters
         task_specific_params = self.model.config.task_specific_params

From dab8d808300d34d7572e5f9d605b5fbfc5de57af Mon Sep 17 00:00:00 2001
From: younesbelkada <younesbelkada@gmail.com>
Date: Wed, 8 Feb 2023 19:54:21 +0000
Subject: [PATCH 03/15] make style

---
 src/transformers/pipelines/base.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py
index bc5470bac296..551309fcc86a 100644
--- a/src/transformers/pipelines/base.py
+++ b/src/transformers/pipelines/base.py
@@ -800,7 +800,7 @@ def __init__(
                 else:
                     main_device = [d for d in hf_device_map.values() if d not in ["cpu", "disk"]][0]
                     accelerate_device = torch.device(f"cuda:{main_device}")
-                
+
                 self.device = accelerate_device
             else:
                 self.device = torch.device("cpu")
@@ -1071,8 +1071,10 @@ def __call__(self, inputs, *args, num_workers=None, batch_size=None, **kwargs):
         self.call_count += 1
         if self.call_count > 10 and self.framework == "pt" and self.device.type == "cuda":
             warnings.warn(
-                "You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a"
-                " dataset",
+                (
+                    "You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please"
+                    " use a dataset"
+                ),
                 UserWarning,
             )
 

From 420940a5b36c2cfc81cb7e6f521e5942c06e8723 Mon Sep 17 00:00:00 2001
From: younesbelkada <younesbelkada@gmail.com>
Date: Thu, 9 Feb 2023 07:57:40 +0000
Subject: [PATCH 04/15] fix tests

---
 src/transformers/pipelines/base.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py
index 551309fcc86a..85301f97e175 100644
--- a/src/transformers/pipelines/base.py
+++ b/src/transformers/pipelines/base.py
@@ -780,7 +780,7 @@ def __init__(
 
         # Special handling
         if self.framework == "pt" and device is not None:
-            self.model = self.model.to(device=device)
+            self.model = self.model.to(device=self.device)
 
             hf_device_map = getattr(self.model, "hf_device_map", None)
             if hf_device_map is not None:
@@ -1071,10 +1071,8 @@ def __call__(self, inputs, *args, num_workers=None, batch_size=None, **kwargs):
         self.call_count += 1
         if self.call_count > 10 and self.framework == "pt" and self.device.type == "cuda":
             warnings.warn(
-                (
-                    "You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please"
-                    " use a dataset"
-                ),
+                "You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please"
+                " use a dataset",
                 UserWarning,
             )
 

From 62cf7df817791bb67d0fd4e119384e1cd040f7e8 Mon Sep 17 00:00:00 2001
From: younesbelkada <younesbelkada@gmail.com>
Date: Thu, 9 Feb 2023 08:04:41 +0000
Subject: [PATCH 05/15] add gpu tests

---
 .../test_pipelines_text_generation.py         | 23 ++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/tests/pipelines/test_pipelines_text_generation.py b/tests/pipelines/test_pipelines_text_generation.py
index 2e97810e7101..763288bfaf98 100644
--- a/tests/pipelines/test_pipelines_text_generation.py
+++ b/tests/pipelines/test_pipelines_text_generation.py
@@ -14,7 +14,14 @@
 
 import unittest
 
-from transformers import MODEL_FOR_CAUSAL_LM_MAPPING, TF_MODEL_FOR_CAUSAL_LM_MAPPING, TextGenerationPipeline, pipeline
+from transformers import (
+    MODEL_FOR_CAUSAL_LM_MAPPING,
+    TF_MODEL_FOR_CAUSAL_LM_MAPPING,
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    TextGenerationPipeline,
+    pipeline,
+)
 from transformers.testing_utils import (
     require_accelerate,
     require_tf,
@@ -312,3 +319,17 @@ def test_small_model_fp16(self):
 
         pipe = pipeline(model="hf-internal-testing/tiny-random-bloom", device=0, torch_dtype=torch.float16)
         pipe("This is a test")
+
+    @require_torch
+    @require_accelerate
+    @require_torch_gpu
+    def test_pipeline_accelerate_top_p(self):
+        import torch
+
+        model_id = "hf-internal-testing/tiny-random-bloom"
+
+        model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.float16)
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
+        pipe("This is a test", do_sample=True, top_p=0.5)

From 9bbbaea4dfa7c90071850cfcb6ee02c97e1ca0a8 Mon Sep 17 00:00:00 2001
From: younesbelkada <younesbelkada@gmail.com>
Date: Thu, 9 Feb 2023 08:13:41 +0000
Subject: [PATCH 06/15] update docs

---
 docs/source/en/pipeline_tutorial.mdx | 37 +++++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/docs/source/en/pipeline_tutorial.mdx b/docs/source/en/pipeline_tutorial.mdx
index 8560d856f39e..f7fe9e7597bb 100644
--- a/docs/source/en/pipeline_tutorial.mdx
+++ b/docs/source/en/pipeline_tutorial.mdx
@@ -105,6 +105,8 @@ If the model is too large for a single GPU, you can set `device_map="auto"` to a
 generator(model="openai/whisper-large", device_map="auto")
 ```
 
+Note that if `device_map="auto"` is passed, you should not also pass `device=device` when instantiating your `pipeline`, as combining the two may lead to unexpected behavior!
+
 ### Batch size
 
 By default, pipelines will not batch inference for reasons explained in detail [here](https://huggingface.co/docs/transformers/main_classes/pipelines#pipeline-batching). The reason is that batching is not necessarily faster, and can actually be quite slower in some cases.
@@ -257,4 +259,37 @@ sudo apt install -y tesseract-ocr
 pip install pytesseract
 ```
 
-</Tip>
\ No newline at end of file
+</Tip>
+
+## Using `pipeline` on large models with 🤗 `accelerate`:
+
+You can easily run `pipeline` on large models using 🤗 `accelerate`! First make sure you have installed `accelerate` with `pip install accelerate`. 
+
+Let's assume you fullfill the hardware requirements to run a large model such as `bloom` (which has 176B parameters, so ~350GB in `bfloat16`). First load your model
+using `device_map="auto"`
+
+```py
+# pip install accelerate
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+
+model = AutoModelForCausalLM.from_pretrained("bigscience/bloom", torch_dtype=torch.bfloat16, device_map="auto")
+tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom")
+
+pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
+output = pipe("This is a cool example!", do_sample=True, top_p=0.95)
+```
+
+You can also pass 8-bit loaded models if you install `bitsandbytes` and add the argument `load_in_8bit=True`
+
+```py
+# pip install accelerate bitsandbytes
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+
+model = AutoModelForCausalLM.from_pretrained("bigscience/bloom", device_map="auto", load_in_8bit=True)
+tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom")
+
+pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
+output = pipe("This is a cool example!", do_sample=True, top_p=0.95)
+```
\ No newline at end of file

From 8d730f8eab4344db23ae01fa752a1c29a68804cf Mon Sep 17 00:00:00 2001
From: younesbelkada <younesbelkada@gmail.com>
Date: Thu, 9 Feb 2023 08:14:50 +0000
Subject: [PATCH 07/15] fix other tests

---
 src/transformers/pipelines/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py
index 85301f97e175..586bfccb147e 100644
--- a/src/transformers/pipelines/base.py
+++ b/src/transformers/pipelines/base.py
@@ -774,7 +774,7 @@ def __init__(
             else:
                 self.device = torch.device(f"cuda:{device}")
         else:
-            self.device = device
+            self.device = device if device is not None else -1
         self.torch_dtype = torch_dtype
         self.binary_output = binary_output
 

From 80e50c9251eacf166066d161a752366d55f6387a Mon Sep 17 00:00:00 2001
From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
Date: Thu, 9 Feb 2023 18:15:19 +0100
Subject: [PATCH 08/15] Apply suggestions from code review

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
---
 docs/source/en/pipeline_tutorial.mdx              | 14 ++++----------
 tests/pipelines/test_pipelines_text_generation.py |  7 +------
 2 files changed, 5 insertions(+), 16 deletions(-)

diff --git a/docs/source/en/pipeline_tutorial.mdx b/docs/source/en/pipeline_tutorial.mdx
index f7fe9e7597bb..0171e19e91c5 100644
--- a/docs/source/en/pipeline_tutorial.mdx
+++ b/docs/source/en/pipeline_tutorial.mdx
@@ -271,12 +271,9 @@ using `device_map="auto"`
 ```py
 # pip install accelerate
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+from transformers import pipeline
 
-model = AutoModelForCausalLM.from_pretrained("bigscience/bloom", torch_dtype=torch.bfloat16, device_map="auto")
-tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom")
-
-pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
+pipe = pipeline(model="bigscience/bloom", torch_dtype=torch.bfloat16, device_map="auto")
 output = pipe("This is a cool example!", do_sample=True, top_p=0.95)
 ```
 
@@ -285,11 +282,8 @@ You can also pass 8-bit loaded models if you install `bitsandbytes` and add the
 ```py
 # pip install accelerate bitsandbytes
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
-
-model = AutoModelForCausalLM.from_pretrained("bigscience/bloom", device_map="auto", load_in_8bit=True)
-tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom")
+from transformers import pipeline
 
-pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
+pipe = pipeline(model="bigscience/bloom", device_map="auto", model_kwargs={"load_in_8bit":True})
 output = pipe("This is a cool example!", do_sample=True, top_p=0.95)
 ```
\ No newline at end of file
diff --git a/tests/pipelines/test_pipelines_text_generation.py b/tests/pipelines/test_pipelines_text_generation.py
index 763288bfaf98..71451250a9ee 100644
--- a/tests/pipelines/test_pipelines_text_generation.py
+++ b/tests/pipelines/test_pipelines_text_generation.py
@@ -326,10 +326,5 @@ def test_small_model_fp16(self):
     def test_pipeline_accelerate_top_p(self):
         import torch
 
-        model_id = "hf-internal-testing/tiny-random-bloom"
-
-        model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.float16)
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
+        pipe = pipeline(model="hf-internal-testing/tiny-random-bloom", device_map="auto", torch_dtype=torch.float16)
         pipe("This is a test", do_sample=True, top_p=0.5)

From 8714b5effb98cc35f6b7161ee10a029adb3bc537 Mon Sep 17 00:00:00 2001
From: younesbelkada <younesbelkada@gmail.com>
Date: Thu, 9 Feb 2023 17:59:42 +0000
Subject: [PATCH 09/15] better fix

---
 src/transformers/pipelines/__init__.py |  5 +++
 src/transformers/pipelines/base.py     | 48 ++++++++++----------------
 2 files changed, 23 insertions(+), 30 deletions(-)

diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py
index e14d74457990..3d42d483b75d 100755
--- a/src/transformers/pipelines/__init__.py
+++ b/src/transformers/pipelines/__init__.py
@@ -741,6 +741,11 @@ def pipeline(
                 'You cannot use both `pipeline(... device_map=..., model_kwargs={"device_map":...})` as those'
                 " arguments might conflict, use only one.)"
             )
+        if device is not None:
+            logger.warning(
+                "Both `device` and `device_map` are specified. `device` will override `device_map`. You"
+                " will most likely encounter unexpected behavior. Please remove `device` and keep `device_map`."
+            )
         model_kwargs["device_map"] = device_map
     if torch_dtype is not None:
         if "torch_dtype" in model_kwargs:
diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py
index 586bfccb147e..d7037744dfdb 100644
--- a/src/transformers/pipelines/base.py
+++ b/src/transformers/pipelines/base.py
@@ -764,12 +764,25 @@ def __init__(
         self.image_processor = image_processor
         self.modelcard = modelcard
         self.framework = framework
+
+        if self.framework == "pt" and device is not None:
+            self.model = self.model.to(device=device)
+
+        if device is None:
+            # `accelerate` device map
+            hf_device_map = getattr(self.model, "hf_device_map", None)
+            if hf_device_map is not None:
+                # Take the first device used by `accelerate`.
+                device = next(iter(hf_device_map.values()))
+            else:
+                device = -1
+
         if is_torch_available() and self.framework == "pt":
             if isinstance(device, torch.device):
                 self.device = device
             elif isinstance(device, str):
                 self.device = torch.device(device)
-            elif device is None or device < 0:
+            elif device < 0:
                 self.device = torch.device("cpu")
             else:
                 self.device = torch.device(f"cuda:{device}")
@@ -778,33 +791,6 @@ def __init__(
         self.torch_dtype = torch_dtype
         self.binary_output = binary_output
 
-        # Special handling
-        if self.framework == "pt" and device is not None:
-            self.model = self.model.to(device=self.device)
-
-            hf_device_map = getattr(self.model, "hf_device_map", None)
-            if hf_device_map is not None:
-                logger.warning(
-                    "The model has been loaded with `accelerate` using `device_map=xxx` in `from_pretrained`"
-                    " method, you should not pass a device when initializing your pipeline."
-                )
-
-        if device is None and self.framework == "pt":
-            # `accelerate` device map
-            hf_device_map = getattr(self.model, "hf_device_map", None)
-            if hf_device_map is not None:
-                # Take the main device used by `accelerate`.
-                # adapted from: https://github.com/huggingface/transformers/pull/21479#issuecomment-1420833512
-                if set(hf_device_map.values()) == {"cpu"} or set(hf_device_map.values()) == {"cpu", "disk"}:
-                    accelerate_device = torch.device("cpu")
-                else:
-                    main_device = [d for d in hf_device_map.values() if d not in ["cpu", "disk"]][0]
-                    accelerate_device = torch.device(f"cuda:{main_device}")
-
-                self.device = accelerate_device
-            else:
-                self.device = torch.device("cpu")
-
         # Update config with task specific parameters
         task_specific_params = self.model.config.task_specific_params
         if task_specific_params is not None and task in task_specific_params:
@@ -1071,8 +1057,10 @@ def __call__(self, inputs, *args, num_workers=None, batch_size=None, **kwargs):
         self.call_count += 1
         if self.call_count > 10 and self.framework == "pt" and self.device.type == "cuda":
             warnings.warn(
-                "You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please"
-                " use a dataset",
+                (
+                    "You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please"
+                    " use a dataset"
+                ),
                 UserWarning,
             )
 

From e5b3dc0bf76280f642e22b63cf4df5b4721ce54f Mon Sep 17 00:00:00 2001
From: younesbelkada <younesbelkada@gmail.com>
Date: Thu, 9 Feb 2023 18:01:19 +0000
Subject: [PATCH 10/15] make fixup

---
 src/transformers/pipelines/base.py                | 6 ++----
 tests/pipelines/test_pipelines_text_generation.py | 9 +--------
 2 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py
index d7037744dfdb..ae3d88229c2c 100644
--- a/src/transformers/pipelines/base.py
+++ b/src/transformers/pipelines/base.py
@@ -1057,10 +1057,8 @@ def __call__(self, inputs, *args, num_workers=None, batch_size=None, **kwargs):
         self.call_count += 1
         if self.call_count > 10 and self.framework == "pt" and self.device.type == "cuda":
             warnings.warn(
-                (
-                    "You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please"
-                    " use a dataset"
-                ),
+                "You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please"
+                " use a dataset",
                 UserWarning,
             )
 
diff --git a/tests/pipelines/test_pipelines_text_generation.py b/tests/pipelines/test_pipelines_text_generation.py
index 71451250a9ee..1f329926813f 100644
--- a/tests/pipelines/test_pipelines_text_generation.py
+++ b/tests/pipelines/test_pipelines_text_generation.py
@@ -14,14 +14,7 @@
 
 import unittest
 
-from transformers import (
-    MODEL_FOR_CAUSAL_LM_MAPPING,
-    TF_MODEL_FOR_CAUSAL_LM_MAPPING,
-    AutoModelForCausalLM,
-    AutoTokenizer,
-    TextGenerationPipeline,
-    pipeline,
-)
+from transformers import MODEL_FOR_CAUSAL_LM_MAPPING, TF_MODEL_FOR_CAUSAL_LM_MAPPING, TextGenerationPipeline, pipeline
 from transformers.testing_utils import (
     require_accelerate,
     require_tf,

From adf3ca4d8d147286fa8890854b320e260a6a269a Mon Sep 17 00:00:00 2001
From: younesbelkada <younesbelkada@gmail.com>
Date: Thu, 9 Feb 2023 18:03:58 +0000
Subject: [PATCH 11/15] better example

---
 docs/source/en/pipeline_tutorial.mdx | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/docs/source/en/pipeline_tutorial.mdx b/docs/source/en/pipeline_tutorial.mdx
index 0171e19e91c5..00dceeb4f243 100644
--- a/docs/source/en/pipeline_tutorial.mdx
+++ b/docs/source/en/pipeline_tutorial.mdx
@@ -265,15 +265,14 @@ pip install pytesseract
 
 You can easily run `pipeline` on large models using 🤗 `accelerate`! First make sure you have installed `accelerate` with `pip install accelerate`. 
 
-Let's assume you fullfill the hardware requirements to run a large model such as `bloom` (which has 176B parameters, so ~350GB in `bfloat16`). First load your model
-using `device_map="auto"`
+First load your model using `device_map="auto"`! We will use `facebook/opt-1.3b` for our example.
 
 ```py
 # pip install accelerate
 import torch
 from transformers import pipeline
 
-pipe = pipeline(model="bigscience/bloom", torch_dtype=torch.bfloat16, device_map="auto")
+pipe = pipeline(model="facebook/opt-1.3b", torch_dtype=torch.bfloat16, device_map="auto")
 output = pipe("This is a cool example!", do_sample=True, top_p=0.95)
 ```
 
@@ -284,6 +283,8 @@ You can also pass 8-bit loaded models if you install `bitsandbytes` and add the
 import torch
 from transformers import pipeline
 
-pipe = pipeline(model="bigscience/bloom", device_map="auto", model_kwargs={"load_in_8bit":True})
+pipe = pipeline(model="facebook/opt-1.3b", device_map="auto", model_kwargs={"load_in_8bit": True})
 output = pipe("This is a cool example!", do_sample=True, top_p=0.95)
-```
\ No newline at end of file
+```
+
+Note that you can replace the checkpoint with any Hugging Face model that supports large model loading, such as BLOOM!
\ No newline at end of file

From 6bea43240e1961e6ad1d0391af6d69b574d994e0 Mon Sep 17 00:00:00 2001
From: younesbelkada <younesbelkada@gmail.com>
Date: Thu, 9 Feb 2023 18:05:54 +0000
Subject: [PATCH 12/15] revert changes

---
 src/transformers/pipelines/base.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py
index ae3d88229c2c..80248173487c 100644
--- a/src/transformers/pipelines/base.py
+++ b/src/transformers/pipelines/base.py
@@ -1057,8 +1057,8 @@ def __call__(self, inputs, *args, num_workers=None, batch_size=None, **kwargs):
         self.call_count += 1
         if self.call_count > 10 and self.framework == "pt" and self.device.type == "cuda":
             warnings.warn(
-                "You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please"
-                " use a dataset",
+                "You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a"
+                " dataset",
                 UserWarning,
             )
 

From e57d8f81feba281b2587681c0e5dc627c0f19623 Mon Sep 17 00:00:00 2001
From: younesbelkada <younesbelkada@gmail.com>
Date: Thu, 9 Feb 2023 18:37:29 +0000
Subject: [PATCH 13/15] proposal

---
 src/transformers/pipelines/base.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py
index 80248173487c..c8870b44f87f 100644
--- a/src/transformers/pipelines/base.py
+++ b/src/transformers/pipelines/base.py
@@ -766,6 +766,8 @@ def __init__(
         self.framework = framework
 
         if self.framework == "pt" and device is not None:
+            if isinstance(device, int) and device == -1:
+                device = "cpu"
             self.model = self.model.to(device=device)
 
         if device is None:

From 23f0608c2a12a3b62010375ab801f6aede50c6c7 Mon Sep 17 00:00:00 2001
From: younesbelkada <younesbelkada@gmail.com>
Date: Thu, 9 Feb 2023 18:56:05 +0000
Subject: [PATCH 14/15] more elegant solution

---
 src/transformers/pipelines/automatic_speech_recognition.py | 6 +++---
 src/transformers/pipelines/base.py                         | 2 --
 src/transformers/pipelines/question_answering.py           | 2 --
 3 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py
index 8c552cbdc307..ade6a52b5ff0 100644
--- a/src/transformers/pipelines/automatic_speech_recognition.py
+++ b/src/transformers/pipelines/automatic_speech_recognition.py
@@ -287,9 +287,9 @@ class AutomaticSpeechRecognitionPipeline(ChunkPipeline):
             installed. If no framework is specified, will default to the one currently installed. If no framework is
             specified and both frameworks are installed, will default to the framework of the `model`, or to PyTorch if
             no model is provided.
-        device (`int`, *optional*, defaults to -1):
-            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, a positive will run the model on
-            the associated CUDA device id.
+        device (Union[`int`, `torch.device`], *optional*, defaults to `None`):
+            Device ordinal for CPU/GPU supports. Setting this to `None` will leverage CPU, a positive will run the
+            model on the associated CUDA device id.
         decoder (`pyctcdecode.BeamSearchDecoderCTC`, *optional*):
             [PyCTCDecode's
             BeamSearchDecoderCTC](https://github.com/kensho-technologies/pyctcdecode/blob/2fd33dc37c4111417e08d89ccd23d28e9b308d19/pyctcdecode/decoder.py#L180)
diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py
index c8870b44f87f..80248173487c 100644
--- a/src/transformers/pipelines/base.py
+++ b/src/transformers/pipelines/base.py
@@ -766,8 +766,6 @@ def __init__(
         self.framework = framework
 
         if self.framework == "pt" and device is not None:
-            if isinstance(device, int) and device == -1:
-                device = "cpu"
             self.model = self.model.to(device=device)
 
         if device is None:
diff --git a/src/transformers/pipelines/question_answering.py b/src/transformers/pipelines/question_answering.py
index d4bb7f210290..fad64d71ff71 100644
--- a/src/transformers/pipelines/question_answering.py
+++ b/src/transformers/pipelines/question_answering.py
@@ -255,7 +255,6 @@ def __init__(
         tokenizer: PreTrainedTokenizer,
         modelcard: Optional[ModelCard] = None,
         framework: Optional[str] = None,
-        device: int = -1,
         task: str = "",
         **kwargs,
     ):
@@ -264,7 +263,6 @@ def __init__(
             tokenizer=tokenizer,
             modelcard=modelcard,
             framework=framework,
-            device=device,
             task=task,
             **kwargs,
         )

From e80be11d51f5b2deac6cf6e6f2476c3a8f930d3a Mon Sep 17 00:00:00 2001
From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
Date: Thu, 9 Feb 2023 20:02:47 +0100
Subject: [PATCH 15/15] Update
 src/transformers/pipelines/automatic_speech_recognition.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
---
 src/transformers/pipelines/automatic_speech_recognition.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py
index ade6a52b5ff0..5075fa6c56e6 100644
--- a/src/transformers/pipelines/automatic_speech_recognition.py
+++ b/src/transformers/pipelines/automatic_speech_recognition.py
@@ -287,7 +287,7 @@ class AutomaticSpeechRecognitionPipeline(ChunkPipeline):
             installed. If no framework is specified, will default to the one currently installed. If no framework is
             specified and both frameworks are installed, will default to the framework of the `model`, or to PyTorch if
             no model is provided.
-        device (Union[`int`, `torch.device`], *optional*, defaults to `None`):
+        device (Union[`int`, `torch.device`], *optional*):
             Device ordinal for CPU/GPU supports. Setting this to `None` will leverage CPU, a positive will run the
             model on the associated CUDA device id.
         decoder (`pyctcdecode.BeamSearchDecoderCTC`, *optional*):