diff --git a/docs/_src/api/api/document_classifier.md b/docs/_src/api/api/document_classifier.md
index 375c3baa2d..629bbb5684 100644
--- a/docs/_src/api/api/document_classifier.md
+++ b/docs/_src/api/api/document_classifier.md
@@ -84,7 +84,7 @@ With this document_classifier, you can directly get predictions via predict()
#### TransformersDocumentClassifier.\_\_init\_\_
```python
-def __init__(model_name_or_path: str = "bhadresh-savani/distilbert-base-uncased-emotion", model_version: Optional[str] = None, tokenizer: Optional[str] = None, use_gpu: bool = True, return_all_scores: bool = False, task: str = "text-classification", labels: Optional[List[str]] = None, batch_size: int = 16, classification_field: str = None, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None)
+def __init__(model_name_or_path: str = "bhadresh-savani/distilbert-base-uncased-emotion", model_version: Optional[str] = None, tokenizer: Optional[str] = None, use_gpu: bool = True, return_all_scores: bool = False, task: str = "text-classification", labels: Optional[List[str]] = None, batch_size: int = 16, classification_field: str = None, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None)
```
Load a text classification model from Transformers.
@@ -122,6 +122,10 @@ If this parameter is set to `True`, then the token generated when running
`transformers-cli login` (stored in ~/.huggingface) will be used.
Additional information can be found here
https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained
+- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+A list containing torch device objects and/or strings is supported (for example
+[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False`, the devices
+parameter is not used and a single CPU device is used for inference.
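The `devices` argument documented above is resolved through `initialize_device_settings`, so plain strings and `torch.device` objects can be mixed. A minimal usage sketch for this classifier (the concrete device names are illustrative):

```python
import torch
from haystack.nodes import TransformersDocumentClassifier

# Pin inference to one specific device; strings and torch.device objects can be mixed.
classifier = TransformersDocumentClassifier(
    model_name_or_path="bhadresh-savani/distilbert-base-uncased-emotion",
    devices=[torch.device("cuda:0")],  # e.g. ["mps"] on Apple Silicon
)

# With use_gpu=False, the devices argument is ignored and inference runs on a single CPU device.
cpu_classifier = TransformersDocumentClassifier(use_gpu=False)
```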
diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md
index 2fde44aef6..53aa056f78 100644
--- a/docs/_src/api/api/document_store.md
+++ b/docs/_src/api/api/document_store.md
@@ -1652,7 +1652,7 @@ In-memory document store
#### InMemoryDocumentStore.\_\_init\_\_
```python
-def __init__(index: str = "document", label_index: str = "label", embedding_field: Optional[str] = "embedding", embedding_dim: int = 768, return_embedding: bool = False, similarity: str = "dot_product", progress_bar: bool = True, duplicate_documents: str = "overwrite", use_gpu: bool = True, scoring_batch_size: int = 500000)
+def __init__(index: str = "document", label_index: str = "label", embedding_field: Optional[str] = "embedding", embedding_dim: int = 768, return_embedding: bool = False, similarity: str = "dot_product", progress_bar: bool = True, duplicate_documents: str = "overwrite", use_gpu: bool = True, scoring_batch_size: int = 500000, devices: Optional[List[Union[str, torch.device]]] = None)
```
**Arguments**:
@@ -1680,6 +1680,10 @@ Very large batch sizes can overrun GPU memory. In general you want to make sure
you have at least `embedding_dim`*`scoring_batch_size`*4 bytes available in GPU memory.
Since the data is originally stored in CPU memory there is little risk of overruning memory
when running on CPU.
+- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+A list containing torch device objects and/or strings is supported (for example
+[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False`, the devices
+parameter is not used and a single CPU device is used for inference.
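The same pattern applies to the in-memory document store, where `devices` selects the device used for GPU-accelerated embedding scoring. A short sketch (the device string is illustrative):

```python
from haystack.document_stores import InMemoryDocumentStore

# Score query/document embedding similarity on a specific GPU.
document_store = InMemoryDocumentStore(
    embedding_dim=768,
    similarity="dot_product",
    use_gpu=True,
    devices=["cuda:1"],  # ignored when use_gpu=False
)
```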
diff --git a/docs/_src/api/api/extractor.md b/docs/_src/api/api/extractor.md
index 67dc56b4ba..339fa8069c 100644
--- a/docs/_src/api/api/extractor.md
+++ b/docs/_src/api/api/extractor.md
@@ -29,6 +29,10 @@ If this parameter is set to `True`, then the token generated when running
`transformers-cli login` (stored in ~/.huggingface) will be used.
Additional information can be found here
https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained
+- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+A list containing torch device objects and/or strings is supported (for example
+[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False`, the devices
+parameter is not used and a single CPU device is used for inference.
diff --git a/docs/_src/api/api/generator.md b/docs/_src/api/api/generator.md
index 0ff3cf4f15..58fbbd2b9f 100644
--- a/docs/_src/api/api/generator.md
+++ b/docs/_src/api/api/generator.md
@@ -138,7 +138,7 @@ i.e. the model can easily adjust to domain documents even after training has fin
#### RAGenerator.\_\_init\_\_
```python
-def __init__(model_name_or_path: str = "facebook/rag-token-nq", model_version: Optional[str] = None, retriever: Optional[DensePassageRetriever] = None, generator_type: str = "token", top_k: int = 2, max_length: int = 200, min_length: int = 2, num_beams: int = 2, embed_title: bool = True, prefix: Optional[str] = None, use_gpu: bool = True, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None)
+def __init__(model_name_or_path: str = "facebook/rag-token-nq", model_version: Optional[str] = None, retriever: Optional[DensePassageRetriever] = None, generator_type: str = "token", top_k: int = 2, max_length: int = 200, min_length: int = 2, num_beams: int = 2, embed_title: bool = True, prefix: Optional[str] = None, use_gpu: bool = True, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None)
```
Load a RAG model from Transformers along with passage_embedding_model.
@@ -166,6 +166,10 @@ If this parameter is set to `True`, then the token generated when running
`transformers-cli login` (stored in ~/.huggingface) will be used.
Additional information can be found here
https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained
+- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+A list containing torch device objects and/or strings is supported (for example
+[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False`, the devices
+parameter is not used and a single CPU device is used for inference.
@@ -262,7 +266,7 @@ the [Hugging Face Model Hub](https://huggingface.co/models?pipeline_tag=text2tex
#### Seq2SeqGenerator.\_\_init\_\_
```python
-def __init__(model_name_or_path: str, input_converter: Optional[Callable] = None, top_k: int = 1, max_length: int = 200, min_length: int = 2, num_beams: int = 8, use_gpu: bool = True, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None)
+def __init__(model_name_or_path: str, input_converter: Optional[Callable] = None, top_k: int = 1, max_length: int = 200, min_length: int = 2, num_beams: int = 8, use_gpu: bool = True, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None)
```
**Arguments**:
@@ -284,6 +288,10 @@ If this parameter is set to `True`, then the token generated when running
`transformers-cli login` (stored in ~/.huggingface) will be used.
Additional information can be found here
https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained
+- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+A list containing torch device objects and/or strings is supported (for example
+[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False`, the devices
+parameter is not used and a single CPU device is used for inference.
diff --git a/docs/_src/api/api/pseudo_label_generator.md b/docs/_src/api/api/pseudo_label_generator.md
index d8fa9a4c19..757f14b5f7 100644
--- a/docs/_src/api/api/pseudo_label_generator.md
+++ b/docs/_src/api/api/pseudo_label_generator.md
@@ -53,7 +53,7 @@ For example:
#### PseudoLabelGenerator.\_\_init\_\_
```python
-def __init__(question_producer: Union[QuestionGenerator, List[Dict[str, str]]], retriever: BaseRetriever, cross_encoder_model_name_or_path: str = "cross-encoder/ms-marco-MiniLM-L-6-v2", max_questions_per_document: int = 3, top_k: int = 50, batch_size: int = 16, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None)
+def __init__(question_producer: Union[QuestionGenerator, List[Dict[str, str]]], retriever: BaseRetriever, cross_encoder_model_name_or_path: str = "cross-encoder/ms-marco-MiniLM-L-6-v2", max_questions_per_document: int = 3, top_k: int = 50, batch_size: int = 16, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, use_gpu: bool = True, devices: Optional[List[Union[str, torch.device]]] = None)
```
Loads the cross-encoder model and prepares PseudoLabelGenerator.
@@ -74,6 +74,10 @@ If this parameter is set to `True`, then the token generated when running
`transformers-cli login` (stored in ~/.huggingface) will be used.
Additional information can be found here
https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained
+- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit CrossEncoder inference to specific devices.
+A list containing torch device objects and/or strings is supported (for example
+[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False`, the devices
+parameter is not used and a single CPU device is used for inference.
diff --git a/docs/_src/api/api/query_classifier.md b/docs/_src/api/api/query_classifier.md
index 81b89c373e..45df8f2c2a 100644
--- a/docs/_src/api/api/query_classifier.md
+++ b/docs/_src/api/api/query_classifier.md
@@ -144,7 +144,7 @@ This node also supports zero-shot-classification.
#### TransformersQueryClassifier.\_\_init\_\_
```python
-def __init__(model_name_or_path: Union[Path, str] = "shahrukhx01/bert-mini-finetune-question-detection", model_version: Optional[str] = None, tokenizer: Optional[str] = None, use_gpu: bool = True, task: str = "text-classification", labels: List[str] = DEFAULT_LABELS, batch_size: int = 16, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None)
+def __init__(model_name_or_path: Union[Path, str] = "shahrukhx01/bert-mini-finetune-question-detection", model_version: Optional[str] = None, tokenizer: Optional[str] = None, use_gpu: bool = True, task: str = "text-classification", labels: List[str] = DEFAULT_LABELS, batch_size: int = 16, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None)
```
**Arguments**:
@@ -165,4 +165,8 @@ If this parameter is set to `True`, then the token generated when running
`transformers-cli login` (stored in ~/.huggingface) will be used.
Additional information can be found here
https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained
+- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+A list containing torch device objects and/or strings is supported (for example
+[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False`, the devices
+parameter is not used and a single CPU device is used for inference.
diff --git a/docs/_src/api/api/question_generator.md b/docs/_src/api/api/question_generator.md
index e8ab9f4a4d..c5bfc32363 100644
--- a/docs/_src/api/api/question_generator.md
+++ b/docs/_src/api/api/question_generator.md
@@ -23,7 +23,7 @@ come from earlier in the document.
#### QuestionGenerator.\_\_init\_\_
```python
-def __init__(model_name_or_path="valhalla/t5-base-e2e-qg", model_version=None, num_beams=4, max_length=256, no_repeat_ngram_size=3, length_penalty=1.5, early_stopping=True, split_length=50, split_overlap=10, use_gpu=True, prompt="generate questions:", num_queries_per_doc=1, sep_token: str = "", batch_size: int = 16, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None)
+def __init__(model_name_or_path="valhalla/t5-base-e2e-qg", model_version=None, num_beams=4, max_length=256, no_repeat_ngram_size=3, length_penalty=1.5, early_stopping=True, split_length=50, split_overlap=10, use_gpu=True, prompt="generate questions:", num_queries_per_doc=1, sep_token: str = "", batch_size: int = 16, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None)
```
Uses the valhalla/t5-base-e2e-qg model by default. This class supports any question generation model that is
@@ -45,6 +45,10 @@ If this parameter is set to `True`, then the token generated when running
`transformers-cli login` (stored in ~/.huggingface) will be used.
Additional information can be found here
https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained
+- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+A list containing torch device objects and/or strings is supported (for example
+[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False`, the devices
+parameter is not used and a single CPU device is used for inference.
diff --git a/docs/_src/api/api/ranker.md b/docs/_src/api/api/ranker.md
index 912e2fc7b3..41e1788b08 100644
--- a/docs/_src/api/api/ranker.md
+++ b/docs/_src/api/api/ranker.md
@@ -105,10 +105,6 @@ See https://huggingface.co/cross-encoder for full list of available models
- `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
- `top_k`: The maximum number of documents to return
- `use_gpu`: Whether to use all available GPUs or the CPU. Falls back on CPU if no GPU is available.
-- `devices`: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones
-The strings will be converted into pytorch devices, so use the string notation described here:
-https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device
-(e.g. ["cuda:0"]).
- `batch_size`: Number of documents to process at a time.
- `scale_score`: The raw predictions will be transformed using a Sigmoid activation function in case the model
only predicts a single label. For multi-label predictions, no scaling is applied. Set this
@@ -119,6 +115,10 @@ If this parameter is set to `True`, then the token generated when running
`transformers-cli login` (stored in ~/.huggingface) will be used.
Additional information can be found here
https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained
+- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+A list containing torch device objects and/or strings is supported (for example
+[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False`, the devices
+parameter is not used and a single CPU device is used for inference.
diff --git a/docs/_src/api/api/reader.md b/docs/_src/api/api/reader.md
index b163a9e45a..19cfc35189 100644
--- a/docs/_src/api/api/reader.md
+++ b/docs/_src/api/api/reader.md
@@ -45,7 +45,7 @@ While the underlying model can vary (BERT, Roberta, DistilBERT, ...), the interf
#### FARMReader.\_\_init\_\_
```python
-def __init__(model_name_or_path: str, model_version: Optional[str] = None, context_window_size: int = 150, batch_size: int = 50, use_gpu: bool = True, devices: List[torch.device] = [], no_ans_boost: float = 0.0, return_no_answer: bool = False, top_k: int = 10, top_k_per_candidate: int = 3, top_k_per_sample: int = 1, num_processes: Optional[int] = None, max_seq_len: int = 256, doc_stride: int = 128, progress_bar: bool = True, duplicate_filtering: int = 0, use_confidence_scores: bool = True, confidence_threshold: Optional[float] = None, proxies: Optional[Dict[str, str]] = None, local_files_only=False, force_download=False, use_auth_token: Optional[Union[str, bool]] = None)
+def __init__(model_name_or_path: str, model_version: Optional[str] = None, context_window_size: int = 150, batch_size: int = 50, use_gpu: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, no_ans_boost: float = 0.0, return_no_answer: bool = False, top_k: int = 10, top_k_per_candidate: int = 3, top_k_per_sample: int = 1, num_processes: Optional[int] = None, max_seq_len: int = 256, doc_stride: int = 128, progress_bar: bool = True, duplicate_filtering: int = 0, use_confidence_scores: bool = True, confidence_threshold: Optional[float] = None, proxies: Optional[Dict[str, str]] = None, local_files_only=False, force_download=False, use_auth_token: Optional[Union[str, bool]] = None)
```
**Arguments**:
@@ -60,8 +60,10 @@ displaying the context around the answer.
Memory consumption is much lower in inference mode. Recommendation: Increase the batch size
to a value so only a single batch is used.
- `use_gpu`: Whether to use GPUs or the CPU. Falls back on CPU if no GPU is available.
-- `devices`: List of GPU devices to limit inference to certain GPUs and not use all available ones (e.g. [torch.device('cuda:0')]).
-Unused if `use_gpu` is False.
+- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+A list containing torch device objects and/or strings is supported (for example
+[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False`, the devices
+parameter is not used and a single CPU device is used for inference.
- `no_ans_boost`: How much the no_answer logit is boosted/increased.
If set to 0 (default), the no_answer logit is not changed.
If a negative number, there is a lower chance of "no_answer" being predicted.
@@ -131,8 +133,10 @@ If any checkpoints are stored, a subsequent run of train() will resume training
- `dev_split`: Instead of specifying a dev_filename, you can also specify a ratio (e.g. 0.1) here
that gets split off from training data for eval.
- `use_gpu`: Whether to use GPU (if available)
-- `devices`: List of GPU devices to limit inference to certain GPUs and not use all available ones (e.g. [torch.device('cuda:0')]).
-Unused if `use_gpu` is False.
+- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+A list containing torch device objects and/or strings is supported (for example
+[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False`, the devices
+parameter is not used and a single CPU device is used for inference.
- `batch_size`: Number of samples the model receives in one batch for training
- `n_epochs`: Number of iterations on the whole training data set
- `learning_rate`: Learning rate of the optimizer
@@ -202,8 +206,10 @@ If any checkpoints are stored, a subsequent run of train() will resume training
- `dev_split`: Instead of specifying a dev_filename, you can also specify a ratio (e.g. 0.1) here
that gets split off from training data for eval.
- `use_gpu`: Whether to use GPU (if available)
-- `devices`: List of GPU devices to limit inference to certain GPUs and not use all available ones (e.g. [torch.device('cuda:0')]).
-Unused if `use_gpu` is False.
+- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+A list containing torch device objects and/or strings is supported (for example
+[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False`, the devices
+parameter is not used and a single CPU device is used for inference.
- `student_batch_size`: Number of samples the student model receives in one batch for training
- `student_batch_size`: Number of samples the teacher model receives in one batch for distillation
- `n_epochs`: Number of iterations on the whole training data set
@@ -278,8 +284,10 @@ If any checkpoints are stored, a subsequent run of train() will resume training
- `dev_split`: Instead of specifying a dev_filename, you can also specify a ratio (e.g. 0.1) here
that gets split off from training data for eval.
- `use_gpu`: Whether to use GPU (if available)
-- `devices`: List of GPU devices to limit inference to certain GPUs and not use all available ones (e.g. [torch.device('cuda:0')]).
-Unused if `use_gpu` is False.
+- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+A list containing torch device objects and/or strings is supported (for example
+[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False`, the devices
+parameter is not used and a single CPU device is used for inference.
- `student_batch_size`: Number of samples the student model receives in one batch for training
- `student_batch_size`: Number of samples the teacher model receives in one batch for distillation
- `n_epochs`: Number of iterations on the whole training data set
@@ -589,7 +597,7 @@ With this reader, you can directly get predictions via predict()
#### TransformersReader.\_\_init\_\_
```python
-def __init__(model_name_or_path: str = "distilbert-base-uncased-distilled-squad", model_version: Optional[str] = None, tokenizer: Optional[str] = None, context_window_size: int = 70, use_gpu: bool = True, top_k: int = 10, top_k_per_candidate: int = 3, return_no_answers: bool = False, max_seq_len: int = 256, doc_stride: int = 128, batch_size: int = 16, use_auth_token: Optional[Union[str, bool]] = None)
+def __init__(model_name_or_path: str = "distilbert-base-uncased-distilled-squad", model_version: Optional[str] = None, tokenizer: Optional[str] = None, context_window_size: int = 70, use_gpu: bool = True, top_k: int = 10, top_k_per_candidate: int = 3, return_no_answers: bool = False, max_seq_len: int = 256, doc_stride: int = 128, batch_size: int = 16, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None)
```
Load a QA model from Transformers.
@@ -628,6 +636,10 @@ If this parameter is set to `True`, then the token generated when running
`transformers-cli login` (stored in ~/.huggingface) will be used.
Additional information can be found here
https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained
+- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+A list containing torch device objects and/or strings is supported (for example
+[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False`, the devices
+parameter is not used and a single CPU device is used for inference.
@@ -739,7 +751,7 @@ answer = prediction["answers"][0].answer # "10 june 1996"
#### TableReader.\_\_init\_\_
```python
-def __init__(model_name_or_path: str = "google/tapas-base-finetuned-wtq", model_version: Optional[str] = None, tokenizer: Optional[str] = None, use_gpu: bool = True, top_k: int = 10, top_k_per_candidate: int = 3, return_no_answer: bool = False, max_seq_len: int = 256, use_auth_token: Optional[Union[str, bool]] = None)
+def __init__(model_name_or_path: str = "google/tapas-base-finetuned-wtq", model_version: Optional[str] = None, tokenizer: Optional[str] = None, use_gpu: bool = True, top_k: int = 10, top_k_per_candidate: int = 3, return_no_answer: bool = False, max_seq_len: int = 256, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None)
```
Load a TableQA model from Transformers.
@@ -780,6 +792,10 @@ If this parameter is set to `True`, then the token generated when running
`transformers-cli login` (stored in ~/.huggingface) will be used.
Additional information can be found here
https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained
+- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+A list containing torch device objects and/or strings is supported (for example
+[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False`, the devices
+parameter is not used and a single CPU device is used for inference.
diff --git a/docs/_src/api/api/retriever.md b/docs/_src/api/api/retriever.md
index 85334930b9..318e35940d 100644
--- a/docs/_src/api/api/retriever.md
+++ b/docs/_src/api/api/retriever.md
@@ -567,10 +567,11 @@ Options: `dot_product` (Default) or `cosine`
Increase if errors like "encoded data exceeds max_size ..." come up
- `progress_bar`: Whether to show a tqdm progress bar or not.
Can be helpful to disable in production deployments to keep the logs clean.
-- `devices`: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones
-These strings will be converted into pytorch devices, so use the string notation described here:
-https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device
-(e.g. ["cuda:0"]). Note: as multi-GPU training is currently not implemented for DPR, training
+- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+A list containing torch device objects and/or strings is supported (for example
+[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False`, the devices
+parameter is not used and a single CPU device is used for inference.
+Note: as multi-GPU training is currently not implemented for DPR, training
will only use the first device provided in this list.
- `use_auth_token`: The API token used to download private models from Huggingface.
If this parameter is set to `True`, then the token generated when running
@@ -934,10 +935,11 @@ Options: `dot_product` (Default) or `cosine`
Increase if errors like "encoded data exceeds max_size ..." come up
- `progress_bar`: Whether to show a tqdm progress bar or not.
Can be helpful to disable in production deployments to keep the logs clean.
-- `devices`: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones
-These strings will be converted into pytorch devices, so use the string notation described here:
-https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device
-(e.g. ["cuda:0"]). Note: as multi-GPU training is currently not implemented for TableTextRetriever,
+- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+A list containing torch device objects and/or strings is supported (for example
+[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False`, the devices
+parameter is not used and a single CPU device is used for inference.
+Note: as multi-GPU training is currently not implemented for TableTextRetriever,
training will only use the first device provided in this list.
- `use_auth_token`: The API token used to download private models from Huggingface.
If this parameter is set to `True`, then the token generated when running
@@ -1212,10 +1214,11 @@ Options:
Default: -1 (very last layer).
- `top_k`: How many documents to return per query.
- `progress_bar`: If true displays progress bar during embedding.
-- `devices`: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones
-These strings will be converted into pytorch devices, so use the string notation described here:
-https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device
-(e.g. ["cuda:0"]). Note: As multi-GPU training is currently not implemented for EmbeddingRetriever,
+- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+A list containing torch device objects and/or strings is supported (for example
+[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False`, the devices
+parameter is not used and a single CPU device is used for inference.
+Note: As multi-GPU training is currently not implemented for EmbeddingRetriever,
training will only use the first device provided in this list.
- `use_auth_token`: The API token used to download private models from Huggingface.
If this parameter is set to `True`, then the token generated when running
@@ -1535,10 +1538,11 @@ Options:
Default: -1 (very last layer).
- `top_k`: How many documents to return per query.
- `progress_bar`: If true displays progress bar during embedding.
-- `devices`: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones
-These strings will be converted into pytorch devices, so use the string notation described here:
-https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device
-(e.g. ["cuda:0"]). Note: As multi-GPU training is currently not implemented for EmbeddingRetriever,
+- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+A list containing torch device objects and/or strings is supported (for example
+[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False`, the devices
+parameter is not used and a single CPU device is used for inference.
+Note: As multi-GPU training is currently not implemented for EmbeddingRetriever,
training will only use the first device provided in this list.
- `use_auth_token`: The API token used to download private models from Huggingface.
If this parameter is set to `True`, then the token generated when running
diff --git a/docs/_src/api/api/summarizer.md b/docs/_src/api/api/summarizer.md
index f5a5a23c5e..d76878f788 100644
--- a/docs/_src/api/api/summarizer.md
+++ b/docs/_src/api/api/summarizer.md
@@ -87,7 +87,7 @@ See the up-to-date list of available models on
#### TransformersSummarizer.\_\_init\_\_
```python
-def __init__(model_name_or_path: str = "google/pegasus-xsum", model_version: Optional[str] = None, tokenizer: Optional[str] = None, max_length: int = 200, min_length: int = 5, use_gpu: bool = True, clean_up_tokenization_spaces: bool = True, separator_for_single_summary: str = " ", generate_single_summary: bool = False, batch_size: int = 16, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None)
+def __init__(model_name_or_path: str = "google/pegasus-xsum", model_version: Optional[str] = None, tokenizer: Optional[str] = None, max_length: int = 200, min_length: int = 5, use_gpu: bool = True, clean_up_tokenization_spaces: bool = True, separator_for_single_summary: str = " ", generate_single_summary: bool = False, batch_size: int = 16, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None)
```
Load a Summarization model from Transformers.
@@ -119,6 +119,10 @@ If this parameter is set to `True`, then the token generated when running
`transformers-cli login` (stored in ~/.huggingface) will be used.
Additional information can be found here
https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained
+- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+A list containing torch device objects and/or strings is supported (for example
+[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False`, the devices
+parameter is not used and a single CPU device is used for inference.
diff --git a/docs/_src/api/api/translator.md b/docs/_src/api/api/translator.md
index f93d961e2b..8f2ddc66a7 100644
--- a/docs/_src/api/api/translator.md
+++ b/docs/_src/api/api/translator.md
@@ -68,7 +68,7 @@ We currently recommend using OPUS models (see __init__() for details)
#### TransformersTranslator.\_\_init\_\_
```python
-def __init__(model_name_or_path: str, tokenizer_name: Optional[str] = None, max_seq_len: Optional[int] = None, clean_up_tokenization_spaces: Optional[bool] = True, use_gpu: bool = True, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None)
+def __init__(model_name_or_path: str, tokenizer_name: Optional[str] = None, max_seq_len: Optional[int] = None, clean_up_tokenization_spaces: Optional[bool] = True, use_gpu: bool = True, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None)
```
Initialize the translator with a model that fits your targeted languages. While we support all seq2seq
@@ -99,6 +99,10 @@ If this parameter is set to `True`, then the token generated when running
`transformers-cli login` (stored in ~/.huggingface) will be used.
Additional information can be found here
https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained
+- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+A list containing torch device objects and/or strings is supported (for example
+[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False`, the devices
+parameter is not used and a single CPU device is used for inference.
diff --git a/haystack/document_stores/memory.py b/haystack/document_stores/memory.py
index 760df00ccc..e18b2d88e7 100644
--- a/haystack/document_stores/memory.py
+++ b/haystack/document_stores/memory.py
@@ -39,6 +39,7 @@ def __init__(
duplicate_documents: str = "overwrite",
use_gpu: bool = True,
scoring_batch_size: int = 500000,
+ devices: Optional[List[Union[str, torch.device]]] = None,
):
"""
:param index: The documents are scoped to an index attribute that can be used when writing, querying,
@@ -64,6 +65,10 @@ def __init__(
you have at least `embedding_dim`*`scoring_batch_size`*4 bytes available in GPU memory.
Since the data is originally stored in CPU memory there is little risk of overruning memory
when running on CPU.
+        :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+                        A list containing torch device objects and/or strings is supported (for example
+                        [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False`, the devices
+                        parameter is not used and a single CPU device is used for inference.
"""
super().__init__()
@@ -79,7 +84,13 @@ def __init__(
self.use_gpu = use_gpu
self.scoring_batch_size = scoring_batch_size
- self.devices, _ = initialize_device_settings(use_cuda=self.use_gpu)
+ self.devices, _ = initialize_device_settings(devices=devices, use_cuda=self.use_gpu, multi_gpu=False)
+ if len(self.devices) > 1:
+ logger.warning(
+ f"Multiple devices are not supported in {self.__class__.__name__} inference, "
+ f"using the first device {self.devices[0]}."
+ )
+
self.main_device = self.devices[0]
def write_documents(
diff --git a/haystack/json-schemas/haystack-pipeline-main.schema.json b/haystack/json-schemas/haystack-pipeline-main.schema.json
index dd2e76d6bc..a6d96d5f75 100644
--- a/haystack/json-schemas/haystack-pipeline-main.schema.json
+++ b/haystack/json-schemas/haystack-pipeline-main.schema.json
@@ -960,6 +960,27 @@
"title": "Scoring Batch Size",
"default": 500000,
"type": "integer"
+ },
+ "devices": {
+ "title": "Devices",
+ "anyOf": [
+ {
+ "type": "array",
+ "items": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ }
+ },
+ {
+ "type": "null"
+ }
+ ]
}
},
"additionalProperties": false,
@@ -2056,6 +2077,27 @@
"title": "Progress Bar",
"default": true,
"type": "boolean"
+ },
+ "devices": {
+ "title": "Devices",
+ "anyOf": [
+ {
+ "type": "array",
+ "items": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ }
+ },
+ {
+ "type": "null"
+ }
+ ]
}
},
"additionalProperties": false,
@@ -3000,6 +3042,27 @@
"type": "null"
}
]
+ },
+ "devices": {
+ "title": "Devices",
+ "anyOf": [
+ {
+ "type": "array",
+ "items": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ }
+ },
+ {
+ "type": "null"
+ }
+ ]
}
},
"additionalProperties": false,
@@ -3154,11 +3217,24 @@
},
"devices": {
"title": "Devices",
- "default": [],
- "type": "array",
- "items": {
- "type": "string"
- }
+ "anyOf": [
+ {
+ "type": "array",
+ "items": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ }
+ },
+ {
+ "type": "null"
+ }
+ ]
},
"no_ans_boost": {
"title": "No Ans Boost",
@@ -4340,6 +4416,32 @@
"type": "null"
}
]
+ },
+ "use_gpu": {
+ "title": "Use Gpu",
+ "default": true,
+ "type": "boolean"
+ },
+ "devices": {
+ "title": "Devices",
+ "anyOf": [
+ {
+ "type": "array",
+ "items": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ }
+ },
+ {
+ "type": "null"
+ }
+ ]
}
},
"required": [
@@ -4449,6 +4551,27 @@
"type": "null"
}
]
+ },
+ "devices": {
+ "title": "Devices",
+ "anyOf": [
+ {
+ "type": "array",
+ "items": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ }
+ },
+ {
+ "type": "null"
+ }
+ ]
}
},
"additionalProperties": false,
@@ -4571,6 +4694,27 @@
"type": "null"
}
]
+ },
+ "devices": {
+ "title": "Devices",
+ "anyOf": [
+ {
+ "type": "array",
+ "items": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ }
+ },
+ {
+ "type": "null"
+ }
+ ]
}
},
"additionalProperties": false,
@@ -4935,6 +5079,27 @@
"type": "null"
}
]
+ },
+ "devices": {
+ "title": "Devices",
+ "anyOf": [
+ {
+ "type": "array",
+ "items": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ }
+ },
+ {
+ "type": "null"
+ }
+ ]
}
},
"required": [
@@ -5098,6 +5263,27 @@
"type": "null"
}
]
+ },
+ "devices": {
+ "title": "Devices",
+ "anyOf": [
+ {
+ "type": "array",
+ "items": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ }
+ },
+ {
+ "type": "null"
+ }
+ ]
}
},
"additionalProperties": false,
@@ -5641,6 +5827,27 @@
"type": "null"
}
]
+ },
+ "devices": {
+ "title": "Devices",
+ "anyOf": [
+ {
+ "type": "array",
+ "items": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ }
+ },
+ {
+ "type": "null"
+ }
+ ]
}
},
"additionalProperties": false,
@@ -5750,6 +5957,27 @@
"type": "null"
}
]
+ },
+ "devices": {
+ "title": "Devices",
+ "anyOf": [
+ {
+ "type": "array",
+ "items": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ }
+ },
+ {
+ "type": "null"
+ }
+ ]
}
},
"additionalProperties": false,
@@ -5860,6 +6088,27 @@
"type": "null"
}
]
+ },
+ "devices": {
+ "title": "Devices",
+ "anyOf": [
+ {
+ "type": "array",
+ "items": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ }
+ },
+ {
+ "type": "null"
+ }
+ ]
}
},
"additionalProperties": false,
@@ -5970,6 +6219,27 @@
"type": "null"
}
]
+ },
+ "devices": {
+ "title": "Devices",
+ "anyOf": [
+ {
+ "type": "array",
+ "items": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ }
+ },
+ {
+ "type": "null"
+ }
+ ]
}
},
"additionalProperties": false,
@@ -6061,6 +6331,27 @@
"type": "null"
}
]
+ },
+ "devices": {
+ "title": "Devices",
+ "anyOf": [
+ {
+ "type": "array",
+ "items": {
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "string"
+ }
+ ]
+ }
+ },
+ {
+ "type": "null"
+ }
+ ]
}
},
"required": [
diff --git a/haystack/modeling/infer.py b/haystack/modeling/infer.py
index 44c766081d..4542b49462 100644
--- a/haystack/modeling/infer.py
+++ b/haystack/modeling/infer.py
@@ -46,6 +46,7 @@ def __init__(
extraction_layer: Optional[int] = None,
num_processes: Optional[int] = None,
disable_tqdm: bool = False,
+ devices: Optional[List[Union[str, torch.device]]] = None,
):
"""
Initializes Inferencer from an AdaptiveModel and a Processor instance.
@@ -70,11 +71,20 @@ def __init__(
:func:`~farm.infer.Inferencer.close_multiprocessing_pool` after you are
done using this class. The garbage collector will not do this for you!
:param disable_tqdm: Whether to disable tqdm logging (can get very verbose in multiprocessing)
+        :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+                        A list containing torch device objects and/or strings is supported (for example
+                        [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False`, the devices
+                        parameter is not used and a single CPU device is used for inference.
:return: An instance of the Inferencer.
"""
# Init device and distributed settings
- self.devices, n_gpu = initialize_device_settings(use_cuda=gpu, multi_gpu=False)
+ self.devices, n_gpu = initialize_device_settings(devices=devices, use_cuda=gpu, multi_gpu=False)
+ if len(self.devices) > 1:
+ logger.warning(
+ f"Multiple devices are not supported in {self.__class__.__name__} inference, "
+ f"using the first device {self.devices[0]}."
+ )
self.processor = processor
self.model = model
@@ -125,8 +135,8 @@ def load(
use_fast: bool = True,
tokenizer_args: Dict = None,
multithreading_rust: bool = True,
- devices: Optional[List[torch.device]] = None,
use_auth_token: Optional[Union[bool, str]] = None,
+ devices: Optional[List[Union[str, torch.device]]] = None,
**kwargs,
):
"""
@@ -177,8 +187,11 @@ def load(
if tokenizer_args is None:
tokenizer_args = {}
- if devices is None:
- devices, n_gpu = initialize_device_settings(use_cuda=gpu, multi_gpu=False)
+ devices, n_gpu = initialize_device_settings(devices=devices, use_cuda=gpu, multi_gpu=False)
+ if len(devices) > 1:
+ logger.warning(
+                f"Multiple devices are not supported in Inferencer, using the first device {devices[0]}."
+ )
name = os.path.basename(model_name_or_path)
@@ -243,6 +256,7 @@ def load(
extraction_layer=extraction_layer,
num_processes=num_processes,
disable_tqdm=disable_tqdm,
+ devices=devices,
)
def _set_multiprocessing_pool(self, num_processes: Optional[int]) -> None:
diff --git a/haystack/modeling/utils.py b/haystack/modeling/utils.py
index 998ab573cf..35d17d08a4 100644
--- a/haystack/modeling/utils.py
+++ b/haystack/modeling/utils.py
@@ -1,4 +1,4 @@
-from typing import Tuple, List, Optional
+from typing import Tuple, List, Optional, Union
import logging
import os
@@ -52,7 +52,7 @@ def initialize_device_settings(
use_cuda: Optional[bool] = None,
local_rank: int = -1,
multi_gpu: bool = True,
- devices: Optional[List[torch.device]] = None,
+    devices: Optional[List[Union[str, torch.device]]] = None,
) -> Tuple[List[torch.device], int]:
"""
Returns a list of available devices.
@@ -62,14 +62,23 @@ def initialize_device_settings(
Unused if `devices` is set or `use_cuda` is False.
:param multi_gpu: Whether to make use of all GPUs (if available).
Unused if `devices` is set or `use_cuda` is False.
- :param devices: an explicit list of which GPUs to use. Unused if `use_cuda` is False.
+    :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+                    A list containing torch device objects and/or strings is supported (for example
+                    [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False`, the devices
+                    parameter is not used and a single CPU device is used for inference.
"""
if use_cuda is False: # Note that it could be None, in which case we also want to just skip this step.
devices_to_use = [torch.device("cpu")]
n_gpu = 0
elif devices:
- devices_to_use = devices
- n_gpu = sum(1 for device in devices if "cpu" not in device.type)
+ if not isinstance(devices, list):
+ raise ValueError(f"devices must be a list, but got {devices} of type {type(devices)}")
+ if any(isinstance(device, str) for device in devices):
+ torch_devices: List[torch.device] = [torch.device(device) for device in devices]
+ devices_to_use = torch_devices
+ else:
+ devices_to_use = devices
+ n_gpu = sum(1 for device in devices_to_use if "cpu" not in device.type)
elif local_rank == -1:
if torch.cuda.is_available():
if multi_gpu:
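The reworked `elif devices:` branch above normalizes any strings in the list with `torch.device` and counts the non-CPU entries. A sketch of the intended behaviour (the concrete device names are illustrative):

```python
import torch
from haystack.modeling.utils import initialize_device_settings

# Mixed lists of strings and torch.device objects are accepted and normalized.
devices, n_gpu = initialize_device_settings(
    devices=[torch.device("cuda:0"), "cuda:1", "cpu"], use_cuda=True, multi_gpu=False
)
# devices -> [device('cuda:0'), device('cuda:1'), device('cpu')], n_gpu -> 2

# With use_cuda=False, the devices argument is skipped and a single CPU device is returned.
cpu_only, n_gpu = initialize_device_settings(devices=["cuda:0"], use_cuda=False)
# cpu_only -> [device('cpu')], n_gpu -> 0
```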
diff --git a/haystack/nodes/answer_generator/transformers.py b/haystack/nodes/answer_generator/transformers.py
index 5387c058ba..54293ce789 100644
--- a/haystack/nodes/answer_generator/transformers.py
+++ b/haystack/nodes/answer_generator/transformers.py
@@ -80,6 +80,7 @@ def __init__(
use_gpu: bool = True,
progress_bar: bool = True,
use_auth_token: Optional[Union[str, bool]] = None,
+ devices: Optional[List[Union[str, torch.device]]] = None,
):
"""
Load a RAG model from Transformers along with passage_embedding_model.
@@ -104,6 +105,11 @@ def __init__(
`transformers-cli login` (stored in ~/.huggingface) will be used.
Additional information can be found here
https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained
+
+        :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+                        A list containing torch device objects and/or strings is supported (for example
+                        [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False`, the devices
+                        parameter is not used and a single CPU device is used for inference.
"""
super().__init__(progress_bar=progress_bar)
@@ -122,7 +128,12 @@ def __init__(
self.top_k = top_k
- self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False)
+ self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
+ if len(self.devices) > 1:
+ logger.warning(
+ f"Multiple devices are not supported in {self.__class__.__name__} inference, "
+ f"using the first device {self.devices[0]}."
+ )
self.tokenizer = RagTokenizer.from_pretrained(model_name_or_path, use_auth_token=use_auth_token)
@@ -338,6 +349,7 @@ def __init__(
use_gpu: bool = True,
progress_bar: bool = True,
use_auth_token: Optional[Union[str, bool]] = None,
+ devices: Optional[List[Union[str, torch.device]]] = None,
):
"""
:param model_name_or_path: a HF model name for auto-regressive language model like GPT2, XLNet, XLM, Bart, T5 etc
@@ -357,6 +369,10 @@ def __init__(
`transformers-cli login` (stored in ~/.huggingface) will be used.
Additional information can be found here
https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained
+        :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+                        A list containing torch device objects and/or strings is supported (for example
+                        [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False`, the devices
+                        parameter is not used and a single CPU device is used for inference.
"""
super().__init__(progress_bar=progress_bar)
self.model_name_or_path = model_name_or_path
@@ -370,7 +386,12 @@ def __init__(
self.top_k = top_k
- self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False)
+ self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
+ if len(self.devices) > 1:
+ logger.warning(
+ f"Multiple devices are not supported in {self.__class__.__name__} inference, "
+ f"using the first device {self.devices[0]}."
+ )
Seq2SeqGenerator._register_converters(model_name_or_path, input_converter)
diff --git a/haystack/nodes/audio/_text_to_speech.py b/haystack/nodes/audio/_text_to_speech.py
index d884f4940c..4e8d721136 100644
--- a/haystack/nodes/audio/_text_to_speech.py
+++ b/haystack/nodes/audio/_text_to_speech.py
@@ -1,4 +1,4 @@
-from typing import Union, Callable, Any, Optional, Dict
+from typing import Union, Callable, Any, Optional, Dict, List
import os
import logging
@@ -6,6 +6,7 @@
from pathlib import Path
import numpy as np
+import torch
try:
import soundfile as sf
@@ -20,6 +21,8 @@
from haystack.errors import AudioNodeError
from haystack.modeling.utils import initialize_device_settings
+logger = logging.getLogger(__name__)
+
class TextToSpeech:
"""
@@ -33,17 +36,28 @@ def __init__(
model_name_or_path: Union[str, Path],
use_gpu: bool = True,
transformers_params: Optional[Dict[str, Any]] = None,
+ devices: Optional[List[Union[str, torch.device]]] = None,
):
"""
:param model_name_or_path: The text to speech model, for example `espnet/kan-bayashi_ljspeech_vits`.
:param use_gpu: Whether to use GPU (if available). Defaults to True.
:param transformers_params: Parameters to pass over to the `Text2Speech.from_pretrained()` call.
+        :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+                        A list containing torch device objects and/or strings is supported (for example
+                        [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False`, the devices
+                        parameter is not used and a single CPU device is used for inference.
"""
super().__init__()
- devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False)
+ resolved_devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
+ if len(resolved_devices) > 1:
+ logger.warning(
+ f"Multiple devices are not supported in {self.__class__.__name__} inference, "
+ f"using the first device {resolved_devices[0]}."
+ )
+
self.model = _Text2SpeechModel.from_pretrained(
- model_name_or_path, device=devices[0].type, **(transformers_params or {})
+ model_name_or_path, device=resolved_devices[0].type, **(transformers_params or {})
)
def text_to_audio_file(
diff --git a/haystack/nodes/audio/answer_to_speech.py b/haystack/nodes/audio/answer_to_speech.py
index d24fae6c81..8b36241f5e 100644
--- a/haystack/nodes/audio/answer_to_speech.py
+++ b/haystack/nodes/audio/answer_to_speech.py
@@ -1,6 +1,8 @@
from typing import Union, Optional, List, Dict, Tuple, Any
from pathlib import Path
+
+import torch
from tqdm.auto import tqdm
from haystack.nodes import BaseComponent
@@ -23,6 +25,7 @@ def __init__(
audio_params: Optional[Dict[str, Any]] = None,
transformers_params: Optional[Dict[str, Any]] = None,
progress_bar: bool = True,
+ devices: Optional[List[Union[str, torch.device]]] = None,
):
"""
Convert an input Answer into an audio file containing the answer and its context read out loud.
@@ -49,9 +52,15 @@ def __init__(
By default, the audio file gets the name from the MD5 sum of the input text.
:param transformers_params: The parameters to pass over to the `Text2Speech.from_pretrained()` call.
:param progress_bar: Whether to show a progress bar while converting the text to audio.
+        :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+                        A list containing torch device objects and/or strings is supported (for example
+                        [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False`, the devices
+                        parameter is not used and a single CPU device is used for inference.
"""
super().__init__()
- self.converter = TextToSpeech(model_name_or_path=model_name_or_path, transformers_params=transformers_params)
+ self.converter = TextToSpeech(
+ model_name_or_path=model_name_or_path, transformers_params=transformers_params, devices=devices
+ )
self.generated_audio_dir = generated_audio_dir
self.params: Dict[str, Any] = audio_params or {}
self.progress_bar = progress_bar
diff --git a/haystack/nodes/document_classifier/transformers.py b/haystack/nodes/document_classifier/transformers.py
index c10bfc49ac..3c76da2dcb 100644
--- a/haystack/nodes/document_classifier/transformers.py
+++ b/haystack/nodes/document_classifier/transformers.py
@@ -2,6 +2,7 @@
import logging
import itertools
+import torch
from tqdm.auto import tqdm
from transformers import pipeline
@@ -76,6 +77,7 @@ def __init__(
classification_field: str = None,
progress_bar: bool = True,
use_auth_token: Optional[Union[str, bool]] = None,
+ devices: Optional[List[Union[str, torch.device]]] = None,
):
"""
Load a text classification model from Transformers.
@@ -110,6 +112,10 @@ def __init__(
`transformers-cli login` (stored in ~/.huggingface) will be used.
Additional information can be found here
https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained
+        :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+                        A list containing torch device objects and/or strings is supported (for example
+                        [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False`, the devices
+                        parameter is not used and a single CPU device is used for inference.
"""
super().__init__()
@@ -119,8 +125,12 @@ def __init__(
f"zero-shot-classification to use labels."
)
- devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False)
- device = 0 if devices[0].type == "cuda" else -1
+ resolved_devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
+ if len(resolved_devices) > 1:
+ logger.warning(
+ f"Multiple devices are not supported in {self.__class__.__name__} inference, "
+ f"using the first device {resolved_devices[0]}."
+ )
if tokenizer is None:
tokenizer = model_name_or_path
@@ -129,16 +139,16 @@ def __init__(
task=task,
model=model_name_or_path,
tokenizer=tokenizer,
- device=device,
revision=model_version,
use_auth_token=use_auth_token,
+ device=resolved_devices[0],
)
elif task == "text-classification":
self.model = pipeline(
task=task,
model=model_name_or_path,
tokenizer=tokenizer,
- device=device,
+ device=resolved_devices[0],
revision=model_version,
return_all_scores=return_all_scores,
use_auth_token=use_auth_token,
diff --git a/haystack/nodes/extractor/entity.py b/haystack/nodes/extractor/entity.py
index 81bcc9f21b..1eb0033e29 100644
--- a/haystack/nodes/extractor/entity.py
+++ b/haystack/nodes/extractor/entity.py
@@ -1,5 +1,7 @@
+import logging
from typing import List, Union, Dict, Optional, Tuple
import itertools
+import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
@@ -10,6 +12,8 @@
from haystack.modeling.utils import initialize_device_settings
from haystack.utils.torch_utils import ListDataset
+logger = logging.getLogger(__name__)
+
class EntityExtractor(BaseComponent):
"""
@@ -29,6 +33,10 @@ class EntityExtractor(BaseComponent):
`transformers-cli login` (stored in ~/.huggingface) will be used.
Additional information can be found here
https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained
+    :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+                    A list containing torch device objects and/or strings is supported (for example
+                    [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False`, the devices
+                    parameter is not used and a single CPU device is used for inference.
"""
outgoing_edges = 1
@@ -40,10 +48,11 @@ def __init__(
batch_size: int = 16,
progress_bar: bool = True,
use_auth_token: Optional[Union[str, bool]] = None,
+ devices: Optional[List[Union[str, torch.device]]] = None,
):
super().__init__()
- self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False)
+ self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
self.batch_size = batch_size
self.progress_bar = progress_bar
@@ -57,9 +66,14 @@ def __init__(
model=token_classifier,
tokenizer=tokenizer,
aggregation_strategy="simple",
- device=0 if self.devices[0].type == "cuda" else -1,
+ device=self.devices[0],
use_auth_token=use_auth_token,
)
+ if len(self.devices) > 1:
+ logger.warning(
+ f"Multiple devices are not supported in {self.__class__.__name__} inference, "
+ f"using the first device {self.devices[0]}."
+ )
def run(self, documents: Optional[Union[List[Document], List[dict]]] = None) -> Tuple[Dict, str]: # type: ignore
"""
diff --git a/haystack/nodes/label_generator/pseudo_label_generator.py b/haystack/nodes/label_generator/pseudo_label_generator.py
index 8fa8f4d960..190414d140 100644
--- a/haystack/nodes/label_generator/pseudo_label_generator.py
+++ b/haystack/nodes/label_generator/pseudo_label_generator.py
@@ -1,13 +1,19 @@
+import logging
import random
from typing import Dict, Iterable, List, Optional, Tuple, Union
+import torch
from sentence_transformers import CrossEncoder
from tqdm.auto import tqdm
+
+from haystack.modeling.utils import initialize_device_settings
from haystack.nodes.base import BaseComponent
from haystack.nodes.question_generator import QuestionGenerator
from haystack.nodes.retriever.base import BaseRetriever
from haystack.schema import Document
+logger = logging.getLogger(__name__)
+
class PseudoLabelGenerator(BaseComponent):
"""
@@ -62,6 +68,8 @@ def __init__(
batch_size: int = 16,
progress_bar: bool = True,
use_auth_token: Optional[Union[str, bool]] = None,
+ use_gpu: bool = True,
+ devices: Optional[List[Union[str, torch.device]]] = None,
):
"""
Loads the cross-encoder model and prepares PseudoLabelGenerator.
@@ -88,6 +96,10 @@ def __init__(
Additional information can be found here
https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained
:type use_auth_token: Union[str, bool] (optional)
+        :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit CrossEncoder inference to specific devices.
+                        A list containing torch device objects and/or strings is supported (for example
+                        [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False`, the devices
+                        parameter is not used and a single CPU device is used for inference.
"""
super().__init__()
@@ -105,10 +117,18 @@ def __init__(
)
else:
raise ValueError("Provide either a QuestionGenerator or a non-empty list of questions/document pairs.")
+ self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
+ if len(self.devices) > 1:
+ logger.warning(
+ f"Multiple devices are not supported in {self.__class__.__name__} inference, "
+ f"using the first device {self.devices[0]}."
+ )
self.retriever = retriever
+
self.cross_encoder = CrossEncoder(
cross_encoder_model_name_or_path,
+ device=str(self.devices[0]),
tokenizer_args={"use_auth_token": use_auth_token},
automodel_args={"use_auth_token": use_auth_token},
)
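Here the resolved device is passed to `CrossEncoder` as `str(self.devices[0])`, presumably because sentence-transformers takes its device as a plain string. A tiny sketch of what that conversion yields:

```python
import torch

device = torch.device("cuda:0")
# str() on a torch.device gives back the familiar device string,
# which is the form handed to CrossEncoder above.
print(str(device))               # "cuda:0"
print(str(torch.device("cpu")))  # "cpu"
```
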
diff --git a/haystack/nodes/query_classifier/transformers.py b/haystack/nodes/query_classifier/transformers.py
index b834f40211..50cc7d4991 100644
--- a/haystack/nodes/query_classifier/transformers.py
+++ b/haystack/nodes/query_classifier/transformers.py
@@ -2,10 +2,10 @@
from pathlib import Path
from typing import Union, List, Optional, Dict, Any
+import torch
from transformers import pipeline
from tqdm.auto import tqdm
-# from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
from haystack.nodes.query_classifier.base import BaseQueryClassifier
from haystack.modeling.utils import initialize_device_settings
from haystack.utils.torch_utils import ListDataset
@@ -71,6 +71,7 @@ def __init__(
batch_size: int = 16,
progress_bar: bool = True,
use_auth_token: Optional[Union[str, bool]] = None,
+ devices: Optional[List[Union[str, torch.device]]] = None,
):
"""
:param model_name_or_path: Directory of a saved model or the name of a public model, for example 'shahrukhx01/bert-mini-finetune-question-detection'.
@@ -89,16 +90,25 @@ def __init__(
`transformers-cli login` (stored in ~/.huggingface) will be used.
Additional information can be found here
https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained
+
+ :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+ A list containing torch device objects and/or strings is supported (For example
+ [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices
+ parameter is not used and a single cpu device is used for inference.
"""
super().__init__()
- devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False)
- device = 0 if devices[0].type == "cuda" else -1
+ resolved_devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
+ if len(resolved_devices) > 1:
+ logger.warning(
+ f"Multiple devices are not supported in {self.__class__.__name__} inference, "
+ f"using the first device {resolved_devices[0]}."
+ )
self.model = pipeline(
task=task,
model=model_name_or_path,
tokenizer=tokenizer,
- device=device,
+ device=resolved_devices[0],
revision=model_version,
use_auth_token=use_auth_token,
)
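The removed line translated the resolved device into the integer convention that older `transformers` pipelines expected (`0` for the first CUDA device, `-1` for CPU); recent releases also accept a `torch.device`, which is why the translation can be dropped. A sketch of the old mapping (the helper name is made up for illustration):

```python
import torch

def to_legacy_pipeline_device(device: torch.device) -> int:
    # The convention the old code used: 0 selects the first CUDA device, -1 the CPU.
    return 0 if device.type == "cuda" else -1

print(to_legacy_pipeline_device(torch.device("cpu")))     # -1
print(to_legacy_pipeline_device(torch.device("cuda:0")))  # 0
```
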
diff --git a/haystack/nodes/question_generator/question_generator.py b/haystack/nodes/question_generator/question_generator.py
index d77f573dd4..1704eca71e 100644
--- a/haystack/nodes/question_generator/question_generator.py
+++ b/haystack/nodes/question_generator/question_generator.py
@@ -1,5 +1,7 @@
+import logging
from typing import List, Union, Optional, Iterator
import itertools
+import torch
from tqdm.auto import tqdm
from transformers import AutoModelForSeq2SeqLM
@@ -11,6 +13,8 @@
from haystack.nodes.preprocessor import PreProcessor
from haystack.modeling.utils import initialize_device_settings
+logger = logging.getLogger(__name__)
+
class QuestionGenerator(BaseComponent):
"""
@@ -43,6 +47,7 @@ def __init__(
batch_size: int = 16,
progress_bar: bool = True,
use_auth_token: Optional[Union[str, bool]] = None,
+ devices: Optional[List[Union[str, torch.device]]] = None,
):
"""
Uses the valhalla/t5-base-e2e-qg model by default. This class supports any question generation model that is
@@ -61,9 +66,19 @@ def __init__(
`transformers-cli login` (stored in ~/.huggingface) will be used.
Additional information can be found here
https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained
+ :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+ A list containing torch device objects and/or strings is supported (For example
+ [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices
+ parameter is not used and a single cpu device is used for inference.
+
"""
super().__init__()
- self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False)
+ self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
+ if len(self.devices) > 1:
+ logger.warning(
+ f"Multiple devices are not supported in {self.__class__.__name__} inference, "
+ f"using the first device {self.devices[0]}."
+ )
self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path, use_auth_token=use_auth_token)
self.model.to(str(self.devices[0]))
self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_auth_token=use_auth_token)
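The seq2seq model is moved with `self.model.to(str(self.devices[0]))`; `nn.Module.to` accepts either a device object or its string form, so the two calls below are equivalent (a toy module stands in for the real model):

```python
import torch
from torch import nn

toy_model = nn.Linear(4, 2)            # stand-in for the seq2seq model
device = torch.device("cpu")

toy_model.to(device)        # passing the device object ...
toy_model.to(str(device))   # ... or its string form ("cpu") is equivalent
```
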
diff --git a/haystack/nodes/ranker/sentence_transformers.py b/haystack/nodes/ranker/sentence_transformers.py
index 3c1a4ff48f..f86cd93b57 100644
--- a/haystack/nodes/ranker/sentence_transformers.py
+++ b/haystack/nodes/ranker/sentence_transformers.py
@@ -58,10 +58,6 @@ def __init__(
:param model_version: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
:param top_k: The maximum number of documents to return
:param use_gpu: Whether to use all available GPUs or the CPU. Falls back on CPU if no GPU is available.
- :param devices: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones
- The strings will be converted into pytorch devices, so use the string notation described here:
- https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device
- (e.g. ["cuda:0"]).
:param batch_size: Number of documents to process at a time.
:param scale_score: The raw predictions will be transformed using a Sigmoid activation function in case the model
only predicts a single label. For multi-label predictions, no scaling is applied. Set this
@@ -72,15 +68,17 @@ def __init__(
`transformers-cli login` (stored in ~/.huggingface) will be used.
Additional information can be found here
https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained
+ :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+ A list containing torch device objects and/or strings is supported (For example
+ [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices
+ parameter is not used and a single cpu device is used for inference.
"""
super().__init__()
self.top_k = top_k
- if devices is not None:
- self.devices = [torch.device(device) for device in devices]
- else:
- self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=True)
+ self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=True)
+
self.progress_bar = progress_bar
self.transformer_model = AutoModelForSequenceClassification.from_pretrained(
pretrained_model_name_or_path=model_name_or_path, revision=model_version, use_auth_token=use_auth_token
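The ranker keeps `multi_gpu=True`, so the string-to-`torch.device` conversion that used to live in the constructor now happens inside `initialize_device_settings`. A sketch of the behaviour as described in the docstrings of this diff; on a machine without the listed GPUs the resolution will of course differ:

```python
import torch
from haystack.modeling.utils import initialize_device_settings

# Strings and torch.device objects can be mixed; the helper normalizes them.
devices, _ = initialize_device_settings(
    devices=["cuda:0", torch.device("cuda:1")], use_cuda=True, multi_gpu=True
)

# With use_cuda=False the passed-in devices are ignored and a single CPU device is returned.
cpu_devices, _ = initialize_device_settings(devices=["cuda:0"], use_cuda=False, multi_gpu=True)
```
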
diff --git a/haystack/nodes/reader/farm.py b/haystack/nodes/reader/farm.py
index fbdb7885f2..2d50cc2f1c 100644
--- a/haystack/nodes/reader/farm.py
+++ b/haystack/nodes/reader/farm.py
@@ -51,7 +51,7 @@ def __init__(
context_window_size: int = 150,
batch_size: int = 50,
use_gpu: bool = True,
- devices: List[torch.device] = [],
+ devices: Optional[List[Union[str, torch.device]]] = None,
no_ans_boost: float = 0.0,
return_no_answer: bool = False,
top_k: int = 10,
@@ -81,8 +81,10 @@ def __init__(
Memory consumption is much lower in inference mode. Recommendation: Increase the batch size
to a value so only a single batch is used.
:param use_gpu: Whether to use GPUs or the CPU. Falls back on CPU if no GPU is available.
- :param devices: List of GPU devices to limit inference to certain GPUs and not use all available ones (e.g. [torch.device('cuda:0')]).
- Unused if `use_gpu` is False.
+ :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+ A list containing torch device objects and/or strings is supported (For example
+ [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices
+ parameter is not used and a single cpu device is used for inference.
:param no_ans_boost: How much the no_answer logit is boosted/increased.
If set to 0 (default), the no_answer logit is not changed.
If a negative number, there is a lower chance of "no_answer" being predicted.
@@ -382,8 +384,10 @@ def train(
:param dev_split: Instead of specifying a dev_filename, you can also specify a ratio (e.g. 0.1) here
that gets split off from training data for eval.
:param use_gpu: Whether to use GPU (if available)
- :param devices: List of GPU devices to limit inference to certain GPUs and not use all available ones (e.g. [torch.device('cuda:0')]).
- Unused if `use_gpu` is False.
+ :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+ A list containing torch device objects and/or strings is supported (For example
+ [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices
+ parameter is not used and a single cpu device is used for inference.
:param batch_size: Number of samples the model receives in one batch for training
:param n_epochs: Number of iterations on the whole training data set
:param learning_rate: Learning rate of the optimizer
@@ -497,8 +501,10 @@ def distil_prediction_layer_from(
:param dev_split: Instead of specifying a dev_filename, you can also specify a ratio (e.g. 0.1) here
that gets split off from training data for eval.
:param use_gpu: Whether to use GPU (if available)
- :param devices: List of GPU devices to limit inference to certain GPUs and not use all available ones (e.g. [torch.device('cuda:0')]).
- Unused if `use_gpu` is False.
+ :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+ A list containing torch device objects and/or strings is supported (For example
+ [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices
+ parameter is not used and a single cpu device is used for inference.
:param student_batch_size: Number of samples the student model receives in one batch for training
:param student_batch_size: Number of samples the teacher model receives in one batch for distillation
:param n_epochs: Number of iterations on the whole training data set
@@ -621,8 +627,10 @@ def distil_intermediate_layers_from(
:param dev_split: Instead of specifying a dev_filename, you can also specify a ratio (e.g. 0.1) here
that gets split off from training data for eval.
:param use_gpu: Whether to use GPU (if available)
- :param devices: List of GPU devices to limit inference to certain GPUs and not use all available ones (e.g. [torch.device('cuda:0')]).
- Unused if `use_gpu` is False.
+ :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+ A list containing torch device objects and/or strings is supported (For example
+ [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices
+ parameter is not used and a single cpu device is used for inference.
:param student_batch_size: Number of samples the student model receives in one batch for training
:param student_batch_size: Number of samples the teacher model receives in one batch for distillation
:param n_epochs: Number of iterations on the whole training data set
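Besides the docstring updates, the `FARMReader` signature swaps the mutable default `devices: List[torch.device] = []` for `None`. A self-contained sketch of why a `None` default is the safer idiom (the function names are made up for illustration):

```python
def old_style(devices=[]):        # mutable default: one list shared across calls
    devices.append("cuda:0")
    return devices

def new_style(devices=None):      # the pattern the diff moves to
    devices = list(devices) if devices is not None else []
    devices.append("cuda:0")
    return devices

print(old_style())  # ['cuda:0']
print(old_style())  # ['cuda:0', 'cuda:0']  <- state leaked from the previous call
print(new_style())  # ['cuda:0']
print(new_style())  # ['cuda:0']
```
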
diff --git a/haystack/nodes/reader/table.py b/haystack/nodes/reader/table.py
index 28c3d52fee..e20cc0bde9 100644
--- a/haystack/nodes/reader/table.py
+++ b/haystack/nodes/reader/table.py
@@ -73,6 +73,7 @@ def __init__(
return_no_answer: bool = False,
max_seq_len: int = 256,
use_auth_token: Optional[Union[str, bool]] = None,
+ devices: Optional[List[Union[str, torch.device]]] = None,
):
"""
Load a TableQA model from Transformers.
@@ -110,6 +111,10 @@ def __init__(
`transformers-cli login` (stored in ~/.huggingface) will be used.
Additional information can be found here
https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained
+ :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+ A list containing torch device objects and/or strings is supported (For example
+ [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices
+ parameter is not used and a single cpu device is used for inference.
"""
if not torch_scatter_installed:
raise ImportError(
@@ -122,8 +127,14 @@ def __init__(
)
super().__init__()
- self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False)
+ self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
config = TapasConfig.from_pretrained(model_name_or_path, use_auth_token=use_auth_token)
+ if len(self.devices) > 1:
+ logger.warning(
+ f"Multiple devices are not supported in {self.__class__.__name__} inference, "
+ f"using the first device {self.devices[0]}."
+ )
+
if config.architectures[0] == "TapasForScoredQA":
self.model = self.TapasForScoredQA.from_pretrained(
model_name_or_path, revision=model_version, use_auth_token=use_auth_token
@@ -583,6 +594,12 @@ def __init__(
super().__init__()
self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False)
+ if len(self.devices) > 1:
+ logger.warning(
+ f"Multiple devices are not supported in {self.__class__.__name__} inference, "
+ f"using the first device {self.devices[0]}."
+ )
+
self.row_model = AutoModelForSequenceClassification.from_pretrained(
row_model_name_or_path, revision=row_model_version, use_auth_token=use_auth_token
)
diff --git a/haystack/nodes/reader/transformers.py b/haystack/nodes/reader/transformers.py
index 6d78a16b55..cc842be85b 100644
--- a/haystack/nodes/reader/transformers.py
+++ b/haystack/nodes/reader/transformers.py
@@ -3,6 +3,7 @@
import logging
import itertools
+import torch
from transformers import pipeline
from transformers.data.processors.squad import SquadExample
@@ -37,6 +38,7 @@ def __init__(
doc_stride: int = 128,
batch_size: int = 16,
use_auth_token: Optional[Union[str, bool]] = None,
+ devices: Optional[List[Union[str, torch.device]]] = None,
):
"""
Load a QA model from Transformers.
@@ -72,16 +74,27 @@ def __init__(
`transformers-cli login` (stored in ~/.huggingface) will be used.
Additional information can be found here
https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained
+
+ :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+ A list containing torch device objects and/or strings is supported (For example
+ [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices
+ parameter is not used and a single cpu device is used for inference.
"""
super().__init__()
- self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False)
- device = 0 if self.devices[0].type == "cuda" else -1
+ self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
+
+ if len(self.devices) > 1:
+ logger.warning(
+ f"Multiple devices are not supported in {self.__class__.__name__} inference, "
+ f"using the first device {self.devices[0]}."
+ )
+
self.model = pipeline(
"question-answering",
model=model_name_or_path,
tokenizer=tokenizer,
- device=device,
+ device=self.devices[0],
revision=model_version,
use_auth_token=use_auth_token,
)
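The same guard is repeated across these single-device nodes: if more than one device is resolved, only the first is used and a warning is logged. An isolated, runnable sketch of that guard (constructing a `torch.device` does not require the GPU to exist):

```python
import logging
import torch

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

resolved_devices = [torch.device("cuda:0"), torch.device("cuda:1")]

# The guard added across these nodes: keep only the first device and warn.
if len(resolved_devices) > 1:
    logger.warning(
        f"Multiple devices are not supported in TransformersReader inference, "
        f"using the first device {resolved_devices[0]}."
    )
device = resolved_devices[0]
```
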
diff --git a/haystack/nodes/retriever/dense.py b/haystack/nodes/retriever/dense.py
index 3009a0f927..3ae3829d42 100644
--- a/haystack/nodes/retriever/dense.py
+++ b/haystack/nodes/retriever/dense.py
@@ -113,10 +113,11 @@ def __init__(
Increase if errors like "encoded data exceeds max_size ..." come up
:param progress_bar: Whether to show a tqdm progress bar or not.
Can be helpful to disable in production deployments to keep the logs clean.
- :param devices: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones
- These strings will be converted into pytorch devices, so use the string notation described here:
- https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device
- (e.g. ["cuda:0"]). Note: as multi-GPU training is currently not implemented for DPR, training
+ :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+ A list containing torch device objects and/or strings is supported (For example
+ [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices
+ parameter is not used and a single cpu device is used for inference.
+ Note: as multi-GPU training is currently not implemented for DPR, training
will only use the first device provided in this list.
:param use_auth_token: The API token used to download private models from Huggingface.
If this parameter is set to `True`, then the token generated when running
@@ -129,13 +130,10 @@ def __init__(
"""
super().__init__()
- if devices is not None:
- self.devices = [torch.device(device) for device in devices]
- else:
- self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=True)
+ self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=True)
if batch_size < len(self.devices):
- logger.warning("Batch size is less than the number of devices. All gpus will not be utilized.")
+            logger.warning("Batch size is less than the number of devices. Not all GPUs will be utilized.")
self.document_store = document_store
self.batch_size = batch_size
@@ -820,10 +818,11 @@ def __init__(
Increase if errors like "encoded data exceeds max_size ..." come up
:param progress_bar: Whether to show a tqdm progress bar or not.
Can be helpful to disable in production deployments to keep the logs clean.
- :param devices: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones
- These strings will be converted into pytorch devices, so use the string notation described here:
- https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device
- (e.g. ["cuda:0"]). Note: as multi-GPU training is currently not implemented for TableTextRetriever,
+ :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+ A list containing torch device objects and/or strings is supported (For example
+ [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices
+ parameter is not used and a single cpu device is used for inference.
+ Note: as multi-GPU training is currently not implemented for TableTextRetriever,
training will only use the first device provided in this list.
:param use_auth_token: The API token used to download private models from Huggingface.
If this parameter is set to `True`, then the token generated when running
@@ -837,13 +836,10 @@ def __init__(
"""
super().__init__()
- if devices is not None:
- self.devices = [torch.device(device) for device in devices]
- else:
- self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=True)
+ self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=True)
if batch_size < len(self.devices):
- logger.warning("Batch size is less than the number of devices. All gpus will not be utilized.")
+            logger.warning("Batch size is less than the number of devices. Not all GPUs will be utilized.")
self.document_store = document_store
self.batch_size = batch_size
@@ -1489,10 +1485,11 @@ def __init__(
Default: -1 (very last layer).
:param top_k: How many documents to return per query.
:param progress_bar: If true displays progress bar during embedding.
- :param devices: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones
- These strings will be converted into pytorch devices, so use the string notation described here:
- https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device
- (e.g. ["cuda:0"]). Note: As multi-GPU training is currently not implemented for EmbeddingRetriever,
+ :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+ A list containing torch device objects and/or strings is supported (For example
+ [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices
+ parameter is not used and a single cpu device is used for inference.
+ Note: As multi-GPU training is currently not implemented for EmbeddingRetriever,
training will only use the first device provided in this list.
:param use_auth_token: The API token used to download private models from Huggingface.
If this parameter is set to `True`, then the token generated when running
@@ -1510,13 +1507,10 @@ def __init__(
"""
super().__init__()
- if devices is not None:
- self.devices = [torch.device(device) for device in devices]
- else:
- self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=True)
+ self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=True)
if batch_size < len(self.devices):
- logger.warning("Batch size is less than the number of devices. All gpus will not be utilized.")
+            logger.warning("Batch size is less than the number of devices. Not all GPUs will be utilized.")
self.document_store = document_store
self.embedding_model = embedding_model
@@ -1965,10 +1959,11 @@ def __init__(
Default: -1 (very last layer).
:param top_k: How many documents to return per query.
:param progress_bar: If true displays progress bar during embedding.
- :param devices: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones
- These strings will be converted into pytorch devices, so use the string notation described here:
- https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device
- (e.g. ["cuda:0"]). Note: As multi-GPU training is currently not implemented for EmbeddingRetriever,
+ :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+ A list containing torch device objects and/or strings is supported (For example
+ [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices
+ parameter is not used and a single cpu device is used for inference.
+ Note: As multi-GPU training is currently not implemented for EmbeddingRetriever,
training will only use the first device provided in this list.
:param use_auth_token: The API token used to download private models from Huggingface.
If this parameter is set to `True`, then the token generated when running
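The three retriever constructors above keep `multi_gpu=True` and warn when the batch size cannot cover all resolved devices. A hedged usage sketch; the embedding model, embedding size, and the presence of two GPUs are assumptions for illustration only:

```python
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import EmbeddingRetriever

document_store = InMemoryDocumentStore(embedding_dim=384)

# batch_size=1 with two devices triggers the
# "Batch size is less than the number of devices" warning above.
retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="sentence-transformers/all-MiniLM-L6-v2",  # illustrative
    devices=["cuda:0", "cuda:1"],
    batch_size=1,
)
```
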
diff --git a/haystack/nodes/summarizer/transformers.py b/haystack/nodes/summarizer/transformers.py
index 8e79ef34bb..9fc3d8068d 100644
--- a/haystack/nodes/summarizer/transformers.py
+++ b/haystack/nodes/summarizer/transformers.py
@@ -3,6 +3,7 @@
import logging
+import torch
from tqdm.auto import tqdm
from transformers import pipeline
from transformers.models.auto.modeling_auto import AutoModelForSeq2SeqLM
@@ -66,6 +67,7 @@ def __init__(
batch_size: int = 16,
progress_bar: bool = True,
use_auth_token: Optional[Union[str, bool]] = None,
+ devices: Optional[List[Union[str, torch.device]]] = None,
):
"""
Load a Summarization model from Transformers.
@@ -94,11 +96,20 @@ def __init__(
`transformers-cli login` (stored in ~/.huggingface) will be used.
Additional information can be found here
https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained
+ :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+ A list containing torch device objects and/or strings is supported (For example
+ [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices
+ parameter is not used and a single cpu device is used for inference.
"""
super().__init__()
- self.devices, _ = initialize_device_settings(use_cuda=use_gpu)
- device = 0 if self.devices[0].type == "cuda" else -1
+ self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
+ if len(self.devices) > 1:
+ logger.warning(
+ f"Multiple devices are not supported in {self.__class__.__name__} inference, "
+ f"using the first device {self.devices[0]}."
+ )
+
# TODO AutoModelForSeq2SeqLM is only necessary with transformers==4.1.1, with newer versions use the pipeline directly
if tokenizer is None:
tokenizer = model_name_or_path
@@ -106,7 +117,7 @@ def __init__(
pretrained_model_name_or_path=model_name_or_path, revision=model_version, use_auth_token=use_auth_token
)
self.summarizer = pipeline(
- "summarization", model=model, tokenizer=tokenizer, device=device, use_auth_token=use_auth_token
+ "summarization", model=model, tokenizer=tokenizer, device=self.devices[0], use_auth_token=use_auth_token
)
self.max_length = max_length
self.min_length = min_length
diff --git a/haystack/nodes/translator/transformers.py b/haystack/nodes/translator/transformers.py
index 464c859a9b..69a9f3aaa8 100644
--- a/haystack/nodes/translator/transformers.py
+++ b/haystack/nodes/translator/transformers.py
@@ -2,6 +2,7 @@
from copy import deepcopy
from typing import Any, Dict, List, Optional, Union
+import torch
from tqdm.auto import tqdm
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer # type: ignore
@@ -44,6 +45,7 @@ def __init__(
use_gpu: bool = True,
progress_bar: bool = True,
use_auth_token: Optional[Union[str, bool]] = None,
+ devices: Optional[List[Union[str, torch.device]]] = None,
):
"""Initialize the translator with a model that fits your targeted languages. While we support all seq2seq
models from Hugging Face's model hub, we recommend using the OPUS models from Helsinki NLP. They provide plenty
@@ -70,10 +72,21 @@ def __init__(
`transformers-cli login` (stored in ~/.huggingface) will be used.
Additional information can be found here
https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained
+
+ :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices.
+ A list containing torch device objects and/or strings is supported (For example
+ [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices
+ parameter is not used and a single cpu device is used for inference.
"""
super().__init__()
- self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False)
+ self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False)
+ if len(self.devices) > 1:
+ logger.warning(
+ f"Multiple devices are not supported in {self.__class__.__name__} inference, "
+ f"using the first device {self.devices[0]}."
+ )
+
self.max_seq_len = max_seq_len
self.clean_up_tokenization_spaces = clean_up_tokenization_spaces
self.progress_bar = progress_bar