diff --git a/docs/_src/api/api/document_classifier.md b/docs/_src/api/api/document_classifier.md index 375c3baa2d..629bbb5684 100644 --- a/docs/_src/api/api/document_classifier.md +++ b/docs/_src/api/api/document_classifier.md @@ -84,7 +84,7 @@ With this document_classifier, you can directly get predictions via predict() #### TransformersDocumentClassifier.\_\_init\_\_ ```python -def __init__(model_name_or_path: str = "bhadresh-savani/distilbert-base-uncased-emotion", model_version: Optional[str] = None, tokenizer: Optional[str] = None, use_gpu: bool = True, return_all_scores: bool = False, task: str = "text-classification", labels: Optional[List[str]] = None, batch_size: int = 16, classification_field: str = None, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None) +def __init__(model_name_or_path: str = "bhadresh-savani/distilbert-base-uncased-emotion", model_version: Optional[str] = None, tokenizer: Optional[str] = None, use_gpu: bool = True, return_all_scores: bool = False, task: str = "text-classification", labels: Optional[List[str]] = None, batch_size: int = 16, classification_field: str = None, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None) ``` Load a text classification model from Transformers. @@ -122,6 +122,10 @@ If this parameter is set to `True`, then the token generated when running `transformers-cli login` (stored in ~/.huggingface) will be used. Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained +- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. +A list containing torch device objects and/or strings is supported (For example +[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices +parameter is not used and a single cpu device is used for inference. diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md index 2fde44aef6..53aa056f78 100644 --- a/docs/_src/api/api/document_store.md +++ b/docs/_src/api/api/document_store.md @@ -1652,7 +1652,7 @@ In-memory document store #### InMemoryDocumentStore.\_\_init\_\_ ```python -def __init__(index: str = "document", label_index: str = "label", embedding_field: Optional[str] = "embedding", embedding_dim: int = 768, return_embedding: bool = False, similarity: str = "dot_product", progress_bar: bool = True, duplicate_documents: str = "overwrite", use_gpu: bool = True, scoring_batch_size: int = 500000) +def __init__(index: str = "document", label_index: str = "label", embedding_field: Optional[str] = "embedding", embedding_dim: int = 768, return_embedding: bool = False, similarity: str = "dot_product", progress_bar: bool = True, duplicate_documents: str = "overwrite", use_gpu: bool = True, scoring_batch_size: int = 500000, devices: Optional[List[Union[str, torch.device]]] = None) ``` **Arguments**: @@ -1680,6 +1680,10 @@ Very large batch sizes can overrun GPU memory. In general you want to make sure you have at least `embedding_dim`*`scoring_batch_size`*4 bytes available in GPU memory. Since the data is originally stored in CPU memory there is little risk of overruning memory when running on CPU. +- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. +A list containing torch device objects and/or strings is supported (For example +[torch.device('cuda:0'), "mps", "cuda:1"]). 
When specifying `use_gpu=False` the devices +parameter is not used and a single cpu device is used for inference. diff --git a/docs/_src/api/api/extractor.md b/docs/_src/api/api/extractor.md index 67dc56b4ba..339fa8069c 100644 --- a/docs/_src/api/api/extractor.md +++ b/docs/_src/api/api/extractor.md @@ -29,6 +29,10 @@ If this parameter is set to `True`, then the token generated when running `transformers-cli login` (stored in ~/.huggingface) will be used. Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained +- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. +A list containing torch device objects and/or strings is supported (For example +[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices +parameter is not used and a single cpu device is used for inference. diff --git a/docs/_src/api/api/generator.md b/docs/_src/api/api/generator.md index 0ff3cf4f15..58fbbd2b9f 100644 --- a/docs/_src/api/api/generator.md +++ b/docs/_src/api/api/generator.md @@ -138,7 +138,7 @@ i.e. the model can easily adjust to domain documents even after training has fin #### RAGenerator.\_\_init\_\_ ```python -def __init__(model_name_or_path: str = "facebook/rag-token-nq", model_version: Optional[str] = None, retriever: Optional[DensePassageRetriever] = None, generator_type: str = "token", top_k: int = 2, max_length: int = 200, min_length: int = 2, num_beams: int = 2, embed_title: bool = True, prefix: Optional[str] = None, use_gpu: bool = True, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None) +def __init__(model_name_or_path: str = "facebook/rag-token-nq", model_version: Optional[str] = None, retriever: Optional[DensePassageRetriever] = None, generator_type: str = "token", top_k: int = 2, max_length: int = 200, min_length: int = 2, num_beams: int = 2, embed_title: bool = True, prefix: Optional[str] = None, use_gpu: bool = True, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None) ``` Load a RAG model from Transformers along with passage_embedding_model. @@ -166,6 +166,10 @@ If this parameter is set to `True`, then the token generated when running `transformers-cli login` (stored in ~/.huggingface) will be used. Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained +- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. +A list containing torch device objects and/or strings is supported (For example +[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices +parameter is not used and a single cpu device is used for inference. 
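For illustration, a minimal usage sketch of the new `devices` argument on `RAGenerator` (the device index is a placeholder and assumes a CUDA machine; only the first listed device is used, since multi-device inference is not supported for this node):

```python
import torch
from haystack.nodes import RAGenerator

# Sketch: restrict inference to one named device. Strings and torch.device
# objects may be mixed; with use_gpu=False the list is ignored and CPU is used.
generator = RAGenerator(
    model_name_or_path="facebook/rag-token-nq",
    use_gpu=True,
    devices=[torch.device("cuda:0")],
)
```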
@@ -262,7 +266,7 @@ the [Hugging Face Model Hub](https://huggingface.co/models?pipeline_tag=text2tex #### Seq2SeqGenerator.\_\_init\_\_ ```python -def __init__(model_name_or_path: str, input_converter: Optional[Callable] = None, top_k: int = 1, max_length: int = 200, min_length: int = 2, num_beams: int = 8, use_gpu: bool = True, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None) +def __init__(model_name_or_path: str, input_converter: Optional[Callable] = None, top_k: int = 1, max_length: int = 200, min_length: int = 2, num_beams: int = 8, use_gpu: bool = True, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None) ``` **Arguments**: @@ -284,6 +288,10 @@ If this parameter is set to `True`, then the token generated when running `transformers-cli login` (stored in ~/.huggingface) will be used. Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained +- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. +A list containing torch device objects and/or strings is supported (For example +[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices +parameter is not used and a single cpu device is used for inference. diff --git a/docs/_src/api/api/pseudo_label_generator.md b/docs/_src/api/api/pseudo_label_generator.md index d8fa9a4c19..757f14b5f7 100644 --- a/docs/_src/api/api/pseudo_label_generator.md +++ b/docs/_src/api/api/pseudo_label_generator.md @@ -53,7 +53,7 @@ For example: #### PseudoLabelGenerator.\_\_init\_\_ ```python -def __init__(question_producer: Union[QuestionGenerator, List[Dict[str, str]]], retriever: BaseRetriever, cross_encoder_model_name_or_path: str = "cross-encoder/ms-marco-MiniLM-L-6-v2", max_questions_per_document: int = 3, top_k: int = 50, batch_size: int = 16, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None) +def __init__(question_producer: Union[QuestionGenerator, List[Dict[str, str]]], retriever: BaseRetriever, cross_encoder_model_name_or_path: str = "cross-encoder/ms-marco-MiniLM-L-6-v2", max_questions_per_document: int = 3, top_k: int = 50, batch_size: int = 16, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, use_gpu: bool = True, devices: Optional[List[Union[str, torch.device]]] = None) ``` Loads the cross-encoder model and prepares PseudoLabelGenerator. @@ -74,6 +74,10 @@ If this parameter is set to `True`, then the token generated when running `transformers-cli login` (stored in ~/.huggingface) will be used. Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained +- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit CrossEncoder inference to specific devices. +A list containing torch device objects and/or strings is supported (For example +[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices +parameter is not used and a single cpu device is used for inference. diff --git a/docs/_src/api/api/query_classifier.md b/docs/_src/api/api/query_classifier.md index 81b89c373e..45df8f2c2a 100644 --- a/docs/_src/api/api/query_classifier.md +++ b/docs/_src/api/api/query_classifier.md @@ -144,7 +144,7 @@ This node also supports zero-shot-classification. 
#### TransformersQueryClassifier.\_\_init\_\_ ```python -def __init__(model_name_or_path: Union[Path, str] = "shahrukhx01/bert-mini-finetune-question-detection", model_version: Optional[str] = None, tokenizer: Optional[str] = None, use_gpu: bool = True, task: str = "text-classification", labels: List[str] = DEFAULT_LABELS, batch_size: int = 16, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None) +def __init__(model_name_or_path: Union[Path, str] = "shahrukhx01/bert-mini-finetune-question-detection", model_version: Optional[str] = None, tokenizer: Optional[str] = None, use_gpu: bool = True, task: str = "text-classification", labels: List[str] = DEFAULT_LABELS, batch_size: int = 16, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None) ``` **Arguments**: @@ -165,4 +165,8 @@ If this parameter is set to `True`, then the token generated when running `transformers-cli login` (stored in ~/.huggingface) will be used. Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained +- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. +A list containing torch device objects and/or strings is supported (For example +[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices +parameter is not used and a single cpu device is used for inference. diff --git a/docs/_src/api/api/question_generator.md b/docs/_src/api/api/question_generator.md index e8ab9f4a4d..c5bfc32363 100644 --- a/docs/_src/api/api/question_generator.md +++ b/docs/_src/api/api/question_generator.md @@ -23,7 +23,7 @@ come from earlier in the document. #### QuestionGenerator.\_\_init\_\_ ```python -def __init__(model_name_or_path="valhalla/t5-base-e2e-qg", model_version=None, num_beams=4, max_length=256, no_repeat_ngram_size=3, length_penalty=1.5, early_stopping=True, split_length=50, split_overlap=10, use_gpu=True, prompt="generate questions:", num_queries_per_doc=1, sep_token: str = "", batch_size: int = 16, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None) +def __init__(model_name_or_path="valhalla/t5-base-e2e-qg", model_version=None, num_beams=4, max_length=256, no_repeat_ngram_size=3, length_penalty=1.5, early_stopping=True, split_length=50, split_overlap=10, use_gpu=True, prompt="generate questions:", num_queries_per_doc=1, sep_token: str = "", batch_size: int = 16, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None) ``` Uses the valhalla/t5-base-e2e-qg model by default. This class supports any question generation model that is @@ -45,6 +45,10 @@ If this parameter is set to `True`, then the token generated when running `transformers-cli login` (stored in ~/.huggingface) will be used. Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained +- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. +A list containing torch device objects and/or strings is supported (For example +[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices +parameter is not used and a single cpu device is used for inference. 
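A similar sketch for `TransformersQueryClassifier`, showing that a plain device string is accepted and converted internally (the device string is a placeholder; with `use_gpu=False` the list is ignored):

```python
from haystack.nodes import TransformersQueryClassifier

# Sketch: a device string such as "cuda:0" or "mps" is resolved to a
# torch.device before being handed to the underlying transformers pipeline.
query_classifier = TransformersQueryClassifier(
    model_name_or_path="shahrukhx01/bert-mini-finetune-question-detection",
    use_gpu=True,
    devices=["cuda:0"],
)
```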
diff --git a/docs/_src/api/api/ranker.md b/docs/_src/api/api/ranker.md index 912e2fc7b3..41e1788b08 100644 --- a/docs/_src/api/api/ranker.md +++ b/docs/_src/api/api/ranker.md @@ -105,10 +105,6 @@ See https://huggingface.co/cross-encoder for full list of available models - `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. - `top_k`: The maximum number of documents to return - `use_gpu`: Whether to use all available GPUs or the CPU. Falls back on CPU if no GPU is available. -- `devices`: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones -The strings will be converted into pytorch devices, so use the string notation described here: -https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device -(e.g. ["cuda:0"]). - `batch_size`: Number of documents to process at a time. - `scale_score`: The raw predictions will be transformed using a Sigmoid activation function in case the model only predicts a single label. For multi-label predictions, no scaling is applied. Set this @@ -119,6 +115,10 @@ If this parameter is set to `True`, then the token generated when running `transformers-cli login` (stored in ~/.huggingface) will be used. Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained +- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. +A list containing torch device objects and/or strings is supported (For example +[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices +parameter is not used and a single cpu device is used for inference. 
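As a further sketch, the same pattern applies to `FARMReader`, whose `devices` default changes from `[]` to `None` in this change set (the model name below is a placeholder, not taken from the diff):

```python
import torch
from haystack.nodes import FARMReader

# Sketch: passing torch.device objects and/or strings limits inference to
# those devices; leaving devices=None lets use_gpu decide between GPU and CPU.
reader = FARMReader(
    model_name_or_path="deepset/roberta-base-squad2",
    use_gpu=True,
    devices=[torch.device("cuda:0")],
)
```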
diff --git a/docs/_src/api/api/reader.md b/docs/_src/api/api/reader.md index b163a9e45a..19cfc35189 100644 --- a/docs/_src/api/api/reader.md +++ b/docs/_src/api/api/reader.md @@ -45,7 +45,7 @@ While the underlying model can vary (BERT, Roberta, DistilBERT, ...), the interf #### FARMReader.\_\_init\_\_ ```python -def __init__(model_name_or_path: str, model_version: Optional[str] = None, context_window_size: int = 150, batch_size: int = 50, use_gpu: bool = True, devices: List[torch.device] = [], no_ans_boost: float = 0.0, return_no_answer: bool = False, top_k: int = 10, top_k_per_candidate: int = 3, top_k_per_sample: int = 1, num_processes: Optional[int] = None, max_seq_len: int = 256, doc_stride: int = 128, progress_bar: bool = True, duplicate_filtering: int = 0, use_confidence_scores: bool = True, confidence_threshold: Optional[float] = None, proxies: Optional[Dict[str, str]] = None, local_files_only=False, force_download=False, use_auth_token: Optional[Union[str, bool]] = None) +def __init__(model_name_or_path: str, model_version: Optional[str] = None, context_window_size: int = 150, batch_size: int = 50, use_gpu: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, no_ans_boost: float = 0.0, return_no_answer: bool = False, top_k: int = 10, top_k_per_candidate: int = 3, top_k_per_sample: int = 1, num_processes: Optional[int] = None, max_seq_len: int = 256, doc_stride: int = 128, progress_bar: bool = True, duplicate_filtering: int = 0, use_confidence_scores: bool = True, confidence_threshold: Optional[float] = None, proxies: Optional[Dict[str, str]] = None, local_files_only=False, force_download=False, use_auth_token: Optional[Union[str, bool]] = None) ``` **Arguments**: @@ -60,8 +60,10 @@ displaying the context around the answer. Memory consumption is much lower in inference mode. Recommendation: Increase the batch size to a value so only a single batch is used. - `use_gpu`: Whether to use GPUs or the CPU. Falls back on CPU if no GPU is available. -- `devices`: List of GPU devices to limit inference to certain GPUs and not use all available ones (e.g. [torch.device('cuda:0')]). -Unused if `use_gpu` is False. +- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. +A list containing torch device objects and/or strings is supported (For example +[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices +parameter is not used and a single cpu device is used for inference. - `no_ans_boost`: How much the no_answer logit is boosted/increased. If set to 0 (default), the no_answer logit is not changed. If a negative number, there is a lower chance of "no_answer" being predicted. @@ -131,8 +133,10 @@ If any checkpoints are stored, a subsequent run of train() will resume training - `dev_split`: Instead of specifying a dev_filename, you can also specify a ratio (e.g. 0.1) here that gets split off from training data for eval. - `use_gpu`: Whether to use GPU (if available) -- `devices`: List of GPU devices to limit inference to certain GPUs and not use all available ones (e.g. [torch.device('cuda:0')]). -Unused if `use_gpu` is False. +- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. +A list containing torch device objects and/or strings is supported (For example +[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices +parameter is not used and a single cpu device is used for inference. 
- `batch_size`: Number of samples the model receives in one batch for training - `n_epochs`: Number of iterations on the whole training data set - `learning_rate`: Learning rate of the optimizer @@ -202,8 +206,10 @@ If any checkpoints are stored, a subsequent run of train() will resume training - `dev_split`: Instead of specifying a dev_filename, you can also specify a ratio (e.g. 0.1) here that gets split off from training data for eval. - `use_gpu`: Whether to use GPU (if available) -- `devices`: List of GPU devices to limit inference to certain GPUs and not use all available ones (e.g. [torch.device('cuda:0')]). -Unused if `use_gpu` is False. +- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. +A list containing torch device objects and/or strings is supported (For example +[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices +parameter is not used and a single cpu device is used for inference. - `student_batch_size`: Number of samples the student model receives in one batch for training - `student_batch_size`: Number of samples the teacher model receives in one batch for distillation - `n_epochs`: Number of iterations on the whole training data set @@ -278,8 +284,10 @@ If any checkpoints are stored, a subsequent run of train() will resume training - `dev_split`: Instead of specifying a dev_filename, you can also specify a ratio (e.g. 0.1) here that gets split off from training data for eval. - `use_gpu`: Whether to use GPU (if available) -- `devices`: List of GPU devices to limit inference to certain GPUs and not use all available ones (e.g. [torch.device('cuda:0')]). -Unused if `use_gpu` is False. +- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. +A list containing torch device objects and/or strings is supported (For example +[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices +parameter is not used and a single cpu device is used for inference. - `student_batch_size`: Number of samples the student model receives in one batch for training - `student_batch_size`: Number of samples the teacher model receives in one batch for distillation - `n_epochs`: Number of iterations on the whole training data set @@ -589,7 +597,7 @@ With this reader, you can directly get predictions via predict() #### TransformersReader.\_\_init\_\_ ```python -def __init__(model_name_or_path: str = "distilbert-base-uncased-distilled-squad", model_version: Optional[str] = None, tokenizer: Optional[str] = None, context_window_size: int = 70, use_gpu: bool = True, top_k: int = 10, top_k_per_candidate: int = 3, return_no_answers: bool = False, max_seq_len: int = 256, doc_stride: int = 128, batch_size: int = 16, use_auth_token: Optional[Union[str, bool]] = None) +def __init__(model_name_or_path: str = "distilbert-base-uncased-distilled-squad", model_version: Optional[str] = None, tokenizer: Optional[str] = None, context_window_size: int = 70, use_gpu: bool = True, top_k: int = 10, top_k_per_candidate: int = 3, return_no_answers: bool = False, max_seq_len: int = 256, doc_stride: int = 128, batch_size: int = 16, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None) ``` Load a QA model from Transformers. @@ -628,6 +636,10 @@ If this parameter is set to `True`, then the token generated when running `transformers-cli login` (stored in ~/.huggingface) will be used. 
Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained +- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. +A list containing torch device objects and/or strings is supported (For example +[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices +parameter is not used and a single cpu device is used for inference. @@ -739,7 +751,7 @@ answer = prediction["answers"][0].answer # "10 june 1996" #### TableReader.\_\_init\_\_ ```python -def __init__(model_name_or_path: str = "google/tapas-base-finetuned-wtq", model_version: Optional[str] = None, tokenizer: Optional[str] = None, use_gpu: bool = True, top_k: int = 10, top_k_per_candidate: int = 3, return_no_answer: bool = False, max_seq_len: int = 256, use_auth_token: Optional[Union[str, bool]] = None) +def __init__(model_name_or_path: str = "google/tapas-base-finetuned-wtq", model_version: Optional[str] = None, tokenizer: Optional[str] = None, use_gpu: bool = True, top_k: int = 10, top_k_per_candidate: int = 3, return_no_answer: bool = False, max_seq_len: int = 256, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None) ``` Load a TableQA model from Transformers. @@ -780,6 +792,10 @@ If this parameter is set to `True`, then the token generated when running `transformers-cli login` (stored in ~/.huggingface) will be used. Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained +- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. +A list containing torch device objects and/or strings is supported (For example +[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices +parameter is not used and a single cpu device is used for inference. diff --git a/docs/_src/api/api/retriever.md b/docs/_src/api/api/retriever.md index 85334930b9..318e35940d 100644 --- a/docs/_src/api/api/retriever.md +++ b/docs/_src/api/api/retriever.md @@ -567,10 +567,11 @@ Options: `dot_product` (Default) or `cosine` Increase if errors like "encoded data exceeds max_size ..." come up - `progress_bar`: Whether to show a tqdm progress bar or not. Can be helpful to disable in production deployments to keep the logs clean. -- `devices`: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones -These strings will be converted into pytorch devices, so use the string notation described here: -https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device -(e.g. ["cuda:0"]). Note: as multi-GPU training is currently not implemented for DPR, training +- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. +A list containing torch device objects and/or strings is supported (For example +[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices +parameter is not used and a single cpu device is used for inference. +Note: as multi-GPU training is currently not implemented for DPR, training will only use the first device provided in this list. - `use_auth_token`: The API token used to download private models from Huggingface. 
If this parameter is set to `True`, then the token generated when running @@ -934,10 +935,11 @@ Options: `dot_product` (Default) or `cosine` Increase if errors like "encoded data exceeds max_size ..." come up - `progress_bar`: Whether to show a tqdm progress bar or not. Can be helpful to disable in production deployments to keep the logs clean. -- `devices`: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones -These strings will be converted into pytorch devices, so use the string notation described here: -https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device -(e.g. ["cuda:0"]). Note: as multi-GPU training is currently not implemented for TableTextRetriever, +- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. +A list containing torch device objects and/or strings is supported (For example +[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices +parameter is not used and a single cpu device is used for inference. +Note: as multi-GPU training is currently not implemented for TableTextRetriever, training will only use the first device provided in this list. - `use_auth_token`: The API token used to download private models from Huggingface. If this parameter is set to `True`, then the token generated when running @@ -1212,10 +1214,11 @@ Options: Default: -1 (very last layer). - `top_k`: How many documents to return per query. - `progress_bar`: If true displays progress bar during embedding. -- `devices`: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones -These strings will be converted into pytorch devices, so use the string notation described here: -https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device -(e.g. ["cuda:0"]). Note: As multi-GPU training is currently not implemented for EmbeddingRetriever, +- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. +A list containing torch device objects and/or strings is supported (For example +[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices +parameter is not used and a single cpu device is used for inference. +Note: As multi-GPU training is currently not implemented for EmbeddingRetriever, training will only use the first device provided in this list. - `use_auth_token`: The API token used to download private models from Huggingface. If this parameter is set to `True`, then the token generated when running @@ -1535,10 +1538,11 @@ Options: Default: -1 (very last layer). - `top_k`: How many documents to return per query. - `progress_bar`: If true displays progress bar during embedding. -- `devices`: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones -These strings will be converted into pytorch devices, so use the string notation described here: -https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device -(e.g. ["cuda:0"]). Note: As multi-GPU training is currently not implemented for EmbeddingRetriever, +- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. +A list containing torch device objects and/or strings is supported (For example +[torch.device('cuda:0'), "mps", "cuda:1"]). 
When specifying `use_gpu=False` the devices +parameter is not used and a single cpu device is used for inference. +Note: As multi-GPU training is currently not implemented for EmbeddingRetriever, training will only use the first device provided in this list. - `use_auth_token`: The API token used to download private models from Huggingface. If this parameter is set to `True`, then the token generated when running diff --git a/docs/_src/api/api/summarizer.md b/docs/_src/api/api/summarizer.md index f5a5a23c5e..d76878f788 100644 --- a/docs/_src/api/api/summarizer.md +++ b/docs/_src/api/api/summarizer.md @@ -87,7 +87,7 @@ See the up-to-date list of available models on #### TransformersSummarizer.\_\_init\_\_ ```python -def __init__(model_name_or_path: str = "google/pegasus-xsum", model_version: Optional[str] = None, tokenizer: Optional[str] = None, max_length: int = 200, min_length: int = 5, use_gpu: bool = True, clean_up_tokenization_spaces: bool = True, separator_for_single_summary: str = " ", generate_single_summary: bool = False, batch_size: int = 16, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None) +def __init__(model_name_or_path: str = "google/pegasus-xsum", model_version: Optional[str] = None, tokenizer: Optional[str] = None, max_length: int = 200, min_length: int = 5, use_gpu: bool = True, clean_up_tokenization_spaces: bool = True, separator_for_single_summary: str = " ", generate_single_summary: bool = False, batch_size: int = 16, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None) ``` Load a Summarization model from Transformers. @@ -119,6 +119,10 @@ If this parameter is set to `True`, then the token generated when running `transformers-cli login` (stored in ~/.huggingface) will be used. Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained +- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. +A list containing torch device objects and/or strings is supported (For example +[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices +parameter is not used and a single cpu device is used for inference. diff --git a/docs/_src/api/api/translator.md b/docs/_src/api/api/translator.md index f93d961e2b..8f2ddc66a7 100644 --- a/docs/_src/api/api/translator.md +++ b/docs/_src/api/api/translator.md @@ -68,7 +68,7 @@ We currently recommend using OPUS models (see __init__() for details) #### TransformersTranslator.\_\_init\_\_ ```python -def __init__(model_name_or_path: str, tokenizer_name: Optional[str] = None, max_seq_len: Optional[int] = None, clean_up_tokenization_spaces: Optional[bool] = True, use_gpu: bool = True, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None) +def __init__(model_name_or_path: str, tokenizer_name: Optional[str] = None, max_seq_len: Optional[int] = None, clean_up_tokenization_spaces: Optional[bool] = True, use_gpu: bool = True, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None) ``` Initialize the translator with a model that fits your targeted languages. While we support all seq2seq @@ -99,6 +99,10 @@ If this parameter is set to `True`, then the token generated when running `transformers-cli login` (stored in ~/.huggingface) will be used. 
Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained +- `devices`: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. +A list containing torch device objects and/or strings is supported (For example +[torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices +parameter is not used and a single cpu device is used for inference. diff --git a/haystack/document_stores/memory.py b/haystack/document_stores/memory.py index 760df00ccc..e18b2d88e7 100644 --- a/haystack/document_stores/memory.py +++ b/haystack/document_stores/memory.py @@ -39,6 +39,7 @@ def __init__( duplicate_documents: str = "overwrite", use_gpu: bool = True, scoring_batch_size: int = 500000, + devices: Optional[List[Union[str, torch.device]]] = None, ): """ :param index: The documents are scoped to an index attribute that can be used when writing, querying, @@ -64,6 +65,10 @@ def __init__( you have at least `embedding_dim`*`scoring_batch_size`*4 bytes available in GPU memory. Since the data is originally stored in CPU memory there is little risk of overruning memory when running on CPU. + :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. + A list containing torch device objects and/or strings is supported (For example + [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices + parameter is not used and a single cpu device is used for inference. """ super().__init__() @@ -79,7 +84,13 @@ def __init__( self.use_gpu = use_gpu self.scoring_batch_size = scoring_batch_size - self.devices, _ = initialize_device_settings(use_cuda=self.use_gpu) + self.devices, _ = initialize_device_settings(devices=devices, use_cuda=self.use_gpu, multi_gpu=False) + if len(self.devices) > 1: + logger.warning( + f"Multiple devices are not supported in {self.__class__.__name__} inference, " + f"using the first device {self.devices[0]}." 
+ ) + self.main_device = self.devices[0] def write_documents( diff --git a/haystack/json-schemas/haystack-pipeline-main.schema.json b/haystack/json-schemas/haystack-pipeline-main.schema.json index dd2e76d6bc..a6d96d5f75 100644 --- a/haystack/json-schemas/haystack-pipeline-main.schema.json +++ b/haystack/json-schemas/haystack-pipeline-main.schema.json @@ -960,6 +960,27 @@ "title": "Scoring Batch Size", "default": 500000, "type": "integer" + }, + "devices": { + "title": "Devices", + "anyOf": [ + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + } + }, + { + "type": "null" + } + ] } }, "additionalProperties": false, @@ -2056,6 +2077,27 @@ "title": "Progress Bar", "default": true, "type": "boolean" + }, + "devices": { + "title": "Devices", + "anyOf": [ + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + } + }, + { + "type": "null" + } + ] } }, "additionalProperties": false, @@ -3000,6 +3042,27 @@ "type": "null" } ] + }, + "devices": { + "title": "Devices", + "anyOf": [ + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + } + }, + { + "type": "null" + } + ] } }, "additionalProperties": false, @@ -3154,11 +3217,24 @@ }, "devices": { "title": "Devices", - "default": [], - "type": "array", - "items": { - "type": "string" - } + "anyOf": [ + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + } + }, + { + "type": "null" + } + ] }, "no_ans_boost": { "title": "No Ans Boost", @@ -4340,6 +4416,32 @@ "type": "null" } ] + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + }, + "devices": { + "title": "Devices", + "anyOf": [ + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + } + }, + { + "type": "null" + } + ] } }, "required": [ @@ -4449,6 +4551,27 @@ "type": "null" } ] + }, + "devices": { + "title": "Devices", + "anyOf": [ + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + } + }, + { + "type": "null" + } + ] } }, "additionalProperties": false, @@ -4571,6 +4694,27 @@ "type": "null" } ] + }, + "devices": { + "title": "Devices", + "anyOf": [ + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + } + }, + { + "type": "null" + } + ] } }, "additionalProperties": false, @@ -4935,6 +5079,27 @@ "type": "null" } ] + }, + "devices": { + "title": "Devices", + "anyOf": [ + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + } + }, + { + "type": "null" + } + ] } }, "required": [ @@ -5098,6 +5263,27 @@ "type": "null" } ] + }, + "devices": { + "title": "Devices", + "anyOf": [ + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + } + }, + { + "type": "null" + } + ] } }, "additionalProperties": false, @@ -5641,6 +5827,27 @@ "type": "null" } ] + }, + "devices": { + "title": "Devices", + "anyOf": [ + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + } + }, + { + "type": "null" + } + ] } }, "additionalProperties": false, @@ -5750,6 +5957,27 @@ "type": "null" } ] + }, + "devices": { + "title": "Devices", + "anyOf": [ + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + } + }, + { 
+ "type": "null" + } + ] } }, "additionalProperties": false, @@ -5860,6 +6088,27 @@ "type": "null" } ] + }, + "devices": { + "title": "Devices", + "anyOf": [ + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + } + }, + { + "type": "null" + } + ] } }, "additionalProperties": false, @@ -5970,6 +6219,27 @@ "type": "null" } ] + }, + "devices": { + "title": "Devices", + "anyOf": [ + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + } + }, + { + "type": "null" + } + ] } }, "additionalProperties": false, @@ -6061,6 +6331,27 @@ "type": "null" } ] + }, + "devices": { + "title": "Devices", + "anyOf": [ + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + } + }, + { + "type": "null" + } + ] } }, "required": [ diff --git a/haystack/modeling/infer.py b/haystack/modeling/infer.py index 44c766081d..4542b49462 100644 --- a/haystack/modeling/infer.py +++ b/haystack/modeling/infer.py @@ -46,6 +46,7 @@ def __init__( extraction_layer: Optional[int] = None, num_processes: Optional[int] = None, disable_tqdm: bool = False, + devices: Optional[List[Union[str, torch.device]]] = None, ): """ Initializes Inferencer from an AdaptiveModel and a Processor instance. @@ -70,11 +71,20 @@ def __init__( :func:`~farm.infer.Inferencer.close_multiprocessing_pool` after you are done using this class. The garbage collector will not do this for you! :param disable_tqdm: Whether to disable tqdm logging (can get very verbose in multiprocessing) + :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. + A list containing torch device objects and/or strings is supported (For example + [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices + parameter is not used and a single cpu device is used for inference. :return: An instance of the Inferencer. """ # Init device and distributed settings - self.devices, n_gpu = initialize_device_settings(use_cuda=gpu, multi_gpu=False) + self.devices, n_gpu = initialize_device_settings(devices=devices, use_cuda=gpu, multi_gpu=False) + if len(self.devices) > 1: + logger.warning( + f"Multiple devices are not supported in {self.__class__.__name__} inference, " + f"using the first device {self.devices[0]}." + ) self.processor = processor self.model = model @@ -125,8 +135,8 @@ def load( use_fast: bool = True, tokenizer_args: Dict = None, multithreading_rust: bool = True, - devices: Optional[List[torch.device]] = None, use_auth_token: Optional[Union[bool, str]] = None, + devices: Optional[List[Union[str, torch.device]]] = None, **kwargs, ): """ @@ -177,8 +187,11 @@ def load( if tokenizer_args is None: tokenizer_args = {} - if devices is None: - devices, n_gpu = initialize_device_settings(use_cuda=gpu, multi_gpu=False) + devices, n_gpu = initialize_device_settings(devices=devices, use_cuda=gpu, multi_gpu=False) + if len(devices) > 1: + logger.warning( + f"Multiple devices are not supported in Inferencer, " f"using the first device {devices[0]}." 
+ ) name = os.path.basename(model_name_or_path) @@ -243,6 +256,7 @@ def load( extraction_layer=extraction_layer, num_processes=num_processes, disable_tqdm=disable_tqdm, + devices=devices, ) def _set_multiprocessing_pool(self, num_processes: Optional[int]) -> None: diff --git a/haystack/modeling/utils.py b/haystack/modeling/utils.py index 998ab573cf..35d17d08a4 100644 --- a/haystack/modeling/utils.py +++ b/haystack/modeling/utils.py @@ -1,4 +1,4 @@ -from typing import Tuple, List, Optional +from typing import Tuple, List, Optional, Union import logging import os @@ -52,7 +52,7 @@ def initialize_device_settings( use_cuda: Optional[bool] = None, local_rank: int = -1, multi_gpu: bool = True, - devices: Optional[List[torch.device]] = None, + devices: List[Union[str, torch.device]] = None, ) -> Tuple[List[torch.device], int]: """ Returns a list of available devices. @@ -62,14 +62,23 @@ def initialize_device_settings( Unused if `devices` is set or `use_cuda` is False. :param multi_gpu: Whether to make use of all GPUs (if available). Unused if `devices` is set or `use_cuda` is False. - :param devices: an explicit list of which GPUs to use. Unused if `use_cuda` is False. + :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. + A list containing torch device objects and/or strings is supported (For example + [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices + parameter is not used and a single cpu device is used for inference. """ if use_cuda is False: # Note that it could be None, in which case we also want to just skip this step. devices_to_use = [torch.device("cpu")] n_gpu = 0 elif devices: - devices_to_use = devices - n_gpu = sum(1 for device in devices if "cpu" not in device.type) + if not isinstance(devices, list): + raise ValueError(f"devices must be a list, but got {devices} of type {type(devices)}") + if any(isinstance(device, str) for device in devices): + torch_devices: List[torch.device] = [torch.device(device) for device in devices] + devices_to_use = torch_devices + else: + devices_to_use = devices + n_gpu = sum(1 for device in devices_to_use if "cpu" not in device.type) elif local_rank == -1: if torch.cuda.is_available(): if multi_gpu: diff --git a/haystack/nodes/answer_generator/transformers.py b/haystack/nodes/answer_generator/transformers.py index 5387c058ba..54293ce789 100644 --- a/haystack/nodes/answer_generator/transformers.py +++ b/haystack/nodes/answer_generator/transformers.py @@ -80,6 +80,7 @@ def __init__( use_gpu: bool = True, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, + devices: Optional[List[Union[str, torch.device]]] = None, ): """ Load a RAG model from Transformers along with passage_embedding_model. @@ -104,6 +105,11 @@ def __init__( `transformers-cli login` (stored in ~/.huggingface) will be used. Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained + + :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. + A list containing torch device objects and/or strings is supported (For example + [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices + parameter is not used and a single cpu device is used for inference. 
""" super().__init__(progress_bar=progress_bar) @@ -122,7 +128,12 @@ def __init__( self.top_k = top_k - self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False) + self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False) + if len(self.devices) > 1: + logger.warning( + f"Multiple devices are not supported in {self.__class__.__name__} inference, " + f"using the first device {self.devices[0]}." + ) self.tokenizer = RagTokenizer.from_pretrained(model_name_or_path, use_auth_token=use_auth_token) @@ -338,6 +349,7 @@ def __init__( use_gpu: bool = True, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, + devices: Optional[List[Union[str, torch.device]]] = None, ): """ :param model_name_or_path: a HF model name for auto-regressive language model like GPT2, XLNet, XLM, Bart, T5 etc @@ -357,6 +369,10 @@ def __init__( `transformers-cli login` (stored in ~/.huggingface) will be used. Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained + :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. + A list containing torch device objects and/or strings is supported (For example + [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices + parameter is not used and a single cpu device is used for inference. """ super().__init__(progress_bar=progress_bar) self.model_name_or_path = model_name_or_path @@ -370,7 +386,12 @@ def __init__( self.top_k = top_k - self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False) + self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False) + if len(self.devices) > 1: + logger.warning( + f"Multiple devices are not supported in {self.__class__.__name__} inference, " + f"using the first device {self.devices[0]}." + ) Seq2SeqGenerator._register_converters(model_name_or_path, input_converter) diff --git a/haystack/nodes/audio/_text_to_speech.py b/haystack/nodes/audio/_text_to_speech.py index d884f4940c..4e8d721136 100644 --- a/haystack/nodes/audio/_text_to_speech.py +++ b/haystack/nodes/audio/_text_to_speech.py @@ -1,4 +1,4 @@ -from typing import Union, Callable, Any, Optional, Dict +from typing import Union, Callable, Any, Optional, Dict, List import os import logging @@ -6,6 +6,7 @@ from pathlib import Path import numpy as np +import torch try: import soundfile as sf @@ -20,6 +21,8 @@ from haystack.errors import AudioNodeError from haystack.modeling.utils import initialize_device_settings +logger = logging.getLogger(__name__) + class TextToSpeech: """ @@ -33,17 +36,28 @@ def __init__( model_name_or_path: Union[str, Path], use_gpu: bool = True, transformers_params: Optional[Dict[str, Any]] = None, + devices: Optional[List[Union[str, torch.device]]] = None, ): """ :param model_name_or_path: The text to speech model, for example `espnet/kan-bayashi_ljspeech_vits`. :param use_gpu: Whether to use GPU (if available). Defaults to True. :param transformers_params: Parameters to pass over to the `Text2Speech.from_pretrained()` call. + :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. + A list containing torch device objects and/or strings is supported (For example + [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices + parameter is not used and a single cpu device is used for inference. 
""" super().__init__() - devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False) + resolved_devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False) + if len(resolved_devices) > 1: + logger.warning( + f"Multiple devices are not supported in {self.__class__.__name__} inference, " + f"using the first device {resolved_devices[0]}." + ) + self.model = _Text2SpeechModel.from_pretrained( - model_name_or_path, device=devices[0].type, **(transformers_params or {}) + model_name_or_path, device=resolved_devices[0].type, **(transformers_params or {}) ) def text_to_audio_file( diff --git a/haystack/nodes/audio/answer_to_speech.py b/haystack/nodes/audio/answer_to_speech.py index d24fae6c81..8b36241f5e 100644 --- a/haystack/nodes/audio/answer_to_speech.py +++ b/haystack/nodes/audio/answer_to_speech.py @@ -1,6 +1,8 @@ from typing import Union, Optional, List, Dict, Tuple, Any from pathlib import Path + +import torch from tqdm.auto import tqdm from haystack.nodes import BaseComponent @@ -23,6 +25,7 @@ def __init__( audio_params: Optional[Dict[str, Any]] = None, transformers_params: Optional[Dict[str, Any]] = None, progress_bar: bool = True, + devices: Optional[List[Union[str, torch.device]]] = None, ): """ Convert an input Answer into an audio file containing the answer and its context read out loud. @@ -49,9 +52,15 @@ def __init__( By default, the audio file gets the name from the MD5 sum of the input text. :param transformers_params: The parameters to pass over to the `Text2Speech.from_pretrained()` call. :param progress_bar: Whether to show a progress bar while converting the text to audio. + :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. + A list containing torch device objects and/or strings is supported (For example + [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices + parameter is not used and a single cpu device is used for inference. """ super().__init__() - self.converter = TextToSpeech(model_name_or_path=model_name_or_path, transformers_params=transformers_params) + self.converter = TextToSpeech( + model_name_or_path=model_name_or_path, transformers_params=transformers_params, devices=devices + ) self.generated_audio_dir = generated_audio_dir self.params: Dict[str, Any] = audio_params or {} self.progress_bar = progress_bar diff --git a/haystack/nodes/document_classifier/transformers.py b/haystack/nodes/document_classifier/transformers.py index c10bfc49ac..3c76da2dcb 100644 --- a/haystack/nodes/document_classifier/transformers.py +++ b/haystack/nodes/document_classifier/transformers.py @@ -2,6 +2,7 @@ import logging import itertools +import torch from tqdm.auto import tqdm from transformers import pipeline @@ -76,6 +77,7 @@ def __init__( classification_field: str = None, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, + devices: Optional[List[Union[str, torch.device]]] = None, ): """ Load a text classification model from Transformers. @@ -110,6 +112,10 @@ def __init__( `transformers-cli login` (stored in ~/.huggingface) will be used. Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained + :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. + A list containing torch device objects and/or strings is supported (For example + [torch.device('cuda:0'), "mps", "cuda:1"]). 
When specifying `use_gpu=False` the devices + parameter is not used and a single cpu device is used for inference. """ super().__init__() @@ -119,8 +125,12 @@ def __init__( f"zero-shot-classification to use labels." ) - devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False) - device = 0 if devices[0].type == "cuda" else -1 + resolved_devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False) + if len(resolved_devices) > 1: + logger.warning( + f"Multiple devices are not supported in {self.__class__.__name__} inference, " + f"using the first device {resolved_devices[0]}." + ) if tokenizer is None: tokenizer = model_name_or_path @@ -129,16 +139,16 @@ def __init__( task=task, model=model_name_or_path, tokenizer=tokenizer, - device=device, revision=model_version, use_auth_token=use_auth_token, + device=resolved_devices[0], ) elif task == "text-classification": self.model = pipeline( task=task, model=model_name_or_path, tokenizer=tokenizer, - device=device, + device=resolved_devices[0], revision=model_version, return_all_scores=return_all_scores, use_auth_token=use_auth_token, diff --git a/haystack/nodes/extractor/entity.py b/haystack/nodes/extractor/entity.py index 81bcc9f21b..1eb0033e29 100644 --- a/haystack/nodes/extractor/entity.py +++ b/haystack/nodes/extractor/entity.py @@ -1,5 +1,7 @@ +import logging from typing import List, Union, Dict, Optional, Tuple import itertools +import torch from transformers import AutoTokenizer, AutoModelForTokenClassification from transformers import pipeline @@ -10,6 +12,8 @@ from haystack.modeling.utils import initialize_device_settings from haystack.utils.torch_utils import ListDataset +logger = logging.getLogger(__name__) + class EntityExtractor(BaseComponent): """ @@ -29,6 +33,10 @@ class EntityExtractor(BaseComponent): `transformers-cli login` (stored in ~/.huggingface) will be used. Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained + :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. + A list containing torch device objects and/or strings is supported (For example + [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices + parameter is not used and a single cpu device is used for inference. """ outgoing_edges = 1 @@ -40,10 +48,11 @@ def __init__( batch_size: int = 16, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, + devices: Optional[List[Union[str, torch.device]]] = None, ): super().__init__() - self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False) + self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False) self.batch_size = batch_size self.progress_bar = progress_bar @@ -57,9 +66,14 @@ def __init__( model=token_classifier, tokenizer=tokenizer, aggregation_strategy="simple", - device=0 if self.devices[0].type == "cuda" else -1, + device=self.devices[0], use_auth_token=use_auth_token, ) + if len(self.devices) > 1: + logger.warning( + f"Multiple devices are not supported in {self.__class__.__name__} inference, " + f"using the first device {self.devices[0]}." 
+ ) def run(self, documents: Optional[Union[List[Document], List[dict]]] = None) -> Tuple[Dict, str]: # type: ignore """ diff --git a/haystack/nodes/label_generator/pseudo_label_generator.py b/haystack/nodes/label_generator/pseudo_label_generator.py index 8fa8f4d960..190414d140 100644 --- a/haystack/nodes/label_generator/pseudo_label_generator.py +++ b/haystack/nodes/label_generator/pseudo_label_generator.py @@ -1,13 +1,19 @@ +import logging import random from typing import Dict, Iterable, List, Optional, Tuple, Union +import torch from sentence_transformers import CrossEncoder from tqdm.auto import tqdm + +from haystack.modeling.utils import initialize_device_settings from haystack.nodes.base import BaseComponent from haystack.nodes.question_generator import QuestionGenerator from haystack.nodes.retriever.base import BaseRetriever from haystack.schema import Document +logger = logging.getLogger(__name__) + class PseudoLabelGenerator(BaseComponent): """ @@ -62,6 +68,8 @@ def __init__( batch_size: int = 16, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, + use_gpu: bool = True, + devices: Optional[List[Union[str, torch.device]]] = None, ): """ Loads the cross-encoder model and prepares PseudoLabelGenerator. @@ -88,6 +96,10 @@ def __init__( Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained :type use_auth_token: Union[str, bool] (optional) + :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit CrossEncoder inference to specific devices. + A list containing torch device objects and/or strings is supported (For example + [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices + parameter is not used and a single cpu device is used for inference. """ super().__init__() @@ -105,10 +117,18 @@ def __init__( ) else: raise ValueError("Provide either a QuestionGenerator or a non-empty list of questions/document pairs.") + self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False) + if len(self.devices) > 1: + logger.warning( + f"Multiple devices are not supported in {self.__class__.__name__} inference, " + f"using the first device {self.devices[0]}." 
+ ) self.retriever = retriever + self.cross_encoder = CrossEncoder( cross_encoder_model_name_or_path, + device=str(self.devices[0]), tokenizer_args={"use_auth_token": use_auth_token}, automodel_args={"use_auth_token": use_auth_token}, ) diff --git a/haystack/nodes/query_classifier/transformers.py b/haystack/nodes/query_classifier/transformers.py index b834f40211..50cc7d4991 100644 --- a/haystack/nodes/query_classifier/transformers.py +++ b/haystack/nodes/query_classifier/transformers.py @@ -2,10 +2,10 @@ from pathlib import Path from typing import Union, List, Optional, Dict, Any +import torch from transformers import pipeline from tqdm.auto import tqdm -# from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline from haystack.nodes.query_classifier.base import BaseQueryClassifier from haystack.modeling.utils import initialize_device_settings from haystack.utils.torch_utils import ListDataset @@ -71,6 +71,7 @@ def __init__( batch_size: int = 16, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, + devices: Optional[List[Union[str, torch.device]]] = None, ): """ :param model_name_or_path: Directory of a saved model or the name of a public model, for example 'shahrukhx01/bert-mini-finetune-question-detection'. @@ -89,16 +90,25 @@ def __init__( `transformers-cli login` (stored in ~/.huggingface) will be used. Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained + + :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. + A list containing torch device objects and/or strings is supported (For example + [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices + parameter is not used and a single cpu device is used for inference. """ super().__init__() - devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False) - device = 0 if devices[0].type == "cuda" else -1 + resolved_devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False) + if len(resolved_devices) > 1: + logger.warning( + f"Multiple devices are not supported in {self.__class__.__name__} inference, " + f"using the first device {resolved_devices[0]}." + ) self.model = pipeline( task=task, model=model_name_or_path, tokenizer=tokenizer, - device=device, + device=resolved_devices[0], revision=model_version, use_auth_token=use_auth_token, ) diff --git a/haystack/nodes/question_generator/question_generator.py b/haystack/nodes/question_generator/question_generator.py index d77f573dd4..1704eca71e 100644 --- a/haystack/nodes/question_generator/question_generator.py +++ b/haystack/nodes/question_generator/question_generator.py @@ -1,5 +1,7 @@ +import logging from typing import List, Union, Optional, Iterator import itertools +import torch from tqdm.auto import tqdm from transformers import AutoModelForSeq2SeqLM @@ -11,6 +13,8 @@ from haystack.nodes.preprocessor import PreProcessor from haystack.modeling.utils import initialize_device_settings +logger = logging.getLogger(__name__) + class QuestionGenerator(BaseComponent): """ @@ -43,6 +47,7 @@ def __init__( batch_size: int = 16, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, + devices: Optional[List[Union[str, torch.device]]] = None, ): """ Uses the valhalla/t5-base-e2e-qg model by default. 
This class supports any question generation model that is @@ -61,9 +66,19 @@ def __init__( `transformers-cli login` (stored in ~/.huggingface) will be used. Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained + :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. + A list containing torch device objects and/or strings is supported (For example + [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices + parameter is not used and a single cpu device is used for inference. + """ super().__init__() - self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False) + self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False) + if len(self.devices) > 1: + logger.warning( + f"Multiple devices are not supported in {self.__class__.__name__} inference, " + f"using the first device {self.devices[0]}." + ) self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path, use_auth_token=use_auth_token) self.model.to(str(self.devices[0])) self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_auth_token=use_auth_token) diff --git a/haystack/nodes/ranker/sentence_transformers.py b/haystack/nodes/ranker/sentence_transformers.py index 3c1a4ff48f..f86cd93b57 100644 --- a/haystack/nodes/ranker/sentence_transformers.py +++ b/haystack/nodes/ranker/sentence_transformers.py @@ -58,10 +58,6 @@ def __init__( :param model_version: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. :param top_k: The maximum number of documents to return :param use_gpu: Whether to use all available GPUs or the CPU. Falls back on CPU if no GPU is available. - :param devices: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones - The strings will be converted into pytorch devices, so use the string notation described here: - https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device - (e.g. ["cuda:0"]). :param batch_size: Number of documents to process at a time. :param scale_score: The raw predictions will be transformed using a Sigmoid activation function in case the model only predicts a single label. For multi-label predictions, no scaling is applied. Set this @@ -72,15 +68,17 @@ def __init__( `transformers-cli login` (stored in ~/.huggingface) will be used. Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained + :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. + A list containing torch device objects and/or strings is supported (For example + [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices + parameter is not used and a single cpu device is used for inference. 
""" super().__init__() self.top_k = top_k - if devices is not None: - self.devices = [torch.device(device) for device in devices] - else: - self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=True) + self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=True) + self.progress_bar = progress_bar self.transformer_model = AutoModelForSequenceClassification.from_pretrained( pretrained_model_name_or_path=model_name_or_path, revision=model_version, use_auth_token=use_auth_token diff --git a/haystack/nodes/reader/farm.py b/haystack/nodes/reader/farm.py index fbdb7885f2..2d50cc2f1c 100644 --- a/haystack/nodes/reader/farm.py +++ b/haystack/nodes/reader/farm.py @@ -51,7 +51,7 @@ def __init__( context_window_size: int = 150, batch_size: int = 50, use_gpu: bool = True, - devices: List[torch.device] = [], + devices: Optional[List[Union[str, torch.device]]] = None, no_ans_boost: float = 0.0, return_no_answer: bool = False, top_k: int = 10, @@ -81,8 +81,10 @@ def __init__( Memory consumption is much lower in inference mode. Recommendation: Increase the batch size to a value so only a single batch is used. :param use_gpu: Whether to use GPUs or the CPU. Falls back on CPU if no GPU is available. - :param devices: List of GPU devices to limit inference to certain GPUs and not use all available ones (e.g. [torch.device('cuda:0')]). - Unused if `use_gpu` is False. + :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. + A list containing torch device objects and/or strings is supported (For example + [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices + parameter is not used and a single cpu device is used for inference. :param no_ans_boost: How much the no_answer logit is boosted/increased. If set to 0 (default), the no_answer logit is not changed. If a negative number, there is a lower chance of "no_answer" being predicted. @@ -382,8 +384,10 @@ def train( :param dev_split: Instead of specifying a dev_filename, you can also specify a ratio (e.g. 0.1) here that gets split off from training data for eval. :param use_gpu: Whether to use GPU (if available) - :param devices: List of GPU devices to limit inference to certain GPUs and not use all available ones (e.g. [torch.device('cuda:0')]). - Unused if `use_gpu` is False. + :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. + A list containing torch device objects and/or strings is supported (For example + [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices + parameter is not used and a single cpu device is used for inference. :param batch_size: Number of samples the model receives in one batch for training :param n_epochs: Number of iterations on the whole training data set :param learning_rate: Learning rate of the optimizer @@ -497,8 +501,10 @@ def distil_prediction_layer_from( :param dev_split: Instead of specifying a dev_filename, you can also specify a ratio (e.g. 0.1) here that gets split off from training data for eval. :param use_gpu: Whether to use GPU (if available) - :param devices: List of GPU devices to limit inference to certain GPUs and not use all available ones (e.g. [torch.device('cuda:0')]). - Unused if `use_gpu` is False. + :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. 
+ A list containing torch device objects and/or strings is supported (For example + [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices + parameter is not used and a single cpu device is used for inference. :param student_batch_size: Number of samples the student model receives in one batch for training :param student_batch_size: Number of samples the teacher model receives in one batch for distillation :param n_epochs: Number of iterations on the whole training data set @@ -621,8 +627,10 @@ def distil_intermediate_layers_from( :param dev_split: Instead of specifying a dev_filename, you can also specify a ratio (e.g. 0.1) here that gets split off from training data for eval. :param use_gpu: Whether to use GPU (if available) - :param devices: List of GPU devices to limit inference to certain GPUs and not use all available ones (e.g. [torch.device('cuda:0')]). - Unused if `use_gpu` is False. + :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. + A list containing torch device objects and/or strings is supported (For example + [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices + parameter is not used and a single cpu device is used for inference. :param student_batch_size: Number of samples the student model receives in one batch for training :param student_batch_size: Number of samples the teacher model receives in one batch for distillation :param n_epochs: Number of iterations on the whole training data set diff --git a/haystack/nodes/reader/table.py b/haystack/nodes/reader/table.py index 28c3d52fee..e20cc0bde9 100644 --- a/haystack/nodes/reader/table.py +++ b/haystack/nodes/reader/table.py @@ -73,6 +73,7 @@ def __init__( return_no_answer: bool = False, max_seq_len: int = 256, use_auth_token: Optional[Union[str, bool]] = None, + devices: Optional[List[Union[str, torch.device]]] = None, ): """ Load a TableQA model from Transformers. @@ -110,6 +111,10 @@ def __init__( `transformers-cli login` (stored in ~/.huggingface) will be used. Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained + :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. + A list containing torch device objects and/or strings is supported (For example + [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices + parameter is not used and a single cpu device is used for inference. """ if not torch_scatter_installed: raise ImportError( @@ -122,8 +127,14 @@ def __init__( ) super().__init__() - self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False) + self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False) config = TapasConfig.from_pretrained(model_name_or_path, use_auth_token=use_auth_token) + if len(self.devices) > 1: + logger.warning( + f"Multiple devices are not supported in {self.__class__.__name__} inference, " + f"using the first device {self.devices[0]}." 
+ ) + if config.architectures[0] == "TapasForScoredQA": self.model = self.TapasForScoredQA.from_pretrained( model_name_or_path, revision=model_version, use_auth_token=use_auth_token @@ -583,6 +594,12 @@ def __init__( super().__init__() self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False) + if len(self.devices) > 1: + logger.warning( + f"Multiple devices are not supported in {self.__class__.__name__} inference, " + f"using the first device {self.devices[0]}." + ) + self.row_model = AutoModelForSequenceClassification.from_pretrained( row_model_name_or_path, revision=row_model_version, use_auth_token=use_auth_token ) diff --git a/haystack/nodes/reader/transformers.py b/haystack/nodes/reader/transformers.py index 6d78a16b55..cc842be85b 100644 --- a/haystack/nodes/reader/transformers.py +++ b/haystack/nodes/reader/transformers.py @@ -3,6 +3,7 @@ import logging import itertools +import torch from transformers import pipeline from transformers.data.processors.squad import SquadExample @@ -37,6 +38,7 @@ def __init__( doc_stride: int = 128, batch_size: int = 16, use_auth_token: Optional[Union[str, bool]] = None, + devices: Optional[List[Union[str, torch.device]]] = None, ): """ Load a QA model from Transformers. @@ -72,16 +74,27 @@ def __init__( `transformers-cli login` (stored in ~/.huggingface) will be used. Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained + + :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. + A list containing torch device objects and/or strings is supported (For example + [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices + parameter is not used and a single cpu device is used for inference. """ super().__init__() - self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False) - device = 0 if self.devices[0].type == "cuda" else -1 + self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False) + + if len(self.devices) > 1: + logger.warning( + f"Multiple devices are not supported in {self.__class__.__name__} inference, " + f"using the first device {self.devices[0]}." + ) + self.model = pipeline( "question-answering", model=model_name_or_path, tokenizer=tokenizer, - device=device, + device=self.devices[0], revision=model_version, use_auth_token=use_auth_token, ) diff --git a/haystack/nodes/retriever/dense.py b/haystack/nodes/retriever/dense.py index 3009a0f927..3ae3829d42 100644 --- a/haystack/nodes/retriever/dense.py +++ b/haystack/nodes/retriever/dense.py @@ -113,10 +113,11 @@ def __init__( Increase if errors like "encoded data exceeds max_size ..." come up :param progress_bar: Whether to show a tqdm progress bar or not. Can be helpful to disable in production deployments to keep the logs clean. - :param devices: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones - These strings will be converted into pytorch devices, so use the string notation described here: - https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device - (e.g. ["cuda:0"]). Note: as multi-GPU training is currently not implemented for DPR, training + :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. 
+ A list containing torch device objects and/or strings is supported (For example + [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices + parameter is not used and a single cpu device is used for inference. + Note: as multi-GPU training is currently not implemented for DPR, training will only use the first device provided in this list. :param use_auth_token: The API token used to download private models from Huggingface. If this parameter is set to `True`, then the token generated when running @@ -129,13 +130,10 @@ def __init__( """ super().__init__() - if devices is not None: - self.devices = [torch.device(device) for device in devices] - else: - self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=True) + self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=True) if batch_size < len(self.devices): - logger.warning("Batch size is less than the number of devices. All gpus will not be utilized.") + logger.warning("Batch size is less than the number of devices.All gpus will not be utilized.") self.document_store = document_store self.batch_size = batch_size @@ -820,10 +818,11 @@ def __init__( Increase if errors like "encoded data exceeds max_size ..." come up :param progress_bar: Whether to show a tqdm progress bar or not. Can be helpful to disable in production deployments to keep the logs clean. - :param devices: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones - These strings will be converted into pytorch devices, so use the string notation described here: - https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device - (e.g. ["cuda:0"]). Note: as multi-GPU training is currently not implemented for TableTextRetriever, + :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. + A list containing torch device objects and/or strings is supported (For example + [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices + parameter is not used and a single cpu device is used for inference. + Note: as multi-GPU training is currently not implemented for TableTextRetriever, training will only use the first device provided in this list. :param use_auth_token: The API token used to download private models from Huggingface. If this parameter is set to `True`, then the token generated when running @@ -837,13 +836,10 @@ def __init__( """ super().__init__() - if devices is not None: - self.devices = [torch.device(device) for device in devices] - else: - self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=True) + self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=True) if batch_size < len(self.devices): - logger.warning("Batch size is less than the number of devices. All gpus will not be utilized.") + logger.warning("Batch size is less than the number of devices.All gpus will not be utilized.") self.document_store = document_store self.batch_size = batch_size @@ -1489,10 +1485,11 @@ def __init__( Default: -1 (very last layer). :param top_k: How many documents to return per query. :param progress_bar: If true displays progress bar during embedding. 
- :param devices: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones - These strings will be converted into pytorch devices, so use the string notation described here: - https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device - (e.g. ["cuda:0"]). Note: As multi-GPU training is currently not implemented for EmbeddingRetriever, + :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. + A list containing torch device objects and/or strings is supported (For example + [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices + parameter is not used and a single cpu device is used for inference. + Note: As multi-GPU training is currently not implemented for EmbeddingRetriever, training will only use the first device provided in this list. :param use_auth_token: The API token used to download private models from Huggingface. If this parameter is set to `True`, then the token generated when running @@ -1510,13 +1507,10 @@ def __init__( """ super().__init__() - if devices is not None: - self.devices = [torch.device(device) for device in devices] - else: - self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=True) + self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=True) if batch_size < len(self.devices): - logger.warning("Batch size is less than the number of devices. All gpus will not be utilized.") + logger.warning("Batch size is less than the number of devices.All gpus will not be utilized.") self.document_store = document_store self.embedding_model = embedding_model @@ -1965,10 +1959,11 @@ def __init__( Default: -1 (very last layer). :param top_k: How many documents to return per query. :param progress_bar: If true displays progress bar during embedding. - :param devices: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones - These strings will be converted into pytorch devices, so use the string notation described here: - https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device - (e.g. ["cuda:0"]). Note: As multi-GPU training is currently not implemented for EmbeddingRetriever, + :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. + A list containing torch device objects and/or strings is supported (For example + [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices + parameter is not used and a single cpu device is used for inference. + Note: As multi-GPU training is currently not implemented for EmbeddingRetriever, training will only use the first device provided in this list. :param use_auth_token: The API token used to download private models from Huggingface. 
If this parameter is set to `True`, then the token generated when running diff --git a/haystack/nodes/summarizer/transformers.py b/haystack/nodes/summarizer/transformers.py index 8e79ef34bb..9fc3d8068d 100644 --- a/haystack/nodes/summarizer/transformers.py +++ b/haystack/nodes/summarizer/transformers.py @@ -3,6 +3,7 @@ import logging +import torch from tqdm.auto import tqdm from transformers import pipeline from transformers.models.auto.modeling_auto import AutoModelForSeq2SeqLM @@ -66,6 +67,7 @@ def __init__( batch_size: int = 16, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, + devices: Optional[List[Union[str, torch.device]]] = None, ): """ Load a Summarization model from Transformers. @@ -94,11 +96,20 @@ def __init__( `transformers-cli login` (stored in ~/.huggingface) will be used. Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained + :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. + A list containing torch device objects and/or strings is supported (For example + [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices + parameter is not used and a single cpu device is used for inference. """ super().__init__() - self.devices, _ = initialize_device_settings(use_cuda=use_gpu) - device = 0 if self.devices[0].type == "cuda" else -1 + self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False) + if len(self.devices) > 1: + logger.warning( + f"Multiple devices are not supported in {self.__class__.__name__} inference, " + f"using the first device {self.devices[0]}." + ) + # TODO AutoModelForSeq2SeqLM is only necessary with transformers==4.1.1, with newer versions use the pipeline directly if tokenizer is None: tokenizer = model_name_or_path @@ -106,7 +117,7 @@ def __init__( pretrained_model_name_or_path=model_name_or_path, revision=model_version, use_auth_token=use_auth_token ) self.summarizer = pipeline( - "summarization", model=model, tokenizer=tokenizer, device=device, use_auth_token=use_auth_token + "summarization", model=model, tokenizer=tokenizer, device=self.devices[0], use_auth_token=use_auth_token ) self.max_length = max_length self.min_length = min_length diff --git a/haystack/nodes/translator/transformers.py b/haystack/nodes/translator/transformers.py index 464c859a9b..69a9f3aaa8 100644 --- a/haystack/nodes/translator/transformers.py +++ b/haystack/nodes/translator/transformers.py @@ -2,6 +2,7 @@ from copy import deepcopy from typing import Any, Dict, List, Optional, Union +import torch from tqdm.auto import tqdm from transformers import AutoModelForSeq2SeqLM, AutoTokenizer # type: ignore @@ -44,6 +45,7 @@ def __init__( use_gpu: bool = True, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, + devices: Optional[List[Union[str, torch.device]]] = None, ): """Initialize the translator with a model that fits your targeted languages. While we support all seq2seq models from Hugging Face's model hub, we recommend using the OPUS models from Helsinki NLP. They provide plenty @@ -70,10 +72,21 @@ def __init__( `transformers-cli login` (stored in ~/.huggingface) will be used. Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained + + :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. 
+ A list containing torch device objects and/or strings is supported (For example + [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices + parameter is not used and a single cpu device is used for inference. """ super().__init__() - self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False) + self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False) + if len(self.devices) > 1: + logger.warning( + f"Multiple devices are not supported in {self.__class__.__name__} inference, " + f"using the first device {self.devices[0]}." + ) + self.max_seq_len = max_seq_len self.clean_up_tokenization_spaces = clean_up_tokenization_spaces self.progress_bar = progress_bar
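
Taken together, the hunks above apply one pattern: each touched node forwards the new optional `devices` argument to `initialize_device_settings(devices=devices, use_cuda=use_gpu, ...)`, nodes that cannot run on more than one device log a warning and fall back to the first resolved device, and `transformers` pipelines now receive that `torch.device` directly instead of the old `0`/`-1` integer convention. The following is a minimal usage sketch, not part of the patch: it assumes the public `haystack.nodes` import path and a machine with at least one CUDA device, and the two node classes are chosen only because both gain the parameter in this diff.

```python
import torch
from haystack.nodes import EntityExtractor, TransformersSummarizer

# Strings and torch.device objects can be mixed in the devices list; nodes that
# do not support multi-device inference warn and use only the first entry.
extractor = EntityExtractor(devices=["cuda:0"])
summarizer = TransformersSummarizer(devices=[torch.device("cuda:0")])

# With use_gpu=False the devices argument is ignored and a single CPU device
# is used for inference.
cpu_extractor = EntityExtractor(use_gpu=False, devices=["cuda:0"])
```

Note that the dense retrievers keep `multi_gpu=True` when resolving devices, while the pipeline-backed nodes resolve to a single device and warn if more than one is passed.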