diff --git a/docs/source/ar/_toctree.yml b/docs/source/ar/_toctree.yml index 67564c43556d..d9523eaf5da5 100644 --- a/docs/source/ar/_toctree.yml +++ b/docs/source/ar/_toctree.yml @@ -127,16 +127,16 @@ title: التصدير إلى ONNX - local: tflite title: التصدير إلى TFLite -# - local: torchscript -# title: التصدير إلى TorchScript + - local: torchscript + title: التصدير إلى TorchScript # - local: benchmarks # title: المعايير # - local: notebooks # title: دفاتر الملاحظات مع الأمثلة # - local: community # title: موارد المجتمع -# - local: troubleshooting -# title: استكشاف الأخطاء وإصلاحها + - local: troubleshooting + title: استكشاف الأخطاء وإصلاحها - local: gguf title: التوافق مع ملفات GGUF title: أدلة المطورين diff --git a/docs/source/ar/model_sharing.md b/docs/source/ar/model_sharing.md index 620261a0c58a..b802eb3ef038 100644 --- a/docs/source/ar/model_sharing.md +++ b/docs/source/ar/model_sharing.md @@ -28,7 +28,7 @@ picture-in-picture" allowfullscreen> ```py >>> model = AutoModel.from_pretrained( -... "julien-c/EsperBERTo-small", revision="v2.0.1" # اسم العلامة، أو اسم الفرع، أو تجزئة الالتزام +... "julien-c/EsperBERTo-small", revision="4c77982" # اسم العلامة، أو اسم الفرع، أو تجزئة الالتزام ... ) ``` diff --git a/docs/source/ar/torchscript.md b/docs/source/ar/torchscript.md new file mode 100644 index 000000000000..bf0bc0dde04b --- /dev/null +++ b/docs/source/ar/torchscript.md @@ -0,0 +1,154 @@ +# التصدير إلى TorchScript + + + +هذه هي بداية تجاربنا مع TorchScript ولا زلنا نستكشف قدراته مع نماذج المدخلات المتغيرة الحجم. إنه مجال اهتمامنا وسنعمق تحليلنا في الإصدارات القادمة، مع المزيد من الأمثلة البرمجية، وتنفيذ أكثر مرونة، ومقاييس مقارنة بين الأكواد القائمة على Python مع أكواد TorchScript المُجمّعة. + + + +وفقًا لـ [وثائق TorchScript](https://pytorch.org/docs/stable/jit.html): + +> TorchScript هي طريقة لإنشاء نماذج قابلة للتسلسل والتحسين من تعليمات PyTorch البرمجية. + +هناك وحدتان من PyTorch، [JIT and TRACE](https://pytorch.org/docs/stable/jit.html)، تتيحان للمطورين تصدير نماذجهم لإعادة استخدامها في برامج أخرى مثل برامج C++ المُحسّنة للأداء. + +نقدم واجهة تتيح لك تصدير نماذج 🤗 Transformers إلى TorchScript بحيث يمكن إعادة استخدامها في بيئة مختلفة عن برامج Python القائمة إلى PyTorch. هنا نشرح كيفية تصدير نماذجنا واستخدامها باستخدام TorchScript. + +يتطلب تصدير نموذج أمرين: + +- تهيئة مثيل للنموذج باستخدام علامة `torchscript` +- تمرير مُدخلات وهمية (dummy inputs) خلال النموذج + +تنطوي هذه الضرورات على عدة أمور يجب على المطورين توخي الحذر بشأنها كما هو مفصل أدناه. + +## علامة TorchScript والأوزان المرتبطة + +علامة `torchscript` ضرورية لأن معظم نماذج اللغة 🤗 Transformers لها أوزان مرتبطة بين طبقة `Embedding` وطبقة `Decoding`. لا يسمح لك TorchScript بتصدير النماذج ذات الأوزان المرتبطة، لذلك من الضروري فصل الأوزان ونسخها مسبقًا. + +النماذج المُهيأة باستخدام علامة `torchscript` لها طبقة `Embedding` وطبقة`Decoding` منفصلتين، مما يعني أنه لا ينبغي تدريبها لاحقًا. سيؤدي التدريب إلى عدم تزامن الطبقتين، مما يؤدي إلى نتائج غير متوقعة. + +هذا لا ينطبق على النماذج التي لا تحتوي على رأس نموذج اللغة، حيث لا تملك أوزانًا مرتبطة. يمكن تصدير هذه النماذج بأمان دون علامة `torchscript`. + +## المدخلات الوهمية والأطوال القياسية + +تُستخدم المُدخلات الوهمية لتمرير أمامي خلال النموذج. أثناء انتشار قيم المُدخلات عبر الطبقات، يتتبع PyTorch العمليات المختلفة التي يتم تنفيذها على كل مصفوفة(tensor). ثم يتم استخدام هذه العمليات المُسجلة بعد ذلك لإنشاء *أثر* النموذج. + +يتم إنشاء التتبع بالنسبة لأبعاد المُدخلات. وبالتالي، فهو مُقيّد بأبعاد المُدخلات الوهمية، ولن يعمل لأي طول تسلسل أو حجم دفعة مختلف. 
عند المحاولة بحجم مختلف، يتم رفع الخطأ التالي: + +``` +`The expanded size of the tensor (3) must match the existing size (7) at non-singleton dimension 2` +``` + +نوصي بتتبع النموذج باستخدام حجم مُدخلات وهمية لا يقل عن أكبر مُدخل سيتم تقديمه للنموذج أثناء الاستدلال. يمكن أن تساعد الحشوة(padding) في ملء القيم المفقودة. ومع ذلك، نظرًا لتتبع النموذج بحجم مُدخل أكبر، ستكون أبعاد المصفوفة ستكون كبيرة أيضًا، مما يؤدي عنه المزيد من الحسابات. + +انتبه إلى إجمالي عدد العمليات المُنفذة على كل مُدخل وتابع الأداء عن كثب عند تصدير نماذج متغيرة طول التسلسل. + +## استخدام TorchScript في Python + +يوضح هذا القسم كيفية حفظ النماذج وتحميلها، بالإضافة إلى كيفية استخدام التتبع للاستدلال. + +### حفظ نموذج + +لتصدير `BertModel` باستخدام TorchScript، قم بتهيئة ـ `BertModel` من فئة `BertConfig` ثم احفظه على القرص تحت اسم الملف `traced_bert.pt`: + +```python +from transformers import BertModel, BertTokenizer, BertConfig +import torch + +enc = BertTokenizer.from_pretrained("google-bert/bert-base-uncased") + +# Tokenizing input text +text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" +tokenized_text = enc.tokenize(text) + +# Masking one of the input tokens +masked_index = 8 +tokenized_text[masked_index] = "[MASK]" +indexed_tokens = enc.convert_tokens_to_ids(tokenized_text) +segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1] + +# Creating a dummy input +tokens_tensor = torch.tensor([indexed_tokens]) +segments_tensors = torch.tensor([segments_ids]) +dummy_input = [tokens_tensor, segments_tensors] + +# Initializing the model with the torchscript flag +# Flag set to True even though it is not necessary as this model does not have an LM Head. +config = BertConfig( + vocab_size_or_config_json_file=32000, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + torchscript=True, +) + +# Instantiating the model +model = BertModel(config) + +# The model needs to be in evaluation mode +model.eval() + +# If you are instantiating the model with *from_pretrained* you can also easily set the TorchScript flag +model = BertModel.from_pretrained("google-bert/bert-base-uncased", torchscript=True) + +# Creating the trace +traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors]) +torch.jit.save(traced_model, "traced_bert.pt") +``` + +### تحميل نموذج + +يمكنك الآن تحميل `BertModel` المُحفظ سابقًا، `traced_bert.pt`، من القرص واستخدامه على `dummy_input` المُهيأ سابقًا: + +```python +loaded_model = torch.jit.load("traced_bert.pt") +loaded_model.eval() + +all_encoder_layers, pooled_output = loaded_model(*dummy_input) +``` + +### استخدام نموذج مُتتبع للاستدلال + +استخدم النموذج المُتتبع للاستدلال باستخدام أسلوب `__call__` الخاص به: + +```python +traced_model(tokens_tensor, segments_tensors) +``` + +## نشر نماذج Hugging Face TorchScript على AWS باستخدام Neuron SDK + +قدمت AWS عائلة [Amazon EC2 Inf1](https://aws.amazon.com/ec2/instance-types/inf1/) من اﻷجهزة لخفض التكلفة وأداء التعلم الآلي عالي الأداء في البيئة السحابية. تعمل أجهزة Inf1 بواسطة شريحة Inferentia من AWS، وهي مُسرّع أجهزة مُخصص، متخصص في أعباء عمل الاستدلال للتعلم العميق. [AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/#) هي SDK لـ Inferentia التي تدعم تتبع نماذج المحولات وتحسينها للنشر على Inf1. توفر Neuron SDK ما يلي: + +1. واجهة برمجة تطبيقات سهلة الاستخدام مع تغيير سطر واحد من التعليمات البرمجية لتتبع نموذج TorchScript وتحسينه للاستدلال في البيئة السحابية. +2. 
تحسينات الأداء الجاهزة للاستخدام [تحسين التكلفة والأداء](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/benchmark/>). +3. دعم نماذج Hugging Face المحولات المبنية باستخدام إما [PyTorch](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/bert_tutorial/tutorial_pretrained_bert.html) أو [TensorFlow](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/tensorflow/huggingface_bert/huggingface_bert.html). + +### الآثار المترتبة + +تعمل نماذج المحولات المستندة إلى بنية [BERT (تمثيلات الترميز ثنائية الاتجاه من المحولات)](https://huggingface.co/docs/transformers/main/model_doc/bert) أو متغيراتها مثل [distilBERT](https://huggingface.co/docs/transformers/main/model_doc/distilbert) و [roBERTa](https://huggingface.co/docs/transformers/main/model_doc/roberta) بشكل أفضل على Inf1 للمهام غير التوليدية مثل الإجابة على الأسئلة الاستخراجية، وتصنيف التسلسلات، وتصنيف الرموز (tokens). ومع ذلك، يمكن تكييف مهام توليد النصوص للعمل على Inf1 وفقًا لهذا [برنامج تعليمي AWS Neuron MarianMT](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/transformers-marianmt.html). يمكن العثور على مزيد من المعلومات حول النماذج التي يمكن تحويلها جاهزة على Inferentia في قسم [ملاءمة بنية النموذج](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/models/models-inferentia.html#models-inferentia) من وثائق Neuron. + +### التبعيات (Dependencies) + +يتطلب استخدام AWS Neuron لتحويل النماذج [بيئة SDK Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/pytorch-neuron/index.html#installation-guide) والتي تأتي مسبقًا على [AMI للتعلم العميق من AWS](https://docs.aws.amazon.com/dlami/latest/devguide/tutorial-inferentia-launching.html). + +### تحويل نموذج لـ AWS Neuron + +قم بتحويل نموذج لـ AWS NEURON باستخدام نفس التعليمات البرمجية من [استخدام TorchScript في Python](torchscript#using-torchscript-in-python) لتتبع `BertModel`. قم باستيراد امتداد إطار عمل `torch.neuron` للوصول إلى مكونات Neuron SDK من خلال واجهة برمجة تطبيقات Python: + +```python +from transformers import BertModel, BertTokenizer, BertConfig +import torch +import torch.neuron +``` + +كل ما عليك فعله هو تعديل السطر التالي: + +```diff +- torch.jit.trace(model, [tokens_tensor, segments_tensors]) ++ torch.neuron.trace(model, [token_tensor, segments_tensors]) +``` + +يتيح ذلك لـ Neuron SDK تتبع النموذج وتحسينه لمثيلات Inf1. + +لمعرفة المزيد حول ميزات AWS Neuron SDK والأدوات ودروس البرامج التعليمية والتحديثات الأخيرة، يرجى الاطلاع على [وثائق AWS NeuronSDK](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html). diff --git a/docs/source/ar/troubleshooting.md b/docs/source/ar/troubleshooting.md new file mode 100644 index 000000000000..7874a9fad133 --- /dev/null +++ b/docs/source/ar/troubleshooting.md @@ -0,0 +1,171 @@ +# استكشاف الأخطاء وإصلاحها + +تحدث الأخطاء أحيانًا، لكننا هنا للمساعدة! يغطي هذا الدليل بعض المشكلات الأكثر شيوعًا التي واجهناها وكيفية حلها. مع ذلك، لا يُقصد بهذا الدليل أن يكون مجموعة شاملة لكل مشكلات 🤗 Transformers. لمزيد من المساعدة في استكشاف مشكلتك وإصلاحها، جرب ما يلي: + + + +1. اطلب المساعدة على [المنتديات](https://discuss.huggingface.co/). هناك فئات محددة يمكنك نشر سؤالك فيها، مثل [المبتدئين](https://discuss.huggingface.co/c/beginners/5) أو [🤗 Transformers](https://discuss.huggingface.co/c/transformers/9). تأكد من كتابة منشور جيد وواضح على المنتدى مع بعض التعليمات البرمجية القابلة للتكرار لزيادة احتمالية حل مشكلتك! + + +2. 
قم بإنشاء [مشكلة](https://github.com/huggingface/transformers/issues/new/choose) في مستودع 🤗 Transformers إذا كانت هناك مشكلة متعلقة بالمكتبة. حاول تضمين أكبر قدر ممكن من المعلومات التي تصف المشكلة لمساعدتنا في معرفة ما هو الخطأ وكيفية إصلاحه. + +3. تحقق من دليل [الترحيل](migration) إذا كنت تستخدم إصدارًا أقدم من مكتبة 🤗 Transformers حيث تم إدخال بعض التغييرات المهمة بين الإصدارات. + + +للحصول على مزيد من التفاصيل حول استكشاف الأخطاء وإصلاحها والحصول على المساعدة، راجع [الفصل 8](https://huggingface.co/course/chapter8/1?fw=pt) من دورة Hugging Face. + +## بيئات جدار الحماية + +بعض وحدات معالجة الرسومات (GPU) على السحابة وإعدادات الشبكة الداخلية محمية بجدار حماية من الاتصالات الخارجية، مما يؤدي إلى حدوث خطأ في الاتصال. عندما تحاول تعليمات البرنامج النصي تنزيل أوزان النموذج أو مجموعات البيانات، سيتوقف التنزيل ثم ينتهي بخطأ مثل: + +``` +ValueError: Connection error, and we cannot find the requested files in the cached path. +Please try again or make sure your Internet connection is on. +``` + +في هذه الحالة، يجب محاولة تشغيل 🤗 Transformers في [وضع عدم الاتصال](installation#offline-mode) لتجنب خطأ الاتصال. + +## CUDA نفاد الذاكرة + +يمكن أن يكون تدريب النماذج الكبيرة التي تحتوي على ملايين المعلمات أمرًا صعبًا بدون الأجهزة المناسبة. أحد الأخطاء الشائعة التي قد تواجهها عند نفاد ذاكرة GPU هو: + +``` +CUDA out of memory. Tried to allocate 256.00 MiB (GPU 0; 11.17 GiB total capacity; 9.70 GiB already allocated; 179.81 MiB free; 9.85 GiB reserved in total by PyTorch) +``` + +فيما يلي بعض الحلول المحتملة التي يمكنك تجربتها لتقليل استخدام الذاكرة: + +- قلل من قيمة [`per_device_train_batch_size`](main_classes/trainer#transformers.TrainingArguments.per_device_train_batch_size) في [`TrainingArguments`]. + +- حاول استخدام [`gradient_accumulation_steps`](main_classes/trainer#transformers.TrainingArguments.gradient_accumulation_steps) في [`TrainingArguments`] لزيادة حجم الدُفعة بشكل فعال. + + +راجع دليل [الأداء](performance) لمزيد من التفاصيل حول تقنيات توفير الذاكرة. + + +## عدم القدرة على تحميل نموذج TensorFlow محفوظ + +تقوم طريقة TensorFlow [model.save](https://www.tensorflow.org/tutorials/keras/save_and_load#save_the_entire_model) بحفظ النموذج بالكامل - الهندسة المعمارية، الأوزان، تكوين التدريب - في ملف واحد. ومع ذلك، عند تحميل ملف النموذج مرة أخرى، قد تواجه خطأ لأن مكتبة 🤗 Transformers قد لا تقوم بتحميل جميع الكائنات المتعلقة بـ TensorFlow في ملف النموذج. 
لتجنب المشكلات المتعلقة بحفظ وتحميل نماذج TensorFlow، نوصي بما يلي: + +- احفظ أوزان النموذج كملف `h5` باستخدام [`model.save_weights`](https://www.tensorflow.org/tutorials/keras/save_and_load#save_the_entire_model) ثم أعد تحميل النموذج باستخدام [`~TFPreTrainedModel.from_pretrained`]: + +```python +>>> from transformers import TFPreTrainedModel +>>> from tensorflow import keras + +>>> model.save_weights("some_folder/tf_model.h5") +>>> model = TFPreTrainedModel.from_pretrained("some_folder") +``` + +- احفظ النموذج باستخدام [`~TFPretrainedModel.save_pretrained`] وقم بتحميله مرة أخرى باستخدام [`~TFPreTrainedModel.from_pretrained`]: + +```python +>>> from transformers import TFPreTrainedModel + +>>> model.save_pretrained("path_to/model") +>>> model = TFPreTrainedModel.from_pretrained("path_to/model") +``` + +## ImportError + +خطأ شائع آخر قد تواجهه، خاصة إذا كان نموذجًا تم إصداره حديثًا، هو `ImportError`: + +``` +ImportError: cannot import name 'ImageGPTImageProcessor' from 'transformers' (unknown location) +``` + +بالنسبة لأنواع الأخطاء هذه، تحقق من أن لديك أحدث إصدار من مكتبة Hugging Face Transformers مثبتًا للوصول إلى أحدث النماذج: + +```bash +pip install transformers --upgrade +``` + +## خطأ CUDA: تم تشغيل التأكيد على جانب الجهاز + +في بعض الأحيان، قد تواجه خطأ CUDA عامًا حول خطأ في كود الجهاز. + +``` +RuntimeError: CUDA error: device-side assert triggered +``` + +يجب عليك محاولة تشغيل الكود على وحدة المعالجة المركزية (CPU) أولاً للحصول على رسالة خطأ أكثر دقة. أضف متغير البيئة التالي في بداية كودك للتبديل إلى وحدة المعالجة المركزية: + +```python +>>> import os + +>>> os.environ["CUDA_VISIBLE_DEVICES"] = "" +``` + +الخيار الآخر هو الحصول على تتبع مكدس أفضل من GPU. أضف متغير البيئة التالي في بداية كودك للحصول على تتبع المكدس للإشارة إلى مصدر الخطأ: + +```python +>>> import os + +>>> os.environ["CUDA_LAUNCH_BLOCKING"] = "1" +``` + +## إخراج غير صحيح عند عدم إخفاء رموز الحشو + +في بعض الحالات، قد يكون `hidden_state` غير صحيحة إذا تضمنت `input_ids` رموز حشو. ولإثبات ذلك، قم بتحميل نموذج ومجزىء لغوى. يمكنك الوصول إلى `pad_token_id` للنموذج لمعرفة قيمته. قد تكون `pad_token_id` `None` لبعض النماذج، ولكن يمكنك دائمًا تعيينها يدويًا. + +```python +>>> from transformers import AutoModelForSequenceClassification +>>> import torch + +>>> model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-uncased") +>>> model.config.pad_token_id +0 +``` + +يوضح المثال التالي المُخرجات بدون إخفاء رموز الحشو: + +```python +>>> input_ids = torch.tensor([[7592, 2057, 2097, 2393, 9611, 2115], [7592, 0, 0, 0, 0, 0]]) +>>> output = model(input_ids) +>>> print(output.logits) +tensor([[ 0.0082, -0.2307], +[ 0.1317, -0.1683]], grad_fn=) +``` + +هنا المُخرجات الفعلية للتسلسل الثاني: + +```python +>>> input_ids = torch.tensor([[7592]]) +>>> output = model(input_ids) +>>> print(output.logits) +tensor([[-0.1008, -0.4061]], grad_fn=) +``` + +يجب عليك في معظم الوقت توفير `attention_mask` للنموذج لتجاهل رموز الحشو لتجنب هذا الخطأ الصامت. الآن يتطابق مُخرجات التسلسل الثاني مع مُخرجاته الفعلية: + + +بشكل افتراضي، ينشئ مجزىء النصوص `attention_mask` لك استنادًا إلى إعدادات المجزىء المحدد. + + +```python +>>> attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1], [1, 0, 0, 0, 0, 0]]) +>>> output = model(input_ids, attention_mask=attention_mask) +>>> print(output.logits) +tensor([[ 0.0082, -0.2307], +[-0.1008, -0.4061]], grad_fn=) +``` + +لا ينشئ 🤗 Transformers تلقائيًا `attention_mask` لإخفاء رمز الحشو إذا تم توفيره لأن: + +- بعض النماذج ليس لها رمز حشو. 
+ +- بالنسبة لبعض الاستخدامات، يريد المستخدمون أن ينتبه النموذج إلى رمز الحشو. +## ValueError: فئة التكوين غير المعترف بها XYZ لهذا النوع من AutoModel + +بشكل عام، نوصي باستخدام فئة [`AutoModel`] لتحميل النسخ المدربة مسبقًا من النماذج. يمكن لهذه الفئة أن تستنتج وتُحمل تلقائيًا البنية الصحيحة من نسخ معينة بناءً على التكوين. إذا رأيت هذا الخطأ `ValueError` عند تحميل نموذج من نسخة، فهذا يعني أن الفئة التلقائية (Auto) لم تتمكن من العثور على خريطة من التكوين في نقطة التفتيش المعطاة إلى نوع النموذج الذي تُحاول تحميله. وغالبًا ما يحدث هذا عندما لا تدعم نقطة التفتيش مهمة معينة. + +على سبيل المثال، سترى هذا الخطأ في المثال التالي لأنه لا يوجد GPT2 للإجابة على الأسئلة: + +```py +>>> from transformers import AutoProcessor, AutoModelForQuestionAnswering + +>>> processor = AutoProcessor.from_pretrained("openai-community/gpt2-medium") +>>> model = AutoModelForQuestionAnswering.from_pretrained("openai-community/gpt2-medium") +ValueError: Unrecognized configuration class for this kind of AutoModel: AutoModelForQuestionAnswering. +Model type should be one of AlbertConfig, BartConfig, BertConfig, BigBirdConfig, BigBirdPegasusConfig, BloomConfig, ... +``` diff --git a/docs/source/de/model_sharing.md b/docs/source/de/model_sharing.md index 6bbb6e10cb49..850d9a3454a9 100644 --- a/docs/source/de/model_sharing.md +++ b/docs/source/de/model_sharing.md @@ -43,7 +43,7 @@ Folglich können Sie eine bestimmte Modellversion mit dem Parameter "Revision" l ```py >>> model = AutoModel.from_pretrained( -... "julien-c/EsperBERTo-small", revision="v2.0.1" # tag name, or branch name, or commit hash +... "julien-c/EsperBERTo-small", revision="4c77982" # tag name, or branch name, or commit hash ... ) ``` diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index a7806059afaa..ca7ee4557fee 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -218,6 +218,8 @@ title: CPU inference - local: perf_infer_gpu_one title: GPU inference + - local: perf_infer_gpu_multi + title: Multi-GPU inference title: Optimizing inference - local: big_models title: Instantiate a big model @@ -514,6 +516,8 @@ title: Nyströmformer - local: model_doc/olmo title: OLMo + - local: model_doc/olmo_1124 + title: OLMo November 2024 - local: model_doc/olmoe title: OLMoE - local: model_doc/open-llama diff --git a/docs/source/en/generation_strategies.md b/docs/source/en/generation_strategies.md index 64ded9613716..621edeb20e8e 100644 --- a/docs/source/en/generation_strategies.md +++ b/docs/source/en/generation_strategies.md @@ -403,7 +403,7 @@ culture, and they allow us to design the' This guide illustrates the main parameters that enable various decoding strategies. More advanced parameters exist for the [`generate`] method, which gives you even further control over the [`generate`] method's behavior. -For the complete list of the available parameters, refer to the [API documentation](./main_classes/text_generation.md). +For the complete list of the available parameters, refer to the [API documentation](./main_classes/text_generation). ### Speculative Decoding @@ -508,10 +508,11 @@ See the following examples for DoLa decoding with the 32-layer LLaMA-7B model. 
```python >>> from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed >>> import torch +>>> from accelerate.test_utils.testing import get_backend >>> tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b") >>> model = AutoModelForCausalLM.from_pretrained("huggyllama/llama-7b", torch_dtype=torch.float16) ->>> device = 'cuda' if torch.cuda.is_available() else 'cpu' +>>> device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) >>> model.to(device) >>> set_seed(42) diff --git a/docs/source/en/index.md b/docs/source/en/index.md index aaff45ab65df..341cb417c7b8 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -240,6 +240,7 @@ Flax), PyTorch, and/or TensorFlow. | [Nougat](model_doc/nougat) | ✅ | ✅ | ✅ | | [Nyströmformer](model_doc/nystromformer) | ✅ | ❌ | ❌ | | [OLMo](model_doc/olmo) | ✅ | ❌ | ❌ | +| [OLMo November 2024](model_doc/olmo_1124) | ✅ | ❌ | ❌ | | [OLMoE](model_doc/olmoe) | ✅ | ❌ | ❌ | | [OmDet-Turbo](model_doc/omdet-turbo) | ✅ | ❌ | ❌ | | [OneFormer](model_doc/oneformer) | ✅ | ❌ | ❌ | diff --git a/docs/source/en/model_doc/blip-2.md b/docs/source/en/model_doc/blip-2.md index b57c69ca6b32..4125d372d55a 100644 --- a/docs/source/en/model_doc/blip-2.md +++ b/docs/source/en/model_doc/blip-2.md @@ -40,6 +40,10 @@ The original code can be found [here](https://github.com/salesforce/LAVIS/tree/5 - BLIP-2 can be used for conditional text generation given an image and an optional text prompt. At inference time, it's recommended to use the [`generate`] method. - One can use [`Blip2Processor`] to prepare images for the model, and decode the predicted tokens ID's back to text. +> [!NOTE] +> BLIP models after release v4.46 will raise warnings about adding `processor.num_query_tokens = {{num_query_tokens}}` and expand model embeddings layer to add special `` token. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you. Adding these attributes means that BLIP will add the number of query tokens required per image and expand the text with as many `` placeholders as there will be query tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated as otherwise there wil be failure when merging the embeddings. +The attributes can be obtained from model config, as `model.config.num_query_tokens` and model embeddings expansion can be done by following [this link](https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042). + ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with BLIP-2. diff --git a/docs/source/en/model_doc/instructblip.md b/docs/source/en/model_doc/instructblip.md index b5fc634b6216..904a96bc786f 100644 --- a/docs/source/en/model_doc/instructblip.md +++ b/docs/source/en/model_doc/instructblip.md @@ -33,6 +33,10 @@ The original code can be found [here](https://github.com/salesforce/LAVIS/tree/m InstructBLIP uses the same architecture as [BLIP-2](blip2) with a tiny but important difference: it also feeds the text prompt (instruction) to the Q-Former. +> [!NOTE] +> BLIP models after release v4.46 will raise warnings about adding `processor.num_query_tokens = {{num_query_tokens}}` and expand model embeddings layer to add special `` token. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you. 
Adding these attributes means that BLIP will add the number of query tokens required per image and expand the text with as many `` placeholders as there will be query tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated as otherwise there wil be failure when merging the embeddings. +The attributes can be obtained from model config, as `model.config.num_query_tokens` and model embeddings expansion can be done by following [this link](https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042). + ## InstructBlipConfig [[autodoc]] InstructBlipConfig diff --git a/docs/source/en/model_doc/instructblipvideo.md b/docs/source/en/model_doc/instructblipvideo.md index aa93feb6b6dc..8b2207ce1765 100644 --- a/docs/source/en/model_doc/instructblipvideo.md +++ b/docs/source/en/model_doc/instructblipvideo.md @@ -35,6 +35,10 @@ The original code can be found [here](https://github.com/salesforce/LAVIS/tree/m - The model was trained by sampling 4 frames per video, so it's recommended to sample 4 frames +> [!NOTE] +> BLIP models after release v4.46 will raise warnings about adding `processor.num_query_tokens = {{num_query_tokens}}` and expand model embeddings layer to add special `` token. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you. Adding these attributes means that BLIP will add the number of query tokens required per image and expand the text with as many `` placeholders as there will be query tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated as otherwise there wil be failure when merging the embeddings. +The attributes can be obtained from model config, as `model.config.num_query_tokens` and model embeddings expansion can be done by following [this link](https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042). + ## InstructBlipVideoConfig [[autodoc]] InstructBlipVideoConfig diff --git a/docs/source/en/model_doc/llava.md b/docs/source/en/model_doc/llava.md index 99950a2ffd8e..dec19ca5ef45 100644 --- a/docs/source/en/model_doc/llava.md +++ b/docs/source/en/model_doc/llava.md @@ -40,6 +40,13 @@ The original code can be found [here](https://github.com/haotian-liu/LLaVA/tree/ - Note the model has not been explicitly trained to process multiple images in the same prompt, although this is technically possible, you may experience inaccurate results. + +> [!NOTE] +> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you. +Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated as otherwise there will be failure when merging the embeddings. +The attributes can be obtained from model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token or `0` if nothing extra is added to the vision patches. 
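Put concretely, the note above boils down to copying a few values from the model config onto the processor. A minimal sketch, assuming the `llava-hf/llava-1.5-7b-hf` checkpoint and that its vision backbone adds a CLS token (both illustrative assumptions):

```python
from transformers import AutoConfig, AutoProcessor

checkpoint = "llava-hf/llava-1.5-7b-hf"  # illustrative checkpoint
config = AutoConfig.from_pretrained(checkpoint)
processor = AutoProcessor.from_pretrained(checkpoint)

# Copy the values the warning asks for from the model config onto the processor
processor.patch_size = config.vision_config.patch_size
processor.vision_feature_select_strategy = config.vision_feature_select_strategy
# 1 if the vision backbone adds a CLS token, 0 otherwise (assumed to be 1 here)
processor.num_additional_image_tokens = 1

# If you own the checkpoint, persist the updated processor, e.g.
# processor.save_pretrained("path/to/local_checkpoint") or processor.push_to_hub(...)
```

With these attributes in place, the processor can expand the prompt with the required number of `<image>` placeholders itself, which is what the warning is asking for.
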
+ + ### Single image inference For best results, we recommend users to use the processor's `apply_chat_template()` method to format your prompt correctly. For that you need to construct a conversation history, passing in a plain string will not format your prompt. Each message in the conversation history for chat templates is a dictionary with keys "role" and "content". The "content" should be a list of dictionaries, for "text" and "image" modalities, as follows: @@ -85,10 +92,10 @@ LLaVa also supports batched inference. Here is how you can do it: import requests from PIL import Image import torch -from transformers import AutoProcessor, LLavaForConditionalGeneration +from transformers import AutoProcessor, LlavaForConditionalGeneration # Load the model in half-precision -model = LLavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", torch_dtype=torch.float16, device_map="auto") +model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", torch_dtype=torch.float16, device_map="auto") processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") # Get two different images diff --git a/docs/source/en/model_doc/llava_next.md b/docs/source/en/model_doc/llava_next.md index b9146fbd3347..88bd63e7101f 100644 --- a/docs/source/en/model_doc/llava_next.md +++ b/docs/source/en/model_doc/llava_next.md @@ -53,6 +53,12 @@ The original code can be found [here](https://github.com/haotian-liu/LLaVA/tree/ +> [!NOTE] +> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you. +Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated as otherwise there will be failure when merging the embeddings. +The attributes can be obtained from model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token or `0` if nothing extra is added to the vision patches. + + - Note that each checkpoint has been trained with a specific prompt format, depending on which large language model (LLM) was used. You can use the processor's `apply_chat_template` to format your prompts correctly. For that you have to construct a conversation history, passing a plain string will not format your prompt. Each message in the conversation history for chat templates is a dictionary with keys "role" and "content". The "content" should be a list of dictionaries, for "text" and "image" modalities. Below is an example of how to do that and the list of formats accepted by each checkpoint. We will use [llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) and a conversation history of text and image. 
Each content field has to be a list of dicts, as follows: diff --git a/docs/source/en/model_doc/llava_next_video.md b/docs/source/en/model_doc/llava_next_video.md index fe905dfb7932..f8a149f12b67 100644 --- a/docs/source/en/model_doc/llava_next_video.md +++ b/docs/source/en/model_doc/llava_next_video.md @@ -50,6 +50,12 @@ The original code can be found [here](https://github.com/LLaVA-VL/LLaVA-NeXT/tre +> [!NOTE] +> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you. +Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated as otherwise there will be failure when merging the embeddings. +The attributes can be obtained from model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token or `0` if nothing extra is added to the vision patches. + + - Note that each checkpoint has been trained with a specific prompt format, depending on which large language model (LLM) was used. You can use tokenizer's `apply_chat_template` to format your prompts correctly. Below is an example of how to do that. We will use [LLaVA-NeXT-Video-7B-hf](https://huggingface.co/llava-hf/LLaVA-NeXT-Video-7B-hf) and a conversation history of videos and images. Each content field has to be a list of dicts, as follows: diff --git a/docs/source/en/model_doc/olmo_1124.md b/docs/source/en/model_doc/olmo_1124.md new file mode 100644 index 000000000000..f36ec438e57a --- /dev/null +++ b/docs/source/en/model_doc/olmo_1124.md @@ -0,0 +1,46 @@ + + +# OLMo November 2024 + +## Overview + +The OLMo November 2024 model is a successor of the OLMo model, which was proposed in +[OLMo: Accelerating the Science of Language Models](https://arxiv.org/abs/2402.00838). + + The architectural changes from the original OLMo model to this model are: + +- RMSNorm is used instead of standard layer norm. +- Norm is applied to attention queries and keys. +- Norm is applied after attention/feedforward layers rather than before. + +This model was contributed by [shanearora](https://huggingface.co/shanearora). +The original code can be found [here](https://github.com/allenai/OLMo/tree/main/olmo). + + +## Olmo1124Config + +[[autodoc]] Olmo1124Config + +## Olmo1124Model + +[[autodoc]] Olmo1124Model + - forward + +## Olmo1124ForCausalLM + +[[autodoc]] Olmo1124ForCausalLM + - forward diff --git a/docs/source/en/model_doc/video_llava.md b/docs/source/en/model_doc/video_llava.md index 1c4b5b4b874d..105307196eff 100644 --- a/docs/source/en/model_doc/video_llava.md +++ b/docs/source/en/model_doc/video_llava.md @@ -54,6 +54,12 @@ This model was contributed by [RaushanTurganbay](https://huggingface.co/RaushanT The original code can be found [here](https://github.com/PKU-YuanGroup/Video-LLaVA). 
+> [!NOTE] +> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you. +Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated as otherwise there will be failure when merging the embeddings. +The attributes can be obtained from model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token or `0` if nothing extra is added to the vision patches. + + ## Usage example ### Single Media Mode diff --git a/docs/source/en/model_doc/vipllava.md b/docs/source/en/model_doc/vipllava.md index b3e76cd292e4..328310f3e26b 100644 --- a/docs/source/en/model_doc/vipllava.md +++ b/docs/source/en/model_doc/vipllava.md @@ -39,6 +39,12 @@ This model was contributed by [Younes Belkada](https://huggingface.co/ybelkada) - Note the model has not been explicitly trained to process multiple images in the same prompt, although this is technically possible, you may experience inaccurate results. +> [!NOTE] +> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you. +Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated as otherwise there will be failure when merging the embeddings. +The attributes can be obtained from model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token or `0` if nothing extra is added to the vision patches. + + - For better results, we recommend users to use the processor's `apply_chat_template()` method to format your prompt correctly. For that you need to construct a conversation history, passing in a plain string will not format your prompt. Each message in the conversation history for chat templates is a dictionary with keys "role" and "content". The "content" should be a list of dictionaries, for "text" and "image" modalities, as follows: ```python diff --git a/docs/source/en/model_sharing.md b/docs/source/en/model_sharing.md index ec5802cfee37..076fc2ccdd57 100644 --- a/docs/source/en/model_sharing.md +++ b/docs/source/en/model_sharing.md @@ -43,7 +43,7 @@ As a result, you can load a specific model version with the `revision` parameter ```py >>> model = AutoModel.from_pretrained( -... "julien-c/EsperBERTo-small", revision="v2.0.1" # tag name, or branch name, or commit hash +... 
"julien-c/EsperBERTo-small", revision="4c77982" # tag name, or branch name, or commit hash ... ) ``` diff --git a/docs/source/en/perf_infer_gpu_multi.md b/docs/source/en/perf_infer_gpu_multi.md new file mode 100644 index 000000000000..997509441152 --- /dev/null +++ b/docs/source/en/perf_infer_gpu_multi.md @@ -0,0 +1,68 @@ + + +# Multi-GPU inference + +Built-in Tensor Parallelism (TP) is now available with certain models using PyTorch. Tensor parallelism shards a model onto multiple GPUs, enabling larger model sizes, and parallelizes computations such as matrix multiplication. + +To enable tensor parallel, pass the argument `tp_plan="auto"` to [`~AutoModelForCausalLM.from_pretrained`]: + +```python +import os +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +model_id = "meta-llama/Meta-Llama-3-8B-Instruct" + +# Initialize distributed +rank = int(os.environ["RANK"]) +device = torch.device(f"cuda:{rank}") +torch.distributed.init_process_group("nccl", device_id=device) + +# Retrieve tensor parallel model +model = AutoModelForCausalLM.from_pretrained( + model_id, + tp_plan="auto", +) + +# Prepare input tokens +tokenizer = AutoTokenizer.from_pretrained(model_id) +prompt = "Can I help" +inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(device) + +# Distributed run +outputs = model(inputs) +``` + +You can use `torchrun` to launch the above script with multiple processes, each mapping to a GPU: + +``` +torchrun --nproc-per-node 4 demo.py +``` + +PyTorch tensor parallel is currently supported for the following models: +* [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel) + +You can request to add tensor parallel support for another model by opening a GitHub Issue or Pull Request. + +### Expected speedups + +You can benefit from considerable speedups for inference, especially for inputs with large batch size or long sequences. + +For a single forward pass on [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel) with a sequence length of 512 and various batch sizes, the expected speedup is as follows: + +
+ +
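To sanity-check these numbers on your own hardware, you can time the forward pass directly. A rough sketch, reusing `model` and `inputs` from the example above (the timing helper itself is illustrative, not part of the API):

```python
import time
import torch

def time_forward(model, inputs, warmup=3, iters=10):
    # Warm up so one-off allocation/compilation costs don't skew the measurement
    for _ in range(warmup):
        with torch.no_grad():
            model(inputs)
    torch.cuda.synchronize()
    start = time.perf_counter()
    for _ in range(iters):
        with torch.no_grad():
            model(inputs)
    torch.cuda.synchronize()
    return (time.perf_counter() - start) / iters

# When launched with torchrun, each rank prints its own average latency
print(f"average forward latency: {time_forward(model, inputs):.4f} s")
```

Running the same measurement with and without `tp_plan="auto"`, and with larger batches or longer sequences, shows how the speedup scales on your setup.
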
diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index 67bd31fdaeed..84109746f959 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -77,6 +77,7 @@ FlashAttention-2 is currently supported for the following architectures: * [Nemotron](https://huggingface.co/docs/transformers/model_doc/nemotron) * [NLLB](https://huggingface.co/docs/transformers/model_doc/nllb) * [OLMo](https://huggingface.co/docs/transformers/model_doc/olmo#transformers.OlmoModel) +* [OLMo November 2024](https://huggingface.co/docs/transformers/model_doc/olmo_1124#transformers.Olmo1124Model) * [OLMoE](https://huggingface.co/docs/transformers/model_doc/olmoe#transformers.OlmoeModel) * [OPT](https://huggingface.co/docs/transformers/model_doc/opt#transformers.OPTModel) * [PaliGemma](https://huggingface.co/docs/transformers/model_doc/paligemma#transformers.PaliGemmaForConditionalGeneration) @@ -260,6 +261,7 @@ For now, Transformers supports SDPA inference and training for the following arc * [MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody#transformers.MusicgenMelodyModel) * [NLLB](https://huggingface.co/docs/transformers/model_doc/nllb) * [OLMo](https://huggingface.co/docs/transformers/model_doc/olmo#transformers.OlmoModel) +* [OLMo November 2024](https://huggingface.co/docs/transformers/model_doc/olmo_1124#transformers.Olmo1124Model) * [OLMoE](https://huggingface.co/docs/transformers/model_doc/olmoe#transformers.OlmoeModel) * [OPT](https://huggingface.co/docs/transformers/en/model_doc/opt) * [PaliGemma](https://huggingface.co/docs/transformers/model_doc/paligemma#transformers.PaliGemmaForConditionalGeneration) diff --git a/docs/source/en/performance.md b/docs/source/en/performance.md index 94e756cf33ad..b9176be04ec2 100644 --- a/docs/source/en/performance.md +++ b/docs/source/en/performance.md @@ -53,7 +53,7 @@ sections we go through the steps to run inference on CPU and single/multi-GPU se * [Inference on a single CPU](perf_infer_cpu) * [Inference on a single GPU](perf_infer_gpu_one) -* [Multi-GPU inference](perf_infer_gpu_one) +* [Multi-GPU inference](perf_infer_gpu_multi) * [XLA Integration for TensorFlow Models](tf_xla) diff --git a/docs/source/en/quantization/overview.md b/docs/source/en/quantization/overview.md index ef8ed444d9d4..91c6ebd40dab 100644 --- a/docs/source/en/quantization/overview.md +++ b/docs/source/en/quantization/overview.md @@ -45,19 +45,19 @@ In short, supporting a wide range of quantization methods allows you to pick the Use the table below to help you decide which quantization method to use. -| Quantization method | On the fly quantization | CPU | CUDA GPU | RoCm GPU (AMD) | Metal (Apple Silicon) | torch.compile() support | Number of bits | Supports fine-tuning (through PEFT) | Serializable with 🤗 transformers | 🤗 transformers support | Link to library | -|-------------------------------------|-------------------------|-----|----------|----------------|-----------------------|-------------------------|----------------|-------------------------------------|--------------|------------------------|---------------------------------------------| -| [AQLM](./aqlm) | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🟢 | 1 / 2 | 🟢 | 🟢 | 🟢 | https://github.com/Vahe1994/AQLM | -| [AWQ](./awq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | ? | 4 | 🟢 | 🟢 | 🟢 | https://github.com/casper-hansen/AutoAWQ | -| [bitsandbytes](./bitsandbytes) | 🟢 | 🟡 * | 🟢 | 🟡 * | 🔴 ** | 🔴 (soon!) 
| 4 / 8 | 🟢 | 🟢 | 🟢 | https://github.com/bitsandbytes-foundation/bitsandbytes | -| [compressed-tensors](./compressed_tensors) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 1 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/neuralmagic/compressed-tensors | -| [EETQ](./eetq) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | ? | 8 | 🟢 | 🟢 | 🟢 | https://github.com/NetEase-FuXi/EETQ | -| GGUF / GGML (llama.cpp) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 1 - 8 | 🔴 | [See GGUF section](../gguf) | [See GGUF section](../gguf) | https://github.com/ggerganov/llama.cpp | -| [GPTQ](./gptq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 2 - 3 - 4 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ | -| [HQQ](./hqq) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🟢 | 1 - 8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ | -| [Quanto](./quanto) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🟢 | 2 / 4 / 8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/quanto | -| [FBGEMM_FP8](./fbgemm_fp8.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | https://github.com/pytorch/FBGEMM | -| [torchao](./torchao.md) | 🟢 | | 🟢 | 🔴 | partial support (int4 weight only) | | 4 / 8 | | 🟢🔴 | 🟢 | https://github.com/pytorch/ao | +| Quantization method | On the fly quantization | CPU | CUDA GPU | RoCm GPU (AMD) | Metal (Apple Silicon) | Intel GPU | torch.compile() support | Number of bits | Supports fine-tuning (through PEFT) | Serializable with 🤗 transformers | 🤗 transformers support | Link to library | +|-------------------------------------|-------------------------|-----|----------|----------------|-----------------------|-----------|-------------------------|----------------|-------------------------------------|--------------|------------------------|---------------------------------------------| +| [AQLM](./aqlm) | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1 / 2 | 🟢 | 🟢 | 🟢 | https://github.com/Vahe1994/AQLM | +| [AWQ](./awq) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | ? | 4 | 🟢 | 🟢 | 🟢 | https://github.com/casper-hansen/AutoAWQ | +| [bitsandbytes](./bitsandbytes) | 🟢 | 🟡 * | 🟢 | 🟡 * | 🔴 ** | 🟡 * | 🔴 (soon!) | 4 / 8 | 🟢 | 🟢 | 🟢 | https://github.com/bitsandbytes-foundation/bitsandbytes | +| [compressed-tensors](./compressed_tensors) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 1 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/neuralmagic/compressed-tensors | +| [EETQ](./eetq) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | ? 
| 8 | 🟢 | 🟢 | 🟢 | https://github.com/NetEase-FuXi/EETQ | +| GGUF / GGML (llama.cpp) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 1 - 8 | 🔴 | [See GGUF section](../gguf) | [See GGUF section](../gguf) | https://github.com/ggerganov/llama.cpp | +| [GPTQ](./gptq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 2 - 3 - 4 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ | +| [HQQ](./hqq) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1 - 8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ | +| [Quanto](./quanto) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🟢 | 2 / 4 / 8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/quanto | +| [FBGEMM_FP8](./fbgemm_fp8.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | https://github.com/pytorch/FBGEMM | +| [torchao](./torchao.md) | 🟢 | | 🟢 | 🔴 | partial support (int4 weight only) | 🔴 | | 4 / 8 | | 🟢🔴 | 🟢 | https://github.com/pytorch/ao | diff --git a/docs/source/en/quantization/quanto.md b/docs/source/en/quantization/quanto.md index 18135b2ec2fc..f5bba54a6e6b 100644 --- a/docs/source/en/quantization/quanto.md +++ b/docs/source/en/quantization/quanto.md @@ -28,7 +28,7 @@ Try Quanto + transformers with this [notebook](https://colab.research.google.com - weights quantization (`float8`,`int8`,`int4`,`int2`) - activation quantization (`float8`,`int8`) - modality agnostic (e.g CV,LLM) -- device agnostic (e.g CUDA,MPS,CPU) +- device agnostic (e.g CUDA,XPU,MPS,CPU) - compatibility with `torch.compile` - easy to add custom kernel for specific device - supports quantization aware training diff --git a/docs/source/en/tasks/idefics.md b/docs/source/en/tasks/idefics.md index a780124edea9..7e3335762ea4 100644 --- a/docs/source/en/tasks/idefics.md +++ b/docs/source/en/tasks/idefics.md @@ -386,9 +386,9 @@ The use and prompting for the conversational use is very similar to using the ba ```py >>> import torch >>> from transformers import IdeficsForVisionText2Text, AutoProcessor +>>> from accelerate.test_utils.testing import get_backend ->>> device = "cuda" if torch.cuda.is_available() else "cpu" - +>>> device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) >>> checkpoint = "HuggingFaceM4/idefics-9b-instruct" >>> model = IdeficsForVisionText2Text.from_pretrained(checkpoint, torch_dtype=torch.bfloat16).to(device) >>> processor = AutoProcessor.from_pretrained(checkpoint) diff --git a/docs/source/en/tasks/image_captioning.md b/docs/source/en/tasks/image_captioning.md index 633ccc491ebb..9a78967cb519 100644 --- a/docs/source/en/tasks/image_captioning.md +++ b/docs/source/en/tasks/image_captioning.md @@ -256,8 +256,9 @@ image Prepare image for the model. ```python -device = "cuda" if torch.cuda.is_available() else "cpu" - +from accelerate.test_utils.testing import get_backend +# automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) +device, _, _ = get_backend() inputs = processor(images=image, return_tensors="pt").to(device) pixel_values = inputs.pixel_values ``` diff --git a/docs/source/en/tasks/image_classification.md b/docs/source/en/tasks/image_classification.md index 514ec3fbfe0b..49fdc9db60d4 100644 --- a/docs/source/en/tasks/image_classification.md +++ b/docs/source/en/tasks/image_classification.md @@ -26,7 +26,7 @@ after a natural disaster, monitoring crop health, or helping screen medical imag This guide illustrates how to: -1. Fine-tune [ViT](model_doc/vit) on the [Food-101](https://huggingface.co/datasets/food101) dataset to classify a food item in an image. +1. 
Fine-tune [ViT](../model_doc/vit) on the [Food-101](https://huggingface.co/datasets/food101) dataset to classify a food item in an image. 2. Use your fine-tuned model for inference. diff --git a/docs/source/en/tasks/image_feature_extraction.md b/docs/source/en/tasks/image_feature_extraction.md index c9d794b0b2be..80b701588b26 100644 --- a/docs/source/en/tasks/image_feature_extraction.md +++ b/docs/source/en/tasks/image_feature_extraction.md @@ -43,8 +43,9 @@ Let's see the pipeline in action. First, initialize the pipeline. If you don't p ```python import torch from transformers import pipeline - -DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') +from accelerate.test_utils.testing import get_backend +# automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) +DEVICE, _, _ = get_backend() pipe = pipeline(task="image-feature-extraction", model_name="google/vit-base-patch16-384", device=DEVICE, pool=True) ``` diff --git a/docs/source/en/tasks/image_to_image.md b/docs/source/en/tasks/image_to_image.md index 0bb74b36980e..f1c62e47aebf 100644 --- a/docs/source/en/tasks/image_to_image.md +++ b/docs/source/en/tasks/image_to_image.md @@ -37,8 +37,9 @@ We can now initialize the pipeline with a [Swin2SR model](https://huggingface.co ```python from transformers import pipeline import torch - -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') +from accelerate.test_utils.testing import get_backend +# automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) +device, _, _ = get_backend() pipe = pipeline(task="image-to-image", model="caidas/swin2SR-lightweight-x2-64", device=device) ``` diff --git a/docs/source/en/tasks/knowledge_distillation_for_image_classification.md b/docs/source/en/tasks/knowledge_distillation_for_image_classification.md index 530e92d81f5c..17fb363df8e2 100644 --- a/docs/source/en/tasks/knowledge_distillation_for_image_classification.md +++ b/docs/source/en/tasks/knowledge_distillation_for_image_classification.md @@ -58,7 +58,7 @@ from transformers import TrainingArguments, Trainer import torch import torch.nn as nn import torch.nn.functional as F - +from accelerate.test_utils.testing import get_backend class ImageDistilTrainer(Trainer): def __init__(self, teacher_model=None, student_model=None, temperature=None, lambda_param=None, *args, **kwargs): @@ -66,7 +66,7 @@ class ImageDistilTrainer(Trainer): self.teacher = teacher_model self.student = student_model self.loss_function = nn.KLDivLoss(reduction="batchmean") - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) self.teacher.to(device) self.teacher.eval() self.temperature = temperature diff --git a/docs/source/en/tasks/mask_generation.md b/docs/source/en/tasks/mask_generation.md index 82202f58bca6..db16e035e303 100644 --- a/docs/source/en/tasks/mask_generation.md +++ b/docs/source/en/tasks/mask_generation.md @@ -125,9 +125,9 @@ the processor. ```python from transformers import SamModel, SamProcessor import torch - -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - +from accelerate.test_utils.testing import get_backend +# automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) 
+device, _, _ = get_backend() model = SamModel.from_pretrained("facebook/sam-vit-base").to(device) processor = SamProcessor.from_pretrained("facebook/sam-vit-base") ``` diff --git a/docs/source/en/tasks/monocular_depth_estimation.md b/docs/source/en/tasks/monocular_depth_estimation.md index 3ded3179154a..edd22122f32b 100644 --- a/docs/source/en/tasks/monocular_depth_estimation.md +++ b/docs/source/en/tasks/monocular_depth_estimation.md @@ -53,8 +53,9 @@ Instantiate a pipeline from a [checkpoint on the Hugging Face Hub](https://huggi ```py >>> from transformers import pipeline >>> import torch - ->>> device = "cuda" if torch.cuda.is_available() else "cpu" +>>> from accelerate.test_utils.testing import get_backend +# automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) +>>> device, _, _ = get_backend() >>> checkpoint = "depth-anything/Depth-Anything-V2-base-hf" >>> pipe = pipeline("depth-estimation", model=checkpoint, device=device) ``` diff --git a/docs/source/en/tasks/object_detection.md b/docs/source/en/tasks/object_detection.md index fdc81896bc19..c307dd3334fe 100644 --- a/docs/source/en/tasks/object_detection.md +++ b/docs/source/en/tasks/object_detection.md @@ -1488,7 +1488,9 @@ Now that you have finetuned a model, evaluated it, and uploaded it to the Huggin Load model and image processor from the Hugging Face Hub (skip to use already trained in this session): ```py ->>> device = "cuda" +>>> from accelerate.test_utils.testing import get_backend +# automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) +>>> device, _, _ = get_backend() >>> model_repo = "qubvel-hf/detr_finetuned_cppe5" >>> image_processor = AutoImageProcessor.from_pretrained(model_repo) diff --git a/docs/source/en/tasks/semantic_segmentation.md b/docs/source/en/tasks/semantic_segmentation.md index 912577589486..a21ff62edf1a 100644 --- a/docs/source/en/tasks/semantic_segmentation.md +++ b/docs/source/en/tasks/semantic_segmentation.md @@ -689,7 +689,9 @@ Reload the dataset and load an image for inference. We will now see how to infer without a pipeline. Process the image with an image processor and place the `pixel_values` on a GPU: ```py ->>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # use GPU if available, otherwise use a CPU +>>> from accelerate.test_utils.testing import get_backend +# automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) +>>> device, _, _ = get_backend() >>> encoding = image_processor(image, return_tensors="pt") >>> pixel_values = encoding.pixel_values.to(device) ``` diff --git a/docs/source/en/tasks/text-to-speech.md b/docs/source/en/tasks/text-to-speech.md index 188d4ea5f9ee..e25da4e19efe 100644 --- a/docs/source/en/tasks/text-to-speech.md +++ b/docs/source/en/tasks/text-to-speech.md @@ -282,10 +282,10 @@ containing the corresponding speaker embedding. >>> import os >>> import torch >>> from speechbrain.inference.classifiers import EncoderClassifier +>>> from accelerate.test_utils.testing import get_backend >>> spk_model_name = "speechbrain/spkrec-xvect-voxceleb" - ->>> device = "cuda" if torch.cuda.is_available() else "cpu" +>>> device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) >>> speaker_model = EncoderClassifier.from_hparams( ... source=spk_model_name, ... 
run_opts={"device": device}, diff --git a/docs/source/en/tasks/visual_question_answering.md b/docs/source/en/tasks/visual_question_answering.md index 7083d8c98b93..87dbfb751bfa 100644 --- a/docs/source/en/tasks/visual_question_answering.md +++ b/docs/source/en/tasks/visual_question_answering.md @@ -363,10 +363,11 @@ GPU, if available, which we didn't need to do earlier when training, as [`Traine ```py >>> from transformers import AutoProcessor, Blip2ForConditionalGeneration >>> import torch +>>> from accelerate.test_utils.testing import get_backend >>> processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b") >>> model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16) ->>> device = "cuda" if torch.cuda.is_available() else "cpu" +>>> device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) >>> model.to(device) ``` diff --git a/docs/source/en/tasks_explained.md b/docs/source/en/tasks_explained.md index 7c836f70cfc4..1cc60ba096f0 100644 --- a/docs/source/en/tasks_explained.md +++ b/docs/source/en/tasks_explained.md @@ -182,7 +182,7 @@ There are three main components to Mask2Former: The mask predictions are generated by combining the pixel-embeddings with the final decoder hidden states. The sigmoid cross-entropy and dice loss is calculated between the logits and the ground truth mask to find the most likely mask. -Ready to try your hand at object detection? Check out our complete [image segmentation guide](tasks/semantic_segmentation) to learn how to finetune SegFormer and use it for inference! +Ready to try your hand at image segmentation? Check out our complete [image segmentation guide](tasks/semantic_segmentation) to learn how to finetune SegFormer and use it for inference! ### Depth estimation @@ -292,4 +292,4 @@ Ready to try your hand at translation? Check out our complete [translation guide For more information about text generation, check out the [text generation strategies](generation_strategies) guide! - \ No newline at end of file + diff --git a/docs/source/en/testing.md b/docs/source/en/testing.md index 1da8a62456ee..9e85f2248e16 100644 --- a/docs/source/en/testing.md +++ b/docs/source/en/testing.md @@ -428,7 +428,7 @@ pytest --instafail ### To GPU or not to GPU -On a GPU-enabled setup, to test in CPU-only mode add `CUDA_VISIBLE_DEVICES=""`: +On a GPU-enabled setup, to test in CPU-only mode add `CUDA_VISIBLE_DEVICES=""` for CUDA GPUs: ```bash CUDA_VISIBLE_DEVICES="" pytest tests/utils/test_logging.py @@ -441,10 +441,12 @@ second gpu if you have gpus `0` and `1`, you can run: CUDA_VISIBLE_DEVICES="1" pytest tests/utils/test_logging.py ``` +For Intel GPUs, use `ZE_AFFINITY_MASK` instead of `CUDA_VISIBLE_DEVICES` in the above example. + This is handy when you want to run different tasks on different GPUs. Some tests must be run on CPU-only, others on either CPU or GPU or TPU, yet others on multiple-GPUs. 
The following skip -decorators are used to set the requirements of tests CPU/GPU/TPU-wise: +decorators are used to set the requirements of tests CPU/GPU/XPU/TPU-wise: - `require_torch` - this test will run only under torch - `require_torch_gpu` - as `require_torch` plus requires at least 1 GPU diff --git a/docs/source/en/trainer.md b/docs/source/en/trainer.md index 7bee34728927..e3a66f420424 100644 --- a/docs/source/en/trainer.md +++ b/docs/source/en/trainer.md @@ -174,7 +174,7 @@ trainer = Trainer( processing_class=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, - callback=[EarlyStoppingCallback()], + callbacks=[EarlyStoppingCallback()], ) ``` diff --git a/docs/source/en/training.md b/docs/source/en/training.md index aacf174fbd6b..6fcf6809204b 100644 --- a/docs/source/en/training.md +++ b/docs/source/en/training.md @@ -287,9 +287,10 @@ model.fit(tf_dataset) At this point, you may need to restart your notebook or execute the following code to free some memory: ```py +from accelerate.utils.memory import clear_device_cache del model del trainer -torch.cuda.empty_cache() +clear_device_cache() ``` Next, manually postprocess `tokenized_dataset` to prepare it for training. @@ -364,8 +365,9 @@ Lastly, specify `device` to use a GPU if you have access to one. Otherwise, trai ```py >>> import torch +>>> from accelerate.test_utils.testing import get_backend ->>> device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") +>>> device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) >>> model.to(device) ``` diff --git a/docs/source/es/model_sharing.md b/docs/source/es/model_sharing.md index 43cf0b8eddb8..77ee523094f4 100644 --- a/docs/source/es/model_sharing.md +++ b/docs/source/es/model_sharing.md @@ -43,7 +43,7 @@ Como resultado, puedes cargar una versión específica del modelo con el paráme ```py >>> model = AutoModel.from_pretrained( -... "julien-c/EsperBERTo-small", revision="v2.0.1" # tag name, or branch name, or commit hash +... "julien-c/EsperBERTo-small", revision="4c77982" # tag name, or branch name, or commit hash ... ) ``` diff --git a/docs/source/it/model_sharing.md b/docs/source/it/model_sharing.md index 81257717ed9a..6505658616ba 100644 --- a/docs/source/it/model_sharing.md +++ b/docs/source/it/model_sharing.md @@ -43,7 +43,7 @@ Come risultato, puoi caricare una specifica versione di un modello con il parame ```py >>> model = AutoModel.from_pretrained( -... "julien-c/EsperBERTo-small", revision="v2.0.1" # nome di un tag, di un branch, o commit hash +... "julien-c/EsperBERTo-small", revision="4c77982" # nome di un tag, di un branch, o commit hash ... ) ``` diff --git a/docs/source/ja/model_sharing.md b/docs/source/ja/model_sharing.md index aa8f7a3d1e33..16d47057052b 100644 --- a/docs/source/ja/model_sharing.md +++ b/docs/source/ja/model_sharing.md @@ -43,7 +43,7 @@ Model Hubの組み込みバージョニングはgitおよび[git-lfs](https://gi ```py >>> model = AutoModel.from_pretrained( -... "julien-c/EsperBERTo-small", revision="v2.0.1" # タグ名、またはブランチ名、またはコミットハッシュ +... "julien-c/EsperBERTo-small", revision="4c77982" # タグ名、またはブランチ名、またはコミットハッシュ ... 
) ``` diff --git a/docs/source/ko/_toctree.yml b/docs/source/ko/_toctree.yml index 0cafd918af54..e924ebe34d1c 100644 --- a/docs/source/ko/_toctree.yml +++ b/docs/source/ko/_toctree.yml @@ -448,8 +448,8 @@ title: Mamba - local: model_doc/mamba2 title: Mamba2 - - local: in_translation - title: (번역중) MarianMT + - local: model_doc/marian + title: MarianMT - local: in_translation title: (번역중) MarkupLM - local: in_translation diff --git a/docs/source/ko/model_doc/marian.md b/docs/source/ko/model_doc/marian.md new file mode 100644 index 000000000000..79a9641401d0 --- /dev/null +++ b/docs/source/ko/model_doc/marian.md @@ -0,0 +1,217 @@ + + +# MarianMT[[MarianMT]] + +
+<div class="flex flex-wrap space-x-1">
+<a href="https://huggingface.co/models?filter=marian">
+<img alt="Models" src="https://img.shields.io/badge/All_model_pages-marian-blueviolet">
+</a>
+<a href="https://huggingface.co/spaces/docs-demos/opus-mt-zh-en">
+<img alt="Spaces" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue">
+</a>
+</div>
+ +## 개요[[Overview]] + +BART와 동일한 모델을 사용하는 번역 모델 프레임워크입니다. 번역 결과는 각 모델 카드의 테스트 세트와 유사하지만, 정확히 일치하지는 않을 수 있습니다. 이 모델은 [sshleifer](https://huggingface.co/sshleifer)가 제공했습니다. + + +## 구현 노트[[Implementation Notes]] + +- 각 모델은 약 298 MB를 차지하며, 1,000개 이상의 모델이 제공됩니다. +- 지원되는 언어 쌍 목록은 [여기](https://huggingface.co/Helsinki-NLP)에서 확인할 수 있습니다. +- 모델들은 [Jörg Tiedemann](https://researchportal.helsinki.fi/en/persons/j%C3%B6rg-tiedemann)에 의해 [Marian](https://marian-nmt.github.io/) C++ 라이브러리를 이용하여 학습되었습니다. 이 라이브러리는 빠른 학습과 번역을 지원합니다. +- 모든 모델은 6개 레이어로 이루어진 Transformer 기반의 인코더-디코더 구조입니다. 각 모델의 성능은 모델 카드에 기입되어 있습니다. +- BPE 전처리가 필요한 80개의 OPUS 모델은 지원되지 않습니다. +- 모델링 코드는 [`BartForConditionalGeneration`]을 기반으로 하며, 일부 수정사항이 반영되어 있습니다: + + - 정적 (사인 함수 기반) 위치 임베딩 사용 (`MarianConfig.static_position_embeddings=True`) + - 임베딩 레이어 정규화 생략 (`MarianConfig.normalize_embedding=False`) + - 모델은 생성 시 프리픽스로 `pad_token_id` (해당 토큰 임베딩 값은 0)를 사용하여 시작합니다 (Bart는 + ``를 사용), +- Marian 모델을 PyTorch로 대량 변환하는 코드는 `convert_marian_to_pytorch.py`에서 찾을 수 있습니다. + + +## 모델 이름 규칙[[Naming]] + +- 모든 모델 이름은 `Helsinki-NLP/opus-mt-{src}-{tgt}` 형식을 따릅니다. +- 모델의 언어 코드 표기는 일관되지 않습니다. 두 자리 코드는 일반적으로 [여기](https://developers.google.com/admin-sdk/directory/v1/languages)에서 찾을 수 있으며, 세 자리 코드는 "언어 코드 {code}"로 구글 검색을 통해 찾습니다. +- `es_AR`과 같은 형태의 코드는 `code_{region}` 형식을 의미합니다. 여기서의 예시는 아르헨티나의 스페인어를 의미합니다. +- 모델 변환은 두 단계로 이루어졌습니다. 처음 1,000개 모델은 ISO-639-2 코드를 사용하고, 두 번째 그룹은 ISO-639-5와 ISO-639-2 코드를 조합하여 언어를 식별합니다. + + +## 예시[[Examples]] + +- Marian 모델은 라이브러리의 다른 번역 모델들보다 크기가 작아 파인튜닝 실험과 통합 테스트에 유용합니다. +- [GPU에서 파인튜닝하기](https://github.com/huggingface/transformers/blob/master/examples/legacy/seq2seq/train_distil_marian_enro.sh) + +## 다국어 모델 사용법[[Multilingual Models]] + +- 모든 모델 이름은`Helsinki-NLP/opus-mt-{src}-{tgt}` 형식을 따릅니다. +- 다중 언어 출력을 지원하는 모델의 경우, 출력을 원하는 언어의 언어 코드를 `src_text`의 시작 부분에 추가하여 지정해야 합니다. +- 모델 카드에서 지원되는 언어 코드의 목록을 확인할 수 있습니다! 예를 들어 [opus-mt-en-roa](https://huggingface.co/Helsinki-NLP/opus-mt-en-roa)에서 확인할 수 있습니다. +- `Helsinki-NLP/opus-mt-roa-en`처럼 소스 측에서만 다국어를 지원하는 모델의 경우, 별도의 언어 코드 지정이 필요하지 않습니다. + +[Tatoeba-Challenge 리포지토리](https://github.com/Helsinki-NLP/Tatoeba-Challenge)의 새로운 다국적 모델은 3자리 언어 코드를 사용합니다: + + +```python +>>> from transformers import MarianMTModel, MarianTokenizer + +>>> src_text = [ +... ">>fra<< this is a sentence in english that we want to translate to french", +... ">>por<< This should go to portuguese", +... ">>esp<< And this to Spanish", +... 
] + +>>> model_name = "Helsinki-NLP/opus-mt-en-roa" +>>> tokenizer = MarianTokenizer.from_pretrained(model_name) +>>> print(tokenizer.supported_language_codes) +['>>zlm_Latn<<', '>>mfe<<', '>>hat<<', '>>pap<<', '>>ast<<', '>>cat<<', '>>ind<<', '>>glg<<', '>>wln<<', '>>spa<<', '>>fra<<', '>>ron<<', '>>por<<', '>>ita<<', '>>oci<<', '>>arg<<', '>>min<<'] + +>>> model = MarianMTModel.from_pretrained(model_name) +>>> translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True)) +>>> [tokenizer.decode(t, skip_special_tokens=True) for t in translated] +["c'est une phrase en anglais que nous voulons traduire en français", + 'Isto deve ir para o português.', + 'Y esto al español'] +``` + +허브에 있는 모든 사전 학습된 모델을 확인하는 코드입니다: + +```python +from huggingface_hub import list_models + +model_list = list_models() +org = "Helsinki-NLP" +model_ids = [x.id for x in model_list if x.id.startswith(org)] +suffix = [x.split("/")[1] for x in model_ids] +old_style_multi_models = [f"{org}/{s}" for s in suffix if s != s.lower()] +``` + +## 구형 다국어 모델[[Old Style Multi-Lingual Models]] + +이 모델들은 OPUS-MT-Train 리포지토리의 구형 다국어 모델들입니다. 각 언어 그룹에 포함된 언어들은 다음과 같습니다: + +```python no-style +['Helsinki-NLP/opus-mt-NORTH_EU-NORTH_EU', + 'Helsinki-NLP/opus-mt-ROMANCE-en', + 'Helsinki-NLP/opus-mt-SCANDINAVIA-SCANDINAVIA', + 'Helsinki-NLP/opus-mt-de-ZH', + 'Helsinki-NLP/opus-mt-en-CELTIC', + 'Helsinki-NLP/opus-mt-en-ROMANCE', + 'Helsinki-NLP/opus-mt-es-NORWAY', + 'Helsinki-NLP/opus-mt-fi-NORWAY', + 'Helsinki-NLP/opus-mt-fi-ZH', + 'Helsinki-NLP/opus-mt-fi_nb_no_nn_ru_sv_en-SAMI', + 'Helsinki-NLP/opus-mt-sv-NORWAY', + 'Helsinki-NLP/opus-mt-sv-ZH'] +GROUP_MEMBERS = { + 'ZH': ['cmn', 'cn', 'yue', 'ze_zh', 'zh_cn', 'zh_CN', 'zh_HK', 'zh_tw', 'zh_TW', 'zh_yue', 'zhs', 'zht', 'zh'], + 'ROMANCE': ['fr', 'fr_BE', 'fr_CA', 'fr_FR', 'wa', 'frp', 'oc', 'ca', 'rm', 'lld', 'fur', 'lij', 'lmo', 'es', 'es_AR', 'es_CL', 'es_CO', 'es_CR', 'es_DO', 'es_EC', 'es_ES', 'es_GT', 'es_HN', 'es_MX', 'es_NI', 'es_PA', 'es_PE', 'es_PR', 'es_SV', 'es_UY', 'es_VE', 'pt', 'pt_br', 'pt_BR', 'pt_PT', 'gl', 'lad', 'an', 'mwl', 'it', 'it_IT', 'co', 'nap', 'scn', 'vec', 'sc', 'ro', 'la'], + 'NORTH_EU': ['de', 'nl', 'fy', 'af', 'da', 'fo', 'is', 'no', 'nb', 'nn', 'sv'], + 'SCANDINAVIA': ['da', 'fo', 'is', 'no', 'nb', 'nn', 'sv'], + 'SAMI': ['se', 'sma', 'smj', 'smn', 'sms'], + 'NORWAY': ['nb_NO', 'nb', 'nn_NO', 'nn', 'nog', 'no_nb', 'no'], + 'CELTIC': ['ga', 'cy', 'br', 'gd', 'kw', 'gv'] +} +``` + +영어를 여러 로망스 언어로 번역하는 예제입니다. 여기서는 구형 2자리 언어 코드를 사용합니다: + + +```python +>>> from transformers import MarianMTModel, MarianTokenizer + +>>> src_text = [ +... ">>fr<< this is a sentence in english that we want to translate to french", +... ">>pt<< This should go to portuguese", +... ">>es<< And this to Spanish", +... 
] + +>>> model_name = "Helsinki-NLP/opus-mt-en-ROMANCE" +>>> tokenizer = MarianTokenizer.from_pretrained(model_name) + +>>> model = MarianMTModel.from_pretrained(model_name) +>>> translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True)) +>>> tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated] +["c'est une phrase en anglais que nous voulons traduire en français", + 'Isto deve ir para o português.', + 'Y esto al español'] +``` + +## 자료[[Resources]] + +- [번역 작업 가이드](../tasks/translation) +- [요약 작업 가이드](../tasks/summarization) +- [언어 모델링 작업 가이드](../tasks/language_modeling) + +## MarianConfig + +[[autodoc]] MarianConfig + +## MarianTokenizer + +[[autodoc]] MarianTokenizer + - build_inputs_with_special_tokens + + + + +## MarianModel + +[[autodoc]] MarianModel + - forward + +## MarianMTModel + +[[autodoc]] MarianMTModel + - forward + +## MarianForCausalLM + +[[autodoc]] MarianForCausalLM + - forward + + + + +## TFMarianModel + +[[autodoc]] TFMarianModel + - call + +## TFMarianMTModel + +[[autodoc]] TFMarianMTModel + - call + + + + +## FlaxMarianModel + +[[autodoc]] FlaxMarianModel + - __call__ + +## FlaxMarianMTModel + +[[autodoc]] FlaxMarianMTModel + - __call__ + + + diff --git a/docs/source/ko/model_sharing.md b/docs/source/ko/model_sharing.md index 868cc3b231de..381150779662 100644 --- a/docs/source/ko/model_sharing.md +++ b/docs/source/ko/model_sharing.md @@ -43,7 +43,7 @@ picture-in-picture" allowfullscreen> ```py >>> model = AutoModel.from_pretrained( -... "julien-c/EsperBERTo-small", revision="v2.0.1" # tag name, or branch name, or commit hash +... "julien-c/EsperBERTo-small", revision="4c77982" # tag name, or branch name, or commit hash ... ) ``` diff --git a/docs/source/zh/model_sharing.md b/docs/source/zh/model_sharing.md index e28a000c1153..35e317bcac36 100644 --- a/docs/source/zh/model_sharing.md +++ b/docs/source/zh/model_sharing.md @@ -43,7 +43,7 @@ Model Hub的内置版本控制基于git和[git-lfs](https://git-lfs.github.com/) ```py >>> model = AutoModel.from_pretrained( -... "julien-c/EsperBERTo-small", revision="v2.0.1" # tag name, or branch name, or commit hash +... "julien-c/EsperBERTo-small", revision="4c77982" # tag name, or branch name, or commit hash ... 
) ``` diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index 2c60b359bd10..a7193e23dddc 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -331,7 +331,7 @@ def main(): config = AutoConfig.from_pretrained( args.model_name_or_path, num_labels=len(labels), - i2label=id2label, + id2label=id2label, label2id=label2id, finetuning_task="image-classification", trust_remote_code=args.trust_remote_code, diff --git a/setup.py b/setup.py index cbfcfd434285..922258d65efa 100644 --- a/setup.py +++ b/setup.py @@ -117,7 +117,7 @@ "fugashi>=1.0", "GitPython<3.1.19", "hf-doc-builder>=0.3.0", - "huggingface-hub>=0.23.2,<1.0", + "huggingface-hub>=0.24.0,<1.0", "importlib_metadata", "ipadic>=1.0.0,<2.0", "isort>=5.5.4", diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 47b43e0b9089..36cc4449aec4 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -620,6 +620,7 @@ "models.nougat": ["NougatProcessor"], "models.nystromformer": ["NystromformerConfig"], "models.olmo": ["OlmoConfig"], + "models.olmo_1124": ["Olmo1124Config"], "models.olmoe": ["OlmoeConfig"], "models.omdet_turbo": [ "OmDetTurboConfig", @@ -2919,6 +2920,13 @@ "OlmoPreTrainedModel", ] ) + _import_structure["models.olmo_1124"].extend( + [ + "Olmo1124ForCausalLM", + "Olmo1124Model", + "Olmo1124PreTrainedModel", + ] + ) _import_structure["models.olmoe"].extend( [ "OlmoeForCausalLM", @@ -5506,6 +5514,7 @@ NystromformerConfig, ) from .models.olmo import OlmoConfig + from .models.olmo_1124 import Olmo1124Config from .models.olmoe import OlmoeConfig from .models.omdet_turbo import ( OmDetTurboConfig, @@ -7523,6 +7532,11 @@ OlmoModel, OlmoPreTrainedModel, ) + from .models.olmo_1124 import ( + Olmo1124ForCausalLM, + Olmo1124Model, + Olmo1124PreTrainedModel, + ) from .models.olmoe import ( OlmoeForCausalLM, OlmoeModel, diff --git a/src/transformers/agents/agents.py b/src/transformers/agents/agents.py index 73b7186d25a3..c461c50f2959 100644 --- a/src/transformers/agents/agents.py +++ b/src/transformers/agents/agents.py @@ -1141,11 +1141,10 @@ def step(self): ) self.logger.warning("Print outputs:") self.logger.log(32, self.state["print_outputs"]) + observation = "Print outputs:\n" + self.state["print_outputs"] if result is not None: self.logger.warning("Last output from code snippet:") self.logger.log(32, str(result)) - observation = "Print outputs:\n" + self.state["print_outputs"] - if result is not None: observation += "Last output from code snippet:\n" + str(result)[:100000] current_step_logs["observation"] = observation except Exception as e: diff --git a/src/transformers/agents/monitoring.py b/src/transformers/agents/monitoring.py index 8e28a72deb2a..755418d35a56 100644 --- a/src/transformers/agents/monitoring.py +++ b/src/transformers/agents/monitoring.py @@ -18,11 +18,19 @@ from .agents import ReactAgent -def pull_message(step_log: dict): +def pull_message(step_log: dict, test_mode: bool = True): try: from gradio import ChatMessage except ImportError: - raise ImportError("Gradio should be installed in order to launch a gradio demo.") + if test_mode: + + class ChatMessage: + def __init__(self, role, content, metadata=None): + self.role = role + self.content = content + self.metadata = metadata + else: + raise ImportError("Gradio should be installed in order to 
launch a gradio demo.") if step_log.get("rationale"): yield ChatMessage(role="assistant", content=step_log["rationale"]) @@ -46,30 +54,40 @@ def pull_message(step_log: dict): ) -def stream_to_gradio(agent: ReactAgent, task: str, **kwargs): +def stream_to_gradio(agent: ReactAgent, task: str, test_mode: bool = False, **kwargs): """Runs an agent with the given task and streams the messages from the agent as gradio ChatMessages.""" try: from gradio import ChatMessage except ImportError: - raise ImportError("Gradio should be installed in order to launch a gradio demo.") + if test_mode: + + class ChatMessage: + def __init__(self, role, content, metadata=None): + self.role = role + self.content = content + self.metadata = metadata + else: + raise ImportError("Gradio should be installed in order to launch a gradio demo.") for step_log in agent.run(task, stream=True, **kwargs): if isinstance(step_log, dict): - for message in pull_message(step_log): + for message in pull_message(step_log, test_mode=test_mode): yield message - if isinstance(step_log, AgentText): - yield ChatMessage(role="assistant", content=f"**Final answer:**\n```\n{step_log.to_string()}\n```") - elif isinstance(step_log, AgentImage): + final_answer = step_log # Last log is the run's final_answer + + if isinstance(final_answer, AgentText): + yield ChatMessage(role="assistant", content=f"**Final answer:**\n```\n{final_answer.to_string()}\n```") + elif isinstance(final_answer, AgentImage): yield ChatMessage( role="assistant", - content={"path": step_log.to_string(), "mime_type": "image/png"}, + content={"path": final_answer.to_string(), "mime_type": "image/png"}, ) - elif isinstance(step_log, AgentAudio): + elif isinstance(final_answer, AgentAudio): yield ChatMessage( role="assistant", - content={"path": step_log.to_string(), "mime_type": "audio/wav"}, + content={"path": final_answer.to_string(), "mime_type": "audio/wav"}, ) else: - yield ChatMessage(role="assistant", content=str(step_log)) + yield ChatMessage(role="assistant", content=str(final_answer)) diff --git a/src/transformers/agents/python_interpreter.py b/src/transformers/agents/python_interpreter.py index fbece2bebd35..6e90f356cb92 100644 --- a/src/transformers/agents/python_interpreter.py +++ b/src/transformers/agents/python_interpreter.py @@ -848,6 +848,13 @@ def evaluate_ast( raise InterpreterError(f"{expression.__class__.__name__} is not supported.") +def truncate_print_outputs(print_outputs: str, max_len_outputs: int = MAX_LEN_OUTPUT) -> str: + if len(print_outputs) < max_len_outputs: + return print_outputs + else: + return f"Print outputs:\n{print_outputs[:max_len_outputs]}\n_Print outputs have been truncated over the limit of {max_len_outputs} characters._\n" + + def evaluate_python_code( code: str, static_tools: Optional[Dict[str, Callable]] = None, @@ -890,25 +897,12 @@ def evaluate_python_code( PRINT_OUTPUTS = "" global OPERATIONS_COUNT OPERATIONS_COUNT = 0 - for node in expression.body: - try: + try: + for node in expression.body: result = evaluate_ast(node, state, static_tools, custom_tools, authorized_imports) - except InterpreterError as e: - msg = "" - if len(PRINT_OUTPUTS) > 0: - if len(PRINT_OUTPUTS) < MAX_LEN_OUTPUT: - msg += f"Print outputs:\n{PRINT_OUTPUTS}\n====\n" - else: - msg += f"Print outputs:\n{PRINT_OUTPUTS[:MAX_LEN_OUTPUT]}\n_Print outputs were over {MAX_LEN_OUTPUT} characters, so they have been truncated._\n====\n" - msg += f"EXECUTION FAILED:\nEvaluation stopped at line '{ast.get_source_segment(code, node)}' because of the following 
error:\n{e}" - raise InterpreterError(msg) - finally: - if len(PRINT_OUTPUTS) < MAX_LEN_OUTPUT: - state["print_outputs"] = PRINT_OUTPUTS - else: - state["print_outputs"] = ( - PRINT_OUTPUTS[:MAX_LEN_OUTPUT] - + f"\n_Print outputs were over {MAX_LEN_OUTPUT} characters, so they have been truncated._" - ) - - return result + state["print_outputs"] = truncate_print_outputs(PRINT_OUTPUTS, max_len_outputs=MAX_LEN_OUTPUT) + return result + except InterpreterError as e: + msg = truncate_print_outputs(PRINT_OUTPUTS, max_len_outputs=MAX_LEN_OUTPUT) + msg += f"EXECUTION FAILED:\nEvaluation stopped at line '{ast.get_source_segment(code, node)}' because of the following error:\n{e}" + raise InterpreterError(msg) diff --git a/src/transformers/agents/tools.py b/src/transformers/agents/tools.py index 994e1bdd817b..6d3401bf30e9 100644 --- a/src/transformers/agents/tools.py +++ b/src/transformers/agents/tools.py @@ -14,6 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import ast import base64 import importlib import inspect @@ -22,6 +23,7 @@ import os import tempfile from functools import lru_cache, wraps +from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Union from huggingface_hub import create_repo, get_collection, hf_hub_download, metadata_update, upload_folder @@ -44,7 +46,7 @@ is_vision_available, logging, ) -from .agent_types import handle_agent_inputs, handle_agent_outputs +from .agent_types import ImageType, handle_agent_inputs, handle_agent_outputs logger = logging.get_logger(__name__) @@ -141,15 +143,19 @@ def validate_arguments(self, do_validate_forward: bool = True): required_attributes = { "description": str, "name": str, - "inputs": Dict, + "inputs": dict, "output_type": str, } authorized_types = ["string", "integer", "number", "image", "audio", "any", "boolean"] for attr, expected_type in required_attributes.items(): attr_value = getattr(self, attr, None) + if attr_value is None: + raise TypeError(f"You must set an attribute {attr}.") if not isinstance(attr_value, expected_type): - raise TypeError(f"You must set an attribute {attr} of type {expected_type.__name__}.") + raise TypeError( + f"Attribute {attr} should have type {expected_type.__name__}, got {type(attr_value)} instead." + ) for input_name, input_content in self.inputs.items(): assert isinstance(input_content, dict), f"Input '{input_name}' should be a dictionary." assert ( @@ -248,7 +254,6 @@ def save(self, output_dir): def from_hub( cls, repo_id: str, - model_repo_id: Optional[str] = None, token: Optional[str] = None, **kwargs, ): @@ -266,9 +271,6 @@ def from_hub( Args: repo_id (`str`): The name of the repo on the Hub where your tool is defined. - model_repo_id (`str`, *optional*): - If your tool uses a model and you want to use a different model than the default, you can pass a second - repo ID or an endpoint url to this argument. token (`str`, *optional*): The token to identify you on hf.co. If unset, will use the token generated when running `huggingface-cli login` (stored in `~/.huggingface`). 
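
For context, a minimal sketch of how a community tool is loaded once `model_repo_id` is dropped from `from_hub` above; the repo id below is purely illustrative and not part of this patch:

```python
from transformers.agents import Tool

# Illustrative repo id: any Hub repo that defines a custom Tool can be used here.
tool = Tool.from_hub("username/my-custom-tool", token=None)
print(tool.name, tool.description)
```
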
@@ -354,6 +356,9 @@ def from_hub( if tool_class.output_type != custom_tool["output_type"]: tool_class.output_type = custom_tool["output_type"] + if not isinstance(tool_class.inputs, dict): + tool_class.inputs = ast.literal_eval(tool_class.inputs) + return tool_class(**kwargs) def push_to_hub( @@ -414,7 +419,9 @@ def push_to_hub( ) @staticmethod - def from_space(space_id, name, description): + def from_space( + space_id: str, name: str, description: str, api_name: Optional[str] = None, token: Optional[str] = None + ): """ Creates a [`Tool`] from a Space given its id on the Hub. @@ -425,34 +432,73 @@ def from_space(space_id, name, description): The name of the tool. description (`str`): The description of the tool. - + api_name (`str`, *optional*): + The specific api_name to use, if the space has several tabs. If not precised, will default to the first available api. + token (`str`, *optional*): + Add your token to access private spaces or increase your GPU quotas. Returns: [`Tool`]: - The created tool. + The Space, as a tool. - Example: + Examples: + ``` + image_generator = Tool.from_space( + space_id="black-forest-labs/FLUX.1-schnell", + name="image-generator", + description="Generate an image from a prompt" + ) + image = image_generator("Generate an image of a cool surfer in Tahiti") ``` - tool = Tool.from_space("black-forest-labs/FLUX.1-schnell", "image-generator", "Generate an image from a prompt") + ``` + face_swapper = Tool.from_space( + "tuan2308/face-swap", + "face_swapper", + "Tool that puts the face shown on the first image on the second image. You can give it paths to images.", + ) + image = face_swapper('./aymeric.jpeg', './ruth.jpg') ``` """ - from gradio_client import Client + from gradio_client import Client, handle_file + from gradio_client.utils import is_http_url_like class SpaceToolWrapper(Tool): - def __init__(self, space_id, name, description): - self.client = Client(space_id) + def __init__( + self, + space_id: str, + name: str, + description: str, + api_name: Optional[str] = None, + token: Optional[str] = None, + ): + self.client = Client(space_id, hf_token=token) self.name = name self.description = description - space_description = self.client.view_api(return_format="dict")["named_endpoints"] - route = list(space_description.keys())[0] - space_description_route = space_description[route] + space_description = self.client.view_api(return_format="dict", print_info=False)["named_endpoints"] + + # If api_name is not defined, take the first of the available APIs for this space + if api_name is None: + api_name = list(space_description.keys())[0] + logger.warning( + f"Since `api_name` was not defined, it was automatically set to the first avilable API: `{api_name}`." 
+ ) + self.api_name = api_name + + try: + space_description_api = space_description[api_name] + except KeyError: + raise KeyError(f"Could not find specified {api_name=} among available api names.") + self.inputs = {} - for parameter in space_description_route["parameters"]: + for parameter in space_description_api["parameters"]: if not parameter["parameter_has_default"]: + parameter_type = parameter["type"]["type"] + if parameter_type == "object": + parameter_type = "any" self.inputs[parameter["parameter_name"]] = { - "type": parameter["type"]["type"], + "type": parameter_type, "description": parameter["python_type"]["description"], } - output_component = space_description_route["returns"][0]["component"] + output_component = space_description_api["returns"][0]["component"] if output_component == "Image": self.output_type = "image" elif output_component == "Audio": @@ -460,10 +506,33 @@ def __init__(self, space_id, name, description): else: self.output_type = "any" - def forward(self, *args, **kwargs): - return self.client.predict(*args, **kwargs)[0] # Usually the first output is the result + def sanitize_argument_for_prediction(self, arg): + if isinstance(arg, ImageType): + temp_file = tempfile.NamedTemporaryFile(suffix=".png", delete=False) + arg.save(temp_file.name) + arg = temp_file.name + if (isinstance(arg, (str, Path)) and Path(arg).exists() and Path(arg).is_file()) or is_http_url_like( + arg + ): + arg = handle_file(arg) + return arg - return SpaceToolWrapper(space_id, name, description) + def forward(self, *args, **kwargs): + # Preprocess args and kwargs: + args = list(args) + for i, arg in enumerate(args): + args[i] = self.sanitize_argument_for_prediction(arg) + for arg_name, arg in kwargs.items(): + kwargs[arg_name] = self.sanitize_argument_for_prediction(arg) + + output = self.client.predict(*args, api_name=self.api_name, **kwargs) + if isinstance(output, tuple) or isinstance(output, list): + return output[ + 0 + ] # Sometime the space also returns the generation seed, in which case the result is at index 0 + return output + + return SpaceToolWrapper(space_id, name, description, api_name=api_name, token=token) @staticmethod def from_gradio(gradio_tool): diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index 60f9f34cf861..e49eab86b4e1 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -71,6 +71,8 @@ class PretrainedConfig(PushToHubMixin): outputs of the model during inference. - **attribute_map** (`Dict[str, str]`) -- A dict that maps model specific attribute names to the standardized naming of attributes. + - **base_model_tp_plan** (`Dict[str, Any]`) -- A dict that maps sub-modules FQNs of a base model to a tensor + parallel plan applied to the sub-module when `model.tensor_parallel` is called. 
Common attributes (present in all subclasses): @@ -194,6 +196,7 @@ class PretrainedConfig(PushToHubMixin): sub_configs: Dict[str, "PretrainedConfig"] = {} is_composition: bool = False attribute_map: Dict[str, str] = {} + base_model_tp_plan: Optional[Dict[str, Any]] = None _auto_class: Optional[str] = None def __setattr__(self, key, value): @@ -848,6 +851,9 @@ def to_diff_dict(self) -> Dict[str, Any]: if "_attn_implementation_internal" in serializable_config_dict: del serializable_config_dict["_attn_implementation_internal"] + # Do not serialize `base_model_tp_plan` for now + if "base_model_tp_plan" in serializable_config_dict: + del serializable_config_dict["base_model_tp_plan"] return serializable_config_dict @@ -867,6 +873,9 @@ def to_dict(self) -> Dict[str, Any]: del output["_commit_hash"] if "_attn_implementation_internal" in output: del output["_attn_implementation_internal"] + # Do not serialize `base_model_tp_plan` for now + if "base_model_tp_plan" in output: + del output["base_model_tp_plan"] # Transformers version when serializing the model output["transformers_version"] = __version__ diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index a633f54a4af1..9543b58ad40d 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -24,7 +24,7 @@ "fugashi": "fugashi>=1.0", "GitPython": "GitPython<3.1.19", "hf-doc-builder": "hf-doc-builder>=0.3.0", - "huggingface-hub": "huggingface-hub>=0.23.2,<1.0", + "huggingface-hub": "huggingface-hub>=0.24.0,<1.0", "importlib_metadata": "importlib_metadata", "ipadic": "ipadic>=1.0.0,<2.0", "isort": "isort>=5.5.4", diff --git a/src/transformers/generation/flax_utils.py b/src/transformers/generation/flax_utils.py index 88535b44e9c4..8e87ead7fdd5 100644 --- a/src/transformers/generation/flax_utils.py +++ b/src/transformers/generation/flax_utils.py @@ -398,7 +398,11 @@ def generate( ) generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length else: # by default let's always generate 10 new tokens - generation_config.max_length = generation_config.max_length + input_ids_seq_length + if generation_config.max_length == GenerationConfig().max_length: + generation_config.max_length = generation_config.max_length + input_ids_seq_length + max_position_embeddings = getattr(self.config, "max_position_embeddings", None) + if max_position_embeddings is not None: + generation_config.max_length = min(generation_config.max_length, max_position_embeddings) if generation_config.min_length is not None and generation_config.min_length > generation_config.max_length: raise ValueError( diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 6e6d5b8bdce7..53cd2df3a49c 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -1452,10 +1452,11 @@ def _prepare_generated_length( ): generation_config.max_length -= inputs_tensor.shape[1] elif has_default_max_length: # by default let's always generate 20 new tokens - generation_config.max_length = generation_config.max_length + input_ids_length - max_position_embeddings = getattr(self.config, "max_position_embeddings", None) - if max_position_embeddings is not None: - generation_config.max_length = min(generation_config.max_length, max_position_embeddings) + if generation_config.max_length == GenerationConfig().max_length: + generation_config.max_length = generation_config.max_length + input_ids_length + 
max_position_embeddings = getattr(self.config, "max_position_embeddings", None) + if max_position_embeddings is not None: + generation_config.max_length = min(generation_config.max_length, max_position_embeddings) # same for min length if generation_config.min_new_tokens is not None: diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py index b24c4c9e0d5c..4b236b9155f1 100755 --- a/src/transformers/integrations/integration_utils.py +++ b/src/transformers/integrations/integration_utils.py @@ -918,7 +918,7 @@ def on_train_end(self, args, state, control, model=None, tokenizer=None, **kwarg if self._log_model.is_enabled and self._initialized and state.is_world_process_zero: from ..trainer import Trainer - fake_trainer = Trainer(args=args, model=model, processing_class=tokenizer) + fake_trainer = Trainer(args=args, model=model, processing_class=tokenizer, eval_dataset=["fake"]) with tempfile.TemporaryDirectory() as temp_dir: fake_trainer.save_model(temp_dir) metadata = ( diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 0df59d1db8e0..7672df0b9a0e 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -52,9 +52,11 @@ find_pruneable_heads_and_indices, id_tensor_storage, is_torch_greater_or_equal_than_1_13, + is_torch_greater_or_equal_than_2_4, prune_conv1d_layer, prune_layer, prune_linear_layer, + translate_to_torch_parallel_style, ) from .quantizers import AutoHfQuantizer, HfQuantizer from .quantizers.quantizers_utils import get_module_from_name @@ -94,7 +96,7 @@ replace_return_docstrings, strtobool, ) -from .utils.hub import convert_file_size_to_int, create_and_tag_model_card, get_checkpoint_shard_files +from .utils.hub import create_and_tag_model_card, get_checkpoint_shard_files from .utils.import_utils import ( ENV_VARS_TRUE_VALUES, is_sagemaker_mp_enabled, @@ -381,92 +383,6 @@ def check_support_param_buffer_assignment(model_to_load, state_dict, start_prefi return False -def shard_checkpoint( - state_dict: Dict[str, torch.Tensor], max_shard_size: Union[int, str] = "10GB", weights_name: str = WEIGHTS_NAME -): - """ - Splits a model state dictionary in sub-checkpoints so that the final size of each sub-checkpoint does not exceed a - given size. - - The sub-checkpoints are determined by iterating through the `state_dict` in the order of its keys, so there is no - optimization made to make each sub-checkpoint as close as possible to the maximum size passed. For example, if the - limit is 10GB and we have weights of sizes [6GB, 6GB, 2GB, 6GB, 2GB, 2GB] they will get sharded as [6GB], [6+2GB], - [6+2+2GB] and not [6+2+2GB], [6+2GB], [6GB]. - - - - If one of the model's weight is bigger than `max_shard_size`, it will end up in its own sub-checkpoint which will - have a size greater than `max_shard_size`. - - - - Args: - state_dict (`Dict[str, torch.Tensor]`): The state dictionary of a model to save. - max_shard_size (`int` or `str`, *optional*, defaults to `"10GB"`): - The maximum size of each sub-checkpoint. If expressed as a string, needs to be digits followed by a unit - (like `"5MB"`). - weights_name (`str`, *optional*, defaults to `"pytorch_model.bin"`): - The name of the model save file. - """ - logger.warning( - "Note that `shard_checkpoint` is deprecated and will be removed in v4.44. 
We recommend you using " - "split_torch_state_dict_into_shards from huggingface_hub library" - ) - max_shard_size = convert_file_size_to_int(max_shard_size) - - sharded_state_dicts = [{}] - last_block_size = 0 - total_size = 0 - storage_id_to_block = {} - - for key, weight in state_dict.items(): - # when bnb serialization is used the weights in the state dict can be strings - # check: https://github.com/huggingface/transformers/pull/24416 for more details - if isinstance(weight, str): - continue - else: - storage_id = id_tensor_storage(weight) - - # If a `weight` shares the same underlying storage as another tensor, we put `weight` in the same `block` - if storage_id in storage_id_to_block and weight.device != torch.device("meta"): - block_id = storage_id_to_block[storage_id] - sharded_state_dicts[block_id][key] = weight - continue - - weight_size = weight.numel() * dtype_byte_size(weight.dtype) - # If this weight is going to tip up over the maximal size, we split, but only if we have put at least one - # weight in the current shard. - if last_block_size + weight_size > max_shard_size and len(sharded_state_dicts[-1]) > 0: - sharded_state_dicts.append({}) - last_block_size = 0 - - sharded_state_dicts[-1][key] = weight - last_block_size += weight_size - total_size += weight_size - storage_id_to_block[storage_id] = len(sharded_state_dicts) - 1 - - # If we only have one shard, we return it - if len(sharded_state_dicts) == 1: - return {weights_name: sharded_state_dicts[0]}, None - - # Otherwise, let's build the index - weight_map = {} - shards = {} - for idx, shard in enumerate(sharded_state_dicts): - shard_file = weights_name.replace(".bin", f"-{idx+1:05d}-of-{len(sharded_state_dicts):05d}.bin") - shard_file = shard_file.replace( - ".safetensors", f"-{idx + 1:05d}-of-{len(sharded_state_dicts):05d}.safetensors" - ) - shards[shard_file] = shard - for key in shard.keys(): - weight_map[key] = shard_file - - # Add the metadata - metadata = {"total_size": total_size} - index = {"metadata": metadata, "weight_map": weight_map} - return shards, index - - def load_sharded_checkpoint(model, folder, strict=True, prefer_safe=True): """ This is the same as @@ -1013,7 +929,10 @@ def _load_state_dict_into_meta_model( param_to = "cpu" if is_fsdp_enabled() and not is_local_dist_rank_0(): param_to = "meta" - value = type(value)(value.data.to(param_to), **value.__dict__) + val_kwargs = {} + if hasattr(module, "weight") and module.weight.__class__.__name__ == "Int8Params": + val_kwargs["requires_grad"] = False + value = type(value)(value.data.to(param_to), **val_kwargs, **value.__dict__) setattr(module, tensor_name, value) # TODO: consider removing used param_parts from state_dict before return @@ -1409,6 +1328,12 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix # Has support for a `QuantoQuantizedCache` instance as `past_key_values` _supports_quantized_cache = False + # A tensor parallel plan to be applied to the model when TP is enabled. For + # top-level models, this attribute is currently defined in respective model + # code. For base models, this attribute comes from + # `config.base_model_tp_plan` during `post_init`. 
+ _tp_plan = None + @property def dummy_inputs(self) -> Dict[str, torch.Tensor]: """ @@ -1453,6 +1378,9 @@ def post_init(self): """ self.init_weights() self._backward_compatibility_gradient_checkpointing() + # If current model is a base model, attach `base_model_tp_plan` from config + if self.base_model is self: + self._tp_plan = self.config.base_model_tp_plan def dequantize(self): """ @@ -3482,6 +3410,11 @@ def from_pretrained( # Cache path to the GGUF file gguf_path = None + tp_plan = kwargs.pop("tp_plan", None) + if tp_plan is not None and tp_plan != "auto": + # TODO: we can relax this check when we support taking tp_plan from a json file, for example. + raise ValueError(f"tp_plan supports 'auto' only for now but got {tp_plan}.") + if is_fsdp_enabled(): low_cpu_mem_usage = True @@ -4083,6 +4016,7 @@ def from_pretrained( # Instantiate model. init_contexts = [no_init_weights(_enable=_fast_init)] + tp_device = None if is_deepspeed_zero3_enabled() and not is_quantized: import deepspeed @@ -4095,6 +4029,16 @@ def from_pretrained( f"Using `low_cpu_mem_usage=True` or a `device_map` requires Accelerate: `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`" ) init_contexts.append(init_empty_weights()) + elif tp_plan is not None: + if not torch.distributed.is_initialized(): + raise ValueError("Tensor Parallel requires torch.distributed to be initialized first.") + + # Detect the accelerator on the machine. If no accelerator is available, it returns CPU. + device_type = torch._C._get_accelerator().type + device_module = torch.get_device_module(device_type) + # Get device with index assuming equal number of devices per host + tp_device = torch.device(device_type, torch.distributed.get_rank() % device_module.device_count()) + init_contexts.append(tp_device) if is_deepspeed_zero3_enabled() and is_quantized: init_contexts.append(set_quantized_state()) @@ -4228,32 +4172,38 @@ def from_pretrained( if dtype_orig is not None: torch.set_default_dtype(dtype_orig) - ( - model, - missing_keys, - unexpected_keys, - mismatched_keys, - offload_index, - error_msgs, - ) = cls._load_pretrained_model( - model, - state_dict, - loaded_state_dict_keys, # XXX: rename? - resolved_archive_file, - pretrained_model_name_or_path, - ignore_mismatched_sizes=ignore_mismatched_sizes, - sharded_metadata=sharded_metadata, - _fast_init=_fast_init, - low_cpu_mem_usage=low_cpu_mem_usage, - device_map=device_map, - offload_folder=offload_folder, - offload_state_dict=offload_state_dict, - dtype=torch_dtype, - hf_quantizer=hf_quantizer, - keep_in_fp32_modules=keep_in_fp32_modules, - gguf_path=gguf_path, - weights_only=weights_only, - ) + load_contexts = [] + # Make sure we load onto targeted device + if tp_device is not None: + load_contexts.append(tp_device) + + with ContextManagers(load_contexts): + ( + model, + missing_keys, + unexpected_keys, + mismatched_keys, + offload_index, + error_msgs, + ) = cls._load_pretrained_model( + model, + state_dict, + loaded_state_dict_keys, # XXX: rename? 
+ resolved_archive_file, + pretrained_model_name_or_path, + ignore_mismatched_sizes=ignore_mismatched_sizes, + sharded_metadata=sharded_metadata, + _fast_init=_fast_init, + low_cpu_mem_usage=low_cpu_mem_usage, + device_map=device_map, + offload_folder=offload_folder, + offload_state_dict=offload_state_dict, + dtype=torch_dtype, + hf_quantizer=hf_quantizer, + keep_in_fp32_modules=keep_in_fp32_modules, + gguf_path=gguf_path, + weights_only=weights_only, + ) # make sure token embedding weights are still tied if needed model.tie_weights() @@ -4337,6 +4287,16 @@ def from_pretrained( } return model, loading_info + if tp_plan is not None: + assert tp_device is not None, "tp_device not set!" + if not model.supports_tp_plan: + raise NotImplementedError("This model does not have a tensor parallel plan.") + # Assuming sharding the model onto the world + world_size = torch.distributed.get_world_size() + device_mesh = torch.distributed.init_device_mesh(tp_device.type, (world_size,)) + # Apply Tensor Parallelism + model.tensor_parallel(device_mesh) + return model @classmethod @@ -5026,6 +4986,56 @@ def _is_quantized_training_enabled(self): return self.hf_quantizer.is_trainable + @property + def supports_tp_plan(self): + """ + Returns whether the model has a tensor parallelism plan. + """ + if self._tp_plan is not None: + return True + # Check if base model has a TP plan + if getattr(self.base_model, "_tp_plan", None) is not None: + return True + return False + + def tensor_parallel(self, device_mesh): + """ + Tensor parallelize the model across the given device mesh. + + Args: + device_mesh (`torch.distributed.DeviceMesh`): + The device mesh to use for tensor parallelism. + """ + if not is_torch_greater_or_equal_than_2_4: + raise EnvironmentError("tensor parallel is only supported for `torch>=2.5`.") + + # Tensor parallelize a nn.Module based on the `_tp_plan` attribute of the module. + # No op if `_tp_plan` attribute does not exist under the module. + # This is a helper function to be used with `model.apply` to recursively + # parallelize a model. + def tplize(mod: torch.nn.Module) -> None: + tp_plan = getattr(mod, "_tp_plan", None) + if tp_plan is None: + return + logger.debug(f"Applying tensor parallel to {mod.__class__.__name__}: {tp_plan}") + # In model configs, we use a neutral type (string) to specify + # parallel styles, here we translate them into torch TP types. + # Using tree_map because `tp_plan` is a dict. + tp_plan = torch.utils._pytree.tree_map( + translate_to_torch_parallel_style, + tp_plan, + ) + # Apply TP to current module. + torch.distributed.tensor.parallel.parallelize_module( + mod, + device_mesh=device_mesh, + parallelize_plan=tp_plan, + ) + + # `apply` is a native method of `nn.Module` that recursively applies a + # function to every submodule. 
+ self.apply(tplize) + @property def loss_function(self): if getattr(self.config, "loss_type", None) is not None: diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 9155f629e63f..0d4b9f2f94de 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -177,6 +177,7 @@ nougat, nystromformer, olmo, + olmo_1124, olmoe, omdet_turbo, oneformer, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 48625ea3f346..7f0182b50085 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -195,6 +195,7 @@ ("nougat", "VisionEncoderDecoderConfig"), ("nystromformer", "NystromformerConfig"), ("olmo", "OlmoConfig"), + ("olmo_1124", "Olmo1124Config"), ("olmoe", "OlmoeConfig"), ("omdet-turbo", "OmDetTurboConfig"), ("oneformer", "OneFormerConfig"), @@ -510,6 +511,7 @@ ("nougat", "Nougat"), ("nystromformer", "Nyströmformer"), ("olmo", "OLMo"), + ("olmo_1124", "OLMo November 2024"), ("olmoe", "OLMoE"), ("omdet-turbo", "OmDet-Turbo"), ("oneformer", "OneFormer"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 67c539fca664..5206972b72ef 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -184,6 +184,7 @@ ("nllb-moe", "NllbMoeModel"), ("nystromformer", "NystromformerModel"), ("olmo", "OlmoModel"), + ("olmo_1124", "Olmo1124Model"), ("olmoe", "OlmoeModel"), ("omdet-turbo", "OmDetTurboForObjectDetection"), ("oneformer", "OneFormerModel"), @@ -516,6 +517,7 @@ ("mvp", "MvpForCausalLM"), ("nemotron", "NemotronForCausalLM"), ("olmo", "OlmoForCausalLM"), + ("olmo_1124", "Olmo1124ForCausalLM"), ("olmoe", "OlmoeForCausalLM"), ("open-llama", "OpenLlamaForCausalLM"), ("openai-gpt", "OpenAIGPTLMHeadModel"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 7674ea51a533..4ed67df0e84b 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -348,6 +348,7 @@ ), ), ("olmo", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), + ("olmo_1124", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), ("olmoe", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), ( "omdet-turbo", diff --git a/src/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py b/src/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py index 5f972353c4f4..d6640045b80c 100644 --- a/src/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py +++ b/src/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py @@ -249,7 +249,7 @@ def convert_blip2_checkpoint( {"image": original_pixel_values, "text_input": [caption]}, match_head="itm" ) logits = hf_model( - pixel_values=original_pixel_values, + pixel_values=pixel_values, input_ids=input_ids, attention_mask=attention_mask, use_image_text_matching_head=True, @@ -274,7 +274,7 @@ def convert_blip2_checkpoint( {"image": original_pixel_values, "text_input": [caption]}, match_head="itc" ) logits = hf_model( - pixel_values=original_pixel_values, + pixel_values=pixel_values, input_ids=input_ids, attention_mask=attention_mask, use_image_text_matching_head=False, diff --git a/src/transformers/models/blip_2/modeling_blip_2.py 
b/src/transformers/models/blip_2/modeling_blip_2.py index 08e42d1c8f70..d34528b74314 100644 --- a/src/transformers/models/blip_2/modeling_blip_2.py +++ b/src/transformers/models/blip_2/modeling_blip_2.py @@ -2203,7 +2203,7 @@ def forward( logger.warning_once( "Expanding inputs for image tokens in BLIP-2 should be done in processing. " "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." + "Using processors without these attributes in the config is deprecated and will throw an error in v4.50." ) inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1) attention_mask = torch.cat( @@ -2326,7 +2326,7 @@ def generate( logger.warning_once( "Expanding inputs for image tokens in BLIP-2 should be done in processing. " "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." + "Using processors without these attributes in the config is deprecated and will throw an error in v4.50." ) inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1) attention_mask = torch.cat( diff --git a/src/transformers/models/blip_2/processing_blip_2.py b/src/transformers/models/blip_2/processing_blip_2.py index c68523784128..4129920f9b36 100644 --- a/src/transformers/models/blip_2/processing_blip_2.py +++ b/src/transformers/models/blip_2/processing_blip_2.py @@ -153,7 +153,7 @@ def __call__( logger.warning_once( "Expanding inputs for image tokens in BLIP-2 should be done in processing. " "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." + "Using processors without these attributes in the config is deprecated and will throw an error in v4.50." 
) # cast to desired return tensors type diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py index b215fb6561bf..0261f997da11 100644 --- a/src/transformers/models/cohere/modeling_cohere.py +++ b/src/transformers/models/cohere/modeling_cohere.py @@ -1068,7 +1068,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( return causal_mask -# Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with Llama->Cohere +# TODO: re-enable check: Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with Llama->Cohere class CohereForCausalLM(CoherePreTrainedModel, GenerationMixin): _tied_weights_keys = ["lm_head.weight"] diff --git a/src/transformers/models/esm/configuration_esm.py b/src/transformers/models/esm/configuration_esm.py index 9634a20015f2..083664747ddd 100644 --- a/src/transformers/models/esm/configuration_esm.py +++ b/src/transformers/models/esm/configuration_esm.py @@ -87,11 +87,14 @@ class EsmConfig(PretrainedConfig): ```python >>> from transformers import EsmModel, EsmConfig - >>> # Initializing a ESM facebook/esm-1b style configuration >>> configuration = EsmConfig() + >>> # Initializing a ESM facebook/esm-1b style configuration + >>> configuration = EsmConfig(vocab_size=33) - >>> # Initializing a model from the configuration >>> model = ESMModel(configuration) + >>> # Initializing a model from the configuration + >>> model = EsmModel(configuration) - >>> # Accessing the model configuration >>> configuration = model.config + >>> # Accessing the model configuration + >>> configuration = model.config ```""" model_type = "esm" diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py index fa3fadc4349a..6fead73eced7 100644 --- a/src/transformers/models/gemma/modeling_gemma.py +++ b/src/transformers/models/gemma/modeling_gemma.py @@ -720,7 +720,10 @@ def __init__(self, config: GemmaConfig): [GemmaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) self.norm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.gradient_checkpointing = False + if getattr(config, "pretraining_tp", 1) != 1: + logger.warn("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.") # Initialize weights and apply final processing self.post_init() @@ -982,6 +985,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( class GemmaForCausalLM(GemmaPreTrainedModel, GenerationMixin): _tied_weights_keys = ["lm_head.weight"] + _tp_plan = {"lm_head": "colwise_rep"} def __init__(self, config): super().__init__(config) diff --git a/src/transformers/models/gemma2/modeling_gemma2.py b/src/transformers/models/gemma2/modeling_gemma2.py index 626e5537fc06..6a3d8f27fb17 100644 --- a/src/transformers/models/gemma2/modeling_gemma2.py +++ b/src/transformers/models/gemma2/modeling_gemma2.py @@ -740,7 +740,10 @@ def __init__(self, config: Gemma2Config): [Gemma2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) self.norm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.gradient_checkpointing = False + if getattr(config, "pretraining_tp", 1) != 1: + logger.warn("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.") # Initialize weights and apply final processing self.post_init() @@ -961,6 +964,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( class Gemma2ForCausalLM(Gemma2PreTrainedModel, GenerationMixin): 
_tied_weights_keys = ["lm_head.weight"] + _tp_plan = {"lm_head": "colwise_rep"} def __init__(self, config): super().__init__(config) diff --git a/src/transformers/models/glm/modeling_glm.py b/src/transformers/models/glm/modeling_glm.py index 248ec4021791..58a89d90b44f 100644 --- a/src/transformers/models/glm/modeling_glm.py +++ b/src/transformers/models/glm/modeling_glm.py @@ -708,6 +708,8 @@ def __init__(self, config: GlmConfig): dim=config.head_dim // 2, max_position_embeddings=config.max_position_embeddings, base=config.rope_theta ) self.gradient_checkpointing = False + if getattr(config, "pretraining_tp", 1) != 1: + logger.warn("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.") # Initialize weights and apply final processing self.post_init() @@ -967,6 +969,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( class GlmForCausalLM(GlmPreTrainedModel, GenerationMixin): _tied_weights_keys = ["lm_head.weight"] + _tp_plan = {"lm_head": "colwise_rep"} def __init__(self, config: GlmConfig): super().__init__(config) diff --git a/src/transformers/models/instructblip/modeling_instructblip.py b/src/transformers/models/instructblip/modeling_instructblip.py index a78a3b668774..e5622185bc39 100644 --- a/src/transformers/models/instructblip/modeling_instructblip.py +++ b/src/transformers/models/instructblip/modeling_instructblip.py @@ -1471,7 +1471,7 @@ def forward( logger.warning_once( "Expanding inputs for image tokens in InstructBLIP should be done in processing. " "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." + "Using processors without these attributes in the config is deprecated and will throw an error in v4.50." ) inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1) attention_mask = torch.cat( @@ -1610,7 +1610,7 @@ def generate( logger.warning_once( "Expanding inputs for image tokens in InstructBLIP should be done in processing. " "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." + "Using processors without these attributes in the config is deprecated and will throw an error in v4.50." ) inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1) attention_mask = torch.cat( diff --git a/src/transformers/models/instructblip/processing_instructblip.py b/src/transformers/models/instructblip/processing_instructblip.py index 3d48839d376c..a96d97fb07e1 100644 --- a/src/transformers/models/instructblip/processing_instructblip.py +++ b/src/transformers/models/instructblip/processing_instructblip.py @@ -148,7 +148,7 @@ def __call__( logger.warning_once( "Expanding inputs for image tokens in InstructBLIP should be done in processing. " "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." + "Using processors without these attributes in the config is deprecated and will throw an error in v4.50." 
) # cast to desired return tensors type after concatenating diff --git a/src/transformers/models/llama/configuration_llama.py b/src/transformers/models/llama/configuration_llama.py index a3667e065345..98d5ecdd2a4f 100644 --- a/src/transformers/models/llama/configuration_llama.py +++ b/src/transformers/models/llama/configuration_llama.py @@ -141,6 +141,16 @@ class LlamaConfig(PretrainedConfig): model_type = "llama" keys_to_ignore_at_inference = ["past_key_values"] + # Default tensor parallel plan for base model `LlamaModel` + base_model_tp_plan = { + "layers.*.self_attn.q_proj": "colwise", + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.mlp.gate_proj": "colwise", + "layers.*.mlp.up_proj": "colwise", + "layers.*.mlp.down_proj": "rowwise", + } def __init__( self, diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 4d95f01849d6..679296648a91 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -21,7 +21,6 @@ from typing import List, Optional, Tuple, Union import torch -import torch.nn.functional as F import torch.utils.checkpoint from torch import nn @@ -240,25 +239,7 @@ def __init__(self, config): self.act_fn = ACT2FN[config.hidden_act] def forward(self, x): - if self.config.pretraining_tp > 1: - slice = self.intermediate_size // self.config.pretraining_tp - gate_proj_slices = self.gate_proj.weight.split(slice, dim=0) - up_proj_slices = self.up_proj.weight.split(slice, dim=0) - down_proj_slices = self.down_proj.weight.split(slice, dim=1) - - gate_proj = torch.cat( - [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1 - ) - up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1) - - intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2) - down_proj = [ - F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp) - ] - down_proj = sum(down_proj) - else: - down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) return down_proj @@ -320,31 +301,14 @@ def forward( ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() - if self.config.pretraining_tp > 1: - key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp - query_slices = self.q_proj.weight.split( - (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0 - ) - key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) - value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) - - query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)] - query_states = torch.cat(query_states, dim=-1) - - key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)] - key_states = torch.cat(key_states, dim=-1) - - value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)] - value_states = torch.cat(value_states, dim=-1) - - else: - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = 
self.v_proj(hidden_states) - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # use -1 to infer num_heads and num_key_value_heads as they may vary if tensor parallel is used + query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) if position_embeddings is None: logger.warning_once( @@ -386,12 +350,7 @@ def forward( attn_output = attn_output.reshape(bsz, q_len, -1) - if self.config.pretraining_tp > 1: - attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2) - o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1) - attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)]) - else: - attn_output = self.o_proj(attn_output) + attn_output = self.o_proj(attn_output) if not output_attentions: attn_weights = None @@ -564,9 +523,10 @@ def forward( key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # use -1 to infer num_heads and num_key_value_heads as they may vary if tensor parallel is used + query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) if position_embeddings is None: logger.warning_once( @@ -850,7 +810,10 @@ def __init__(self, config: LlamaConfig): ) self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.rotary_emb = LlamaRotaryEmbedding(config=config) + self.gradient_checkpointing = False + if getattr(config, "pretraining_tp", 1) != 1: + logger.warn("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.") # Initialize weights and apply final processing self.post_init() @@ -1113,6 +1076,7 @@ class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ... 
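The Llama hunks above drop the manual `pretraining_tp` weight slicing and reshape with `-1` so the same forward pass works whether or not the q/k/v/o and MLP projections have been sharded according to `base_model_tp_plan`. Below is a rough sketch of how the new path is meant to be driven, modeled on the `tests/tp/test_tp.py` script added later in this diff; the checkpoint name, the two-process `torchrun` launch, and calling `tensor_parallel` on the causal-LM wrapper (the test only exercises the bare `LlamaModel`) are assumptions, not part of this PR.

```python
# Sketch, not a drop-in script: run with `torchrun --nproc_per_node 2 tp_sketch.py`
# on a machine with two CUDA devices (launch command and checkpoint are assumptions).
import os

import torch

from transformers import AutoModelForCausalLM

rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])

device = torch.device(f"cuda:{rank}")
torch.distributed.init_process_group("nccl", device_id=device)
device_mesh = torch.distributed.init_device_mesh("cuda", (world_size,))

# Each rank loads the full weights in this sketch; `tensor_parallel` then shards the
# attention and MLP projections in place according to `config.base_model_tp_plan`.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct", torch_dtype=torch.bfloat16
).to(device)
model.eval()
model.tensor_parallel(device_mesh)

inputs = torch.randint(model.config.vocab_size, (1, 16), device=device)
with torch.no_grad():
    logits = model(inputs).logits
print(logits.shape)  # (1, 16, vocab_size)
```

The per-layer `"colwise"`/`"rowwise"` entries in the plan are also why the attention code now reshapes with `view(bsz, q_len, -1, self.head_dim)`: after sharding, each rank only holds its own slice of the heads, so the head count can no longer be read from the config.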
class LlamaForCausalLM(LlamaPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}

    def __init__(self, config):
        super().__init__(config)
@@ -1211,13 +1175,8 @@ def forward(
        )

        hidden_states = outputs[0]
-        if self.config.pretraining_tp > 1:
-            lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
-            logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
-            logits = torch.cat(logits, dim=-1)
-        else:
-            # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
-            logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])

        loss = None
        if labels is not None:
diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py
index 6d6bf4a6f38e..e8536ee50f94 100644
--- a/src/transformers/models/llava/modeling_llava.py
+++ b/src/transformers/models/llava/modeling_llava.py
@@ -485,7 +485,7 @@ def forward(
                "Expanding inputs for image tokens in LLaVa should be done in processing. "
                "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
                "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
+                "Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
            )
            # prefill stage vs decoding stage (legacy behavior copied)
            if input_ids.shape[1] != 1:
diff --git a/src/transformers/models/llava/processing_llava.py b/src/transformers/models/llava/processing_llava.py
index 0ff40acc4052..08caa3d1d8a7 100644
--- a/src/transformers/models/llava/processing_llava.py
+++ b/src/transformers/models/llava/processing_llava.py
@@ -58,10 +58,19 @@ class LlavaProcessor(ProcessorMixin):
            in a chat into a tokenizable string.
        image_token (`str`, *optional*, defaults to `"<image>"`):
            Special token used to denote image location.
+        num_additional_image_tokens (`int`, *optional*, defaults to 0):
+            Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other
+            extra tokens appended, no need to set this arg.
""" attributes = ["image_processor", "tokenizer"] - valid_kwargs = ["chat_template", "patch_size", "vision_feature_select_strategy", "image_token"] + valid_kwargs = [ + "chat_template", + "patch_size", + "vision_feature_select_strategy", + "image_token", + "num_additional_image_tokens", + ] image_processor_class = "AutoImageProcessor" tokenizer_class = "AutoTokenizer" @@ -73,9 +82,11 @@ def __init__( vision_feature_select_strategy=None, chat_template=None, image_token="", # set the default and let users change if they have peculiar special tokens in rare cases + num_additional_image_tokens=0, **kwargs, ): self.patch_size = patch_size + self.num_additional_image_tokens = num_additional_image_tokens self.vision_feature_select_strategy = vision_feature_select_strategy self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token super().__init__(image_processor, tokenizer, chat_template=chat_template) @@ -147,9 +158,11 @@ def __call__( # Replace the image token with the expanded image token sequence pixel_values = image_inputs["pixel_values"] height, width = get_image_size(to_numpy_array(pixel_values[0])) - num_image_tokens = (height // self.patch_size) * (width // self.patch_size) + 1 + num_image_tokens = (height // self.patch_size) * ( + width // self.patch_size + ) + self.num_additional_image_tokens if self.vision_feature_select_strategy == "default": - num_image_tokens -= 1 + num_image_tokens -= self.num_additional_image_tokens prompt_strings = [] for sample in text: @@ -160,7 +173,7 @@ def __call__( "Expanding inputs for image tokens in LLaVa should be done in processing. " "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly " "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." + "Using processors without these attributes in the config is deprecated and will throw an error in v4.50." ) text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index 2d23c48225cd..269663c7d614 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -868,7 +868,7 @@ def forward( "Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. " "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly " "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." + "Using processors without these attributes in the config is deprecated and will throw an error in v4.50." ) if input_ids.shape[1] != 1: inputs_embeds = inputs_embeds.to(image_features.dtype) diff --git a/src/transformers/models/llava_next/processing_llava_next.py b/src/transformers/models/llava_next/processing_llava_next.py index 310083c1ce53..09f9e621a587 100644 --- a/src/transformers/models/llava_next/processing_llava_next.py +++ b/src/transformers/models/llava_next/processing_llava_next.py @@ -61,10 +61,19 @@ class LlavaNextProcessor(ProcessorMixin): in a chat into a tokenizable string. 
        image_token (`str`, *optional*, defaults to `"<image>"`):
            Special token used to denote image location.
+        num_additional_image_tokens (`int`, *optional*, defaults to 0):
+            Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other
+            extra tokens appended, no need to set this arg.
    """

    attributes = ["image_processor", "tokenizer"]
-    valid_kwargs = ["chat_template", "patch_size", "vision_feature_select_strategy", "image_token"]
+    valid_kwargs = [
+        "chat_template",
+        "patch_size",
+        "vision_feature_select_strategy",
+        "image_token",
+        "num_additional_image_tokens",
+    ]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"
@@ -76,9 +85,11 @@ def __init__(
        vision_feature_select_strategy=None,
        chat_template=None,
        image_token="<image>",  # set the default and let users change if they have peculiar special tokens in rare cases
+        num_additional_image_tokens=0,
        **kwargs,
    ):
        self.patch_size = patch_size
+        self.num_additional_image_tokens = num_additional_image_tokens
        self.vision_feature_select_strategy = vision_feature_select_strategy
        self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
        super().__init__(image_processor, tokenizer, chat_template=chat_template)
@@ -143,7 +154,7 @@ def __call__(
                "Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. "
                "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
                "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
-                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
+                "Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
            )
        else:
            image_sizes = iter(image_inputs["image_sizes"])
@@ -155,7 +166,7 @@ def __call__(
                    orig_height, orig_width = image_size
                    num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
                    if self.vision_feature_select_strategy == "default":
-                        num_image_tokens -= 1
+                        num_image_tokens -= self.num_additional_image_tokens
                    sample = sample.replace(self.image_token, "<placeholder>" * num_image_tokens, 1)
                prompt_strings.append(sample)
            prompt_strings = [sample.replace("<placeholder>", self.image_token) for sample in prompt_strings]
@@ -178,7 +189,7 @@ def _get_number_of_features(self, orig_height: int, orig_width: int, height: int
            orig_height, orig_width, patches_height, patches_width, scale_height, scale_width
        )
        # The base patch covers the entire image (+1 for the CLS)
-        base_features = patches_height * patches_width + 1
+        base_features = patches_height * patches_width + self.num_additional_image_tokens
        num_image_tokens = unpadded_features + newline_features + base_features
        return num_image_tokens
diff --git a/src/transformers/models/llava_next_video/processing_llava_next_video.py b/src/transformers/models/llava_next_video/processing_llava_next_video.py
index a42aafcadd64..db4999a2a8ae 100644
--- a/src/transformers/models/llava_next_video/processing_llava_next_video.py
+++ b/src/transformers/models/llava_next_video/processing_llava_next_video.py
@@ -58,12 +58,22 @@ class LlavaNextVideoProcessor(ProcessorMixin):
            Special token used to denote video location.
        image_token (`str`, *optional*, defaults to `"<image>"`):
            Special token used to denote image location.
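The processor hunks in this part of the diff swap the hard-coded `+ 1` CLS offset for a configurable `num_additional_image_tokens`, and only subtract it again when `vision_feature_select_strategy == "default"`, i.e. when the extra vision embeddings are dropped from the features. A minimal sketch of the resulting placeholder count; the 336×336 resolution and 14-pixel patches are illustrative assumptions rather than values taken from a specific checkpoint:

```python
from typing import Optional


def num_image_placeholder_tokens(
    height: int,
    width: int,
    patch_size: int,
    num_additional_image_tokens: int,
    vision_feature_select_strategy: Optional[str],
) -> int:
    """Sketch of the placeholder count used when expanding the image token in the prompt."""
    # Patch grid covering the image, plus any extra vision embeddings (e.g. a CLS token).
    tokens = (height // patch_size) * (width // patch_size) + num_additional_image_tokens
    # The "default" strategy drops those extra embeddings from the vision features,
    # so the same number of placeholders is removed again.
    if vision_feature_select_strategy == "default":
        tokens -= num_additional_image_tokens
    return tokens


# Illustrative numbers: 336x336 image, 14-pixel patches, one CLS token from the vision tower.
print(num_image_placeholder_tokens(336, 336, 14, 1, "default"))  # 576
print(num_image_placeholder_tokens(336, 336, 14, 1, "full"))     # 577
```

This is also why the updated tests further down set `processor.num_additional_image_tokens = 1` alongside `patch_size` before asserting the expanded sequence lengths.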
+ num_additional_image_tokens (`int`, *optional*, defaults to 0): + Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other + extra tokens appended, no need to set this arg. """ # video and image processor share same args, but have different processing logic # only image processor config is saved in the hub attributes = ["video_processor", "image_processor", "tokenizer"] - valid_kwargs = ["chat_template", "patch_size", "vision_feature_select_strategy", "image_token", "video_token"] + valid_kwargs = [ + "chat_template", + "patch_size", + "vision_feature_select_strategy", + "image_token", + "video_token", + "num_additional_image_tokens", + ] image_processor_class = "LlavaNextImageProcessor" video_processor_class = "LlavaNextVideoImageProcessor" tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast") @@ -78,9 +88,11 @@ def __init__( vision_feature_select_strategy=None, video_token="", padding_side="right") + EXPECTED_TEXT_COMPLETION = [ + "Simply put, the theory of relativity states that 1) the speed of light is constant, 2) the speed of light", + ] + max_generation_length = tokenizer(EXPECTED_TEXT_COMPLETION, return_tensors="pt", padding=True)[ + "input_ids" + ].shape[-1] + + # Load model + device = "cpu" + dtype = torch.bfloat16 + cache_implementation = "static" + attn_implementation = "sdpa" + batch_size = 1 + generation_config = GenerationConfig( + use_cache=True, + cache_implementation=cache_implementation, + max_length=max_generation_length, + cache_config={ + "batch_size": batch_size, + "max_cache_len": max_generation_length, + }, + ) + model = Olmo1124ForCausalLM.from_pretrained( + olmo_1124_model, + device_map=device, + torch_dtype=dtype, + attn_implementation=attn_implementation, + generation_config=generation_config, + ) + + prompts = ["Simply put, the theory of relativity states that "] + prompt_tokens = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device) + prompt_token_ids = prompt_tokens["input_ids"] + max_new_tokens = max_generation_length - prompt_token_ids.shape[-1] + + # Static Cache + eager + eager_generated_ids = model.generate( + **prompt_tokens, max_new_tokens=max_new_tokens, do_sample=False, cache_implementation=cache_implementation + ) + eager_generated_text = tokenizer.batch_decode(eager_generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, eager_generated_text) + + # Static Cache + export + exported_program = convert_and_export_with_cache(model) + ep_generated_ids = TorchExportableModuleWithStaticCache.generate( + exported_program=exported_program, prompt_token_ids=prompt_token_ids, max_new_tokens=max_new_tokens + ) + ep_generated_text = tokenizer.batch_decode(ep_generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION, ep_generated_text) diff --git a/tests/models/video_llava/test_modeling_video_llava.py b/tests/models/video_llava/test_modeling_video_llava.py index 090907b164e8..14b079665ab6 100644 --- a/tests/models/video_llava/test_modeling_video_llava.py +++ b/tests/models/video_llava/test_modeling_video_llava.py @@ -625,12 +625,14 @@ def test_expansion_in_processing_images(self): # check processing with expansion of inputs processor.vision_feature_select_strategy = "default" processor.patch_size = 14 + processor.num_additional_image_tokens = 1 inputs_expanded = processor(prompt, images=image, return_tensors="pt").to(torch_device, torch.float16) self.assertTrue(inputs_expanded.input_ids.shape[-1] == 274) # check processing 
without expansion of inputs (legacy behavior) processor.vision_feature_select_strategy = None processor.patch_size = None + processor.num_additional_image_tokens = None inputs = processor(prompt, images=image, return_tensors="pt").to(torch_device, torch.float16) self.assertTrue(inputs.input_ids.shape[-1] == 19) @@ -657,12 +659,14 @@ def test_expansion_in_processing(self): # check processing with expansion of inputs processor.vision_feature_select_strategy = "default" processor.patch_size = 14 + processor.num_additional_image_tokens = 1 inputs_expanded = processor(prompt, videos=video_file, return_tensors="pt").to(torch_device, torch.float16) self.assertTrue(inputs_expanded.input_ids.shape[-1] == 2074) # check processing without expansion of inputs (legacy behavior) processor.vision_feature_select_strategy = None processor.patch_size = None + processor.num_additional_image_tokens = None inputs = processor(prompt, videos=video_file, return_tensors="pt").to(torch_device, torch.float16) self.assertTrue(inputs.input_ids.shape[-1] == 19) diff --git a/tests/models/vipllava/test_modeling_vipllava.py b/tests/models/vipllava/test_modeling_vipllava.py index 25670d782a98..4f501fc10a02 100644 --- a/tests/models/vipllava/test_modeling_vipllava.py +++ b/tests/models/vipllava/test_modeling_vipllava.py @@ -374,12 +374,14 @@ def test_expansion_in_processing(self): # check processing with expansion of inputs processor.vision_feature_select_strategy = "default" processor.patch_size = 14 + processor.num_additional_image_tokens = 1 inputs_expanded = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16) self.assertTrue(inputs_expanded.input_ids.shape[-1] == 593) # check processing without expansion of inputs (legacy behavior) processor.vision_feature_select_strategy = None processor.patch_size = None + processor.num_additional_image_tokens = None inputs = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16) self.assertTrue(inputs.input_ids.shape[-1] == 18) diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 94b5e175bf88..3ef30fc8ae55 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -849,29 +849,29 @@ def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=No ): self.skipTest(reason=f"`supports_gradient_checkpointing` is False for {model_class.__name__}.") - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.use_cache = False - config.return_dict = True - model = model_class(config) + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.use_cache = False + config.return_dict = True + model = model_class(config) - model.to(torch_device) - model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs) - model.train() + model.to(torch_device) + model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs) + model.train() - # unfreeze additional layers - for p in model.parameters(): - p.requires_grad_(True) + # unfreeze additional layers + for p in model.parameters(): + p.requires_grad_(True) - optimizer = torch.optim.SGD(model.parameters(), lr=0.01) + optimizer = torch.optim.SGD(model.parameters(), lr=0.01) - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - loss = model(**inputs).loss - loss.backward() - optimizer.step() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss 
= model(**inputs).loss + loss.backward() + optimizer.step() - for k, v in model.named_parameters(): - if v.requires_grad: - self.assertTrue(v.grad is not None, f"{k} in {model_class.__name__} has no gradient!") + for k, v in model.named_parameters(): + if v.requires_grad: + self.assertTrue(v.grad is not None, f"{k} in {model_class.__name__} has no gradient!") def test_training(self): if not self.model_tester.is_training: diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index a3bbbf3c9e97..f04a4255556b 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -1461,6 +1461,38 @@ def test_continue_final_message(self): "<|im_start|>system\nsystem message<|im_end|>\n<|im_start|>user\nuser message<|im_end|>\n<|im_start|>assistant\nassistant message", ) + @require_jinja + def test_continue_final_message_with_trim(self): + """Regression test for chat templates with trimming: https://github.com/huggingface/transformers/pull/34214""" + + dummy_template = """ + {%- for message in messages %} + {{- "<|im_start|>" + message['role'] + "\n" + message['content'] | trim + "<|im_end|>" + "\n"}} + {%- endfor %}""" + dummy_conversation = [ + {"role": "system", "content": "system message"}, + {"role": "user", "content": "user message"}, + {"role": "assistant", "content": "assistant message "}, # Note the trailing whitespace + ] + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + output = tokenizer.apply_chat_template( + dummy_conversation, chat_template=dummy_template, tokenize=False, continue_final_message=False + ) + self.assertEqual( + output, + "<|im_start|>system\nsystem message<|im_end|>\n<|im_start|>user\nuser message<|im_end|>\n<|im_start|>assistant\nassistant message<|im_end|>\n", + ) + prefill_output = tokenizer.apply_chat_template( + dummy_conversation, chat_template=dummy_template, tokenize=False, continue_final_message=True + ) + # Assert that the final message is unterminated + self.assertEqual( + prefill_output, + "<|im_start|>system\nsystem message<|im_end|>\n<|im_start|>user\nuser message<|im_end|>\n<|im_start|>assistant\nassistant message", + ) + @require_jinja def test_chat_template_dict(self): dummy_template_1 = "{{'a'}}" diff --git a/tests/tp/test_tp.py b/tests/tp/test_tp.py new file mode 100644 index 000000000000..2139a648867b --- /dev/null +++ b/tests/tp/test_tp.py @@ -0,0 +1,91 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +from transformers import is_torch_available +from transformers.models.llama.configuration_llama import LlamaConfig +from transformers.models.llama.modeling_llama import LlamaModel +from transformers.testing_utils import ( + TestCasePlus, + execute_subprocess_async, + get_torch_dist_unique_port, + require_torch_multi_gpu, +) + + +if is_torch_available(): + import torch + + +class TestTensorParallel(TestCasePlus): + @require_torch_multi_gpu + def test_tp(self): + distributed_args = f"""--nproc_per_node={torch.cuda.device_count()} + --master_port={get_torch_dist_unique_port()} + {self.test_file_dir}/test_tp.py + """.split() + output_dir = self.get_auto_remove_tmp_dir() + args = f"--output_dir {output_dir} --report_to none".split() + cmd = ["torchrun"] + distributed_args + args + print(cmd) + execute_subprocess_async(cmd, env=self.get_env()) + # successful return here == success - any errors would have caused an error in the sub-call + + +if __name__ == "__main__": + # The script below is meant to be run under torch.distributed, on a machine with multiple GPUs: + # CUDA_VISIBLE_DEVICES=0,1 RUN_SLOW=1 pytest -sv tests/tp/test_tp.py + # or + # PYTHONPATH="src" python -m torch.distributed.run --nproc_per_node 2 ./tests/tp/test_tp.py + + if not is_torch_available(): + exit(0) + + # Test settings + model_id = "meta-llama/Meta-Llama-3-8B-Instruct" + bs = 4 + seqlen = 64 + + # Get distributed settings + rank = int(os.environ["RANK"]) + world_size = int(os.environ["WORLD_SIZE"]) + + # Initialize distributed + device = torch.device(f"cuda:{rank}") + torch.distributed.init_process_group("nccl", device_id=device) + device_mesh = torch.distributed.init_device_mesh("cuda", (world_size,)) + + # Get model config + config = LlamaConfig.from_pretrained(model_id) + # Shrink model size + config.num_hidden_layers //= 8 + config.vocab_size //= 8 + + # Instantiate model + with device: + model = LlamaModel(config) + + model.eval() + + # Tensor Parallel + if world_size > 1: + model.tensor_parallel(device_mesh) + + # Run model + inputs = torch.randint(config.vocab_size, (bs, seqlen), device=device) + with torch.no_grad(): + out = model(inputs) + + assert out.last_hidden_state.shape == torch.Size([bs, seqlen, config.hidden_size]) diff --git a/tests/trainer/test_trainer_fsdp.py b/tests/trainer/test_trainer_fsdp.py index 4bcf5de04520..eca6a30664f0 100644 --- a/tests/trainer/test_trainer_fsdp.py +++ b/tests/trainer/test_trainer_fsdp.py @@ -117,6 +117,33 @@ def test_trainer(self): execute_subprocess_async(cmd, env=self.get_env()) # successful return here == success - any errors would have caused an error in the sub-call + class TestFSDPTrainerWrap(TestCasePlus): + @require_accelerate + @require_torch_multi_gpu + @require_fsdp + def test_trainer(self): + output_dir = self.get_auto_remove_tmp_dir() + cmd = [ + "accelerate", + "launch", + "--use_fsdp", + "--main_process_port", + f"{get_torch_dist_unique_port()}", + "--num_processes", + f"{torch.cuda.device_count()}", + "--fsdp_transformer_layer_cls_to_wrap", + "GPT2Block", + f"{self.test_file_dir}/test_trainer_fsdp.py", + "--output_dir", + f"{output_dir}", + "--report_to", + "none", + "--auto_find_batch_size", + "True", + ] + execute_subprocess_async(cmd, env=self.get_env()) + # successful return here == success - any errors would have caused an error in the sub-call + if __name__ == "__main__": parser = HfArgumentParser((Seq2SeqTrainingArguments,)) diff --git a/tests/utils/test_modeling_utils.py b/tests/utils/test_modeling_utils.py index 
5fd6251224c3..96a30df7e558 100644 --- a/tests/utils/test_modeling_utils.py +++ b/tests/utils/test_modeling_utils.py @@ -105,7 +105,6 @@ _find_disjoint, _find_identical, dtype_byte_size, - shard_checkpoint, ) from transformers.pytorch_utils import isin_mps_friendly @@ -668,71 +667,6 @@ def test_no_super_init_config_and_model(self): for p1, p2 in zip(model.parameters(), new_model.parameters()): self.assertTrue(torch.equal(p1, p2)) - def test_shard_checkpoint(self): - # This is the model we will use, total size 340,000 bytes. - model = torch.nn.Sequential( - torch.nn.Linear(100, 200, bias=False), # size 80,000 - torch.nn.Linear(200, 200, bias=False), # size 160,000 - torch.nn.Linear(200, 100, bias=False), # size 80,000 - torch.nn.Linear(100, 50, bias=False), # size 20,000 - ) - state_dict = model.state_dict() - - with self.subTest("No shard when max size is bigger than model size"): - shards, index = shard_checkpoint(state_dict) - self.assertIsNone(index) - self.assertDictEqual(shards, {WEIGHTS_NAME: state_dict}) - - with self.subTest("Test sharding, no weights bigger than max size"): - shards, index = shard_checkpoint(state_dict, max_shard_size="300kB") - # Split is first two layers then last two. - self.assertDictEqual( - index, - { - "metadata": {"total_size": 340000}, - "weight_map": { - "0.weight": "pytorch_model-00001-of-00002.bin", - "1.weight": "pytorch_model-00001-of-00002.bin", - "2.weight": "pytorch_model-00002-of-00002.bin", - "3.weight": "pytorch_model-00002-of-00002.bin", - }, - }, - ) - - shard1 = {"0.weight": state_dict["0.weight"], "1.weight": state_dict["1.weight"]} - shard2 = {"2.weight": state_dict["2.weight"], "3.weight": state_dict["3.weight"]} - self.assertDictEqual( - shards, {"pytorch_model-00001-of-00002.bin": shard1, "pytorch_model-00002-of-00002.bin": shard2} - ) - - with self.subTest("Test sharding with weights bigger than max size"): - shards, index = shard_checkpoint(state_dict, max_shard_size="100kB") - # Split is first layer, second layer then last 2. - self.assertDictEqual( - index, - { - "metadata": {"total_size": 340000}, - "weight_map": { - "0.weight": "pytorch_model-00001-of-00003.bin", - "1.weight": "pytorch_model-00002-of-00003.bin", - "2.weight": "pytorch_model-00003-of-00003.bin", - "3.weight": "pytorch_model-00003-of-00003.bin", - }, - }, - ) - - shard1 = {"0.weight": state_dict["0.weight"]} - shard2 = {"1.weight": state_dict["1.weight"]} - shard3 = {"2.weight": state_dict["2.weight"], "3.weight": state_dict["3.weight"]} - self.assertDictEqual( - shards, - { - "pytorch_model-00001-of-00003.bin": shard1, - "pytorch_model-00002-of-00003.bin": shard2, - "pytorch_model-00003-of-00003.bin": shard3, - }, - ) - def test_checkpoint_sharding_local_bin(self): model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
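The deleted `test_shard_checkpoint` covered the in-repo `shard_checkpoint` helper, whose import is also removed above; checkpoint sharding now goes through `huggingface_hub.split_torch_state_dict_into_shards`, the replacement that the helper's deprecation message pointed to. A sketch of the first scenario from the deleted test against that API; the attributes read from the returned `StateDictSplit` (`is_sharded`, `filename_to_tensors`) are assumptions to verify against the installed `huggingface_hub` version:

```python
# Sketch only: mirrors the toy model from the deleted test (340,000 bytes of fp32 weights).
import torch
from huggingface_hub import split_torch_state_dict_into_shards

model = torch.nn.Sequential(
    torch.nn.Linear(100, 200, bias=False),  # 80,000 bytes
    torch.nn.Linear(200, 200, bias=False),  # 160,000 bytes
    torch.nn.Linear(200, 100, bias=False),  # 80,000 bytes
    torch.nn.Linear(100, 50, bias=False),   # 20,000 bytes
)
state_dict = model.state_dict()

# Shard budget larger than the model: everything stays in one file.
split = split_torch_state_dict_into_shards(state_dict, max_shard_size=1_000_000)
assert not split.is_sharded

# 300,000-byte budget: weights are spread over multiple files, mirroring the deleted assertions.
split = split_torch_state_dict_into_shards(state_dict, max_shard_size=300_000)
assert split.is_sharded
for filename, tensor_names in split.filename_to_tensors.items():
    print(filename, tensor_names)
```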