diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index a65b8cafe562..eaa4b3b2f824 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -63,7 +63,6 @@ jobs: commit_id=$GITHUB_SHA fi commit_msg=$(git show -s --format=%s | cut -c1-70) - df -h python3 benchmark/llama.py "${{ github.head_ref || github.ref_name }}" "$commit_id" "$commit_msg" env: HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} diff --git a/benchmark/init_db.sql b/benchmark/init_db.sql index 4381b99cea66..573cc11518e8 100644 --- a/benchmark/init_db.sql +++ b/benchmark/init_db.sql @@ -7,6 +7,10 @@ CREATE TABLE IF NOT EXISTS benchmarks ( created_at timestamp without time zone NOT NULL DEFAULT (current_timestamp AT TIME ZONE 'UTC') ); +CREATE INDEX IF NOT EXISTS benchmarks_benchmark_id_idx ON benchmarks (benchmark_id); + +CREATE INDEX IF NOT EXISTS benchmarks_branch_idx ON benchmarks (branch); + CREATE TABLE IF NOT EXISTS device_measurements ( measurement_id SERIAL PRIMARY KEY, benchmark_id int REFERENCES benchmarks (benchmark_id), @@ -17,6 +21,8 @@ CREATE TABLE IF NOT EXISTS device_measurements ( time timestamp without time zone NOT NULL DEFAULT (current_timestamp AT TIME ZONE 'UTC') ); +CREATE INDEX IF NOT EXISTS device_measurements_branch_idx ON device_measurements (benchmark_id); + CREATE TABLE IF NOT EXISTS model_measurements ( measurement_id SERIAL PRIMARY KEY, benchmark_id int REFERENCES benchmarks (benchmark_id), @@ -24,3 +30,4 @@ CREATE TABLE IF NOT EXISTS model_measurements ( time timestamp without time zone NOT NULL DEFAULT (current_timestamp AT TIME ZONE 'UTC') ); +CREATE INDEX IF NOT EXISTS model_measurements_branch_idx ON model_measurements (benchmark_id); diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile index 7ad4e96d62cd..b597f5a73fb5 100644 --- a/docker/transformers-all-latest-gpu/Dockerfile +++ b/docker/transformers-all-latest-gpu/Dockerfile @@ -9,7 +9,7 @@ SHELL ["sh", "-lc"] 
# The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant # to be used as arguments for docker build (so far). -ARG PYTORCH='2.4.0' +ARG PYTORCH='2.5.1' # (not always a valid torch version) ARG INTEL_TORCH_EXT='2.3.0' # Example: `cu102`, `cu113`, etc. diff --git a/docker/transformers-pytorch-gpu/Dockerfile b/docker/transformers-pytorch-gpu/Dockerfile index 62578ad0f361..f22d77b9372d 100644 --- a/docker/transformers-pytorch-gpu/Dockerfile +++ b/docker/transformers-pytorch-gpu/Dockerfile @@ -11,7 +11,7 @@ ARG REF=main RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF # If set to nothing, will install the latest version -ARG PYTORCH='2.4.0' +ARG PYTORCH='2.5.1' ARG TORCH_VISION='' ARG TORCH_AUDIO='' # Example: `cu102`, `cu113`, etc. diff --git a/docs/TRANSLATING.md b/docs/TRANSLATING.md index 49747821f476..64dced450987 100644 --- a/docs/TRANSLATING.md +++ b/docs/TRANSLATING.md @@ -1,57 +1,70 @@ -### Translating the Transformers documentation into your language +# Translating the Transformers documentation into your language -As part of our mission to democratize machine learning, we'd love to make the Transformers library available in many more languages! Follow the steps below if you want to help translate the documentation into your language 🙏. +As part of our mission to democratize machine learning, we aim to make the Transformers library available in many more languages! Follow the steps below to help translate the documentation into your language. -**🗞️ Open an issue** +## Open an Issue -To get started, navigate to the [Issues](https://github.com/huggingface/transformers/issues) page of this repo and check if anyone else has opened an issue for your language. If not, open a new issue by selecting the "Translation template" from the "New issue" button. +1. Navigate to the Issues page of this repository. +2. 
Check if anyone has already opened an issue for your language. +3. If not, create a new issue by selecting the "Translation template" from the "New issue" button. +4. Post a comment indicating which chapters you’d like to work on, and we’ll add your name to the list. -Once an issue exists, post a comment to indicate which chapters you'd like to work on, and we'll add your name to the list. +## Fork the Repository +1. First, fork the Transformers repo by clicking the Fork button in the top-right corner. +2. Clone your fork to your local machine for editing with the following command: -**🍴 Fork the repository** + ```bash + git clone https://github.com/YOUR-USERNAME/transformers.git + ``` + + Replace `YOUR-USERNAME` with your GitHub username. -First, you'll need to [fork the Transformers repo](https://docs.github.com/en/get-started/quickstart/fork-a-repo). You can do this by clicking on the **Fork** button on the top-right corner of this repo's page. +## Copy-paste the English version with a new language code -Once you've forked the repo, you'll want to get the files on your local machine for editing. You can do that by cloning the fork with Git as follows: +The documentation files are organized in the following directory: -```bash -git clone https://github.com/YOUR-USERNAME/transformers.git -``` +- **docs/source**: This contains all documentation materials organized by language. -**📋 Copy-paste the English version with a new language code** +To copy the English version to your new language directory: -The documentation files are in one leading directory: +1. Navigate to your fork of the repository: -- [`docs/source`](https://github.com/huggingface/transformers/tree/main/docs/source): All the documentation materials are organized here by language. 
+ ```bash + cd ~/path/to/transformers/docs + ``` -You'll only need to copy the files in the [`docs/source/en`](https://github.com/huggingface/transformers/tree/main/docs/source/en) directory, so first navigate to your fork of the repo and run the following: + Replace `~/path/to` with your actual path. -```bash -cd ~/path/to/transformers/docs -cp -r source/en source/LANG-ID -``` +2. Run the following command: -Here, `LANG-ID` should be one of the ISO 639-1 or ISO 639-2 language codes -- see [here](https://www.loc.gov/standards/iso639-2/php/code_list.php) for a handy table. + ```bash + cp -r source/en source/LANG-ID + ``` -**✍️ Start translating** + Replace `LANG-ID` with the appropriate ISO 639-1 or ISO 639-2 language code (see [this table](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) for reference). -The fun part comes - translating the text! +## Start translating -The first thing we recommend is translating the part of the `_toctree.yml` file that corresponds to your doc chapter. This file is used to render the table of contents on the website. +Begin translating the text! -> 🙋 If the `_toctree.yml` file doesn't yet exist for your language, you can create one by copy-pasting from the English version and deleting the sections unrelated to your chapter. Just make sure it exists in the `docs/source/LANG-ID/` directory! +1. Start with the `_toctree.yml` file that corresponds to your documentation chapter. This file is essential for rendering the table of contents on the website. -The fields you should add are `local` (with the name of the file containing the translation; e.g. `autoclass_tutorial`), and `title` (with the title of the doc in your language; e.g. 
`Load pretrained instances with an AutoClass`) -- as a reference, here is the `_toctree.yml` for [English](https://github.com/huggingface/transformers/blob/main/docs/source/en/_toctree.yml): + - If the `_toctree.yml` file doesn’t exist for your language, create one by copying the English version and removing unrelated sections. + - Ensure it is placed in the `docs/source/LANG-ID/` directory. -```yaml -- sections: - - local: pipeline_tutorial # Do not change this! Use the same name for your .md file - title: Pipelines for inference # Translate this! - ... - title: Tutorials # Translate this! -``` + Here’s an example structure for the `_toctree.yml` file: -Once you have translated the `_toctree.yml` file, you can start translating the [MDX](https://mdxjs.com/) files associated with your docs chapter. + ```yaml + - sections: + - local: pipeline_tutorial # Keep this name for your .md file + title: Pipelines for Inference # Translate this + ... + title: Tutorials # Translate this + ``` -> 🙋 If you'd like others to help you with the translation, you should [open an issue](https://github.com/huggingface/transformers/issues) and tag @stevhliu. +2. Once you’ve translated the `_toctree.yml`, move on to translating the associated MDX files. + +## Collaborate and share + +If you'd like assistance with your translation, open an issue and tag `@stevhliu`. Feel free to share resources or glossaries to ensure consistent terminology. 
diff --git a/docs/source/ar/_toctree.yml b/docs/source/ar/_toctree.yml index bd45925c64cb..e66ed3381e2c 100644 --- a/docs/source/ar/_toctree.yml +++ b/docs/source/ar/_toctree.yml @@ -111,22 +111,22 @@ - sections: - local: fast_tokenizers title: استخدم مجزئيات النصوص السريعة من 🤗 Tokenizers -# - local: multilingual -# title: تشغيل الاستنتاج باستخدام نماذج متعددة اللغات -# - local: create_a_model -# title: استخدام واجهات برمجة التطبيقات الخاصة بالنموذج -# - local: custom_models -# title: مشاركة نموذج مخصص -# - local: chat_templating -# title: قوالب لنماذج الدردشة + - local: multilingual + title: الاستدلال باستخدام نماذج متعددة اللغات + - local: create_a_model + title: استخدام واجهات برمجة التطبيقات الخاصة بالنموذج + - local: custom_models + title: مشاركة نموذج مخصص + - local: chat_templating + title: قوالب لنماذج الدردشة # - local: trainer # title: المدرب # - local: sagemaker # title: تشغيل التدريب على Amazon SageMaker # - local: serialization # title: التصدير إلى ONNX -# - local: tflite -# title: التصدير إلى TFLite + - local: tflite + title: التصدير إلى TFLite # - local: torchscript # title: التصدير إلى TorchScript # - local: benchmarks @@ -137,8 +137,8 @@ # title: موارد المجتمع # - local: troubleshooting # title: استكشاف الأخطاء وإصلاحها -# - local: gguf -# title: التوافق مع ملفات GGUF + - local: gguf + title: التوافق مع ملفات GGUF title: أدلة المطورين # - sections: # - local: quantization/overview diff --git a/docs/source/ar/chat_templating.md b/docs/source/ar/chat_templating.md new file mode 100644 index 000000000000..90f4ac820e14 --- /dev/null +++ b/docs/source/ar/chat_templating.md @@ -0,0 +1,835 @@ +# قوالب نماذج الدردشة + +## مقدمة + +تعد **الدردشة** أحد استخدامات نماذج اللغات الكبيرة (LLMs) شائعة الاستخدام بشكل متزايد. ففي سياق الدردشة، وبدلاً من متابعة سلسلة نصية واحدة (كما هو الحال مع نماذج اللغات القياسية)، يواصل النموذج بدلاً من ذلك محادثة تتكون من رسالة واحدة أو أكثر، تتضمن كل منها دورًا، مثل "المستخدم" أو "المساعد"، بالإضافة إلى نص الرسالة. 
+ +وكما هو الحال مع تقسيم النص إلى رموز (tokenization)، تتوقع النماذج المختلفة تنسيقات إدخال مختلفة تمامًا للمحادثة. لهذا السبب أضفنا **قوالب الدردشة** كميزة جديدة. تُعد قوالب المحادثة جزءًا من tokenizer. تحدد هذه القوالب كيفية تحويل المحادثات، والتي يتم تمثيلها كقوائم من الرسائل، إلى سلسلة نصية واحدة قابلة للتقسيم إلى رموز بالتنسيق الذي يتوقعه النموذج. + +دعونا نجعل هذا ملموسًا بمثال سريع باستخدام نموذج `BlenderBot`. لدى BlenderBot قالب افتراضي بسيط للغاية، والذي يضيف في الغالب مسافات بيضاء بين جولات الحوار: + +```python +>>> from transformers import AutoTokenizer +>>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") + +>>> chat = [ +... {"role": "user", "content": "Hello, how are you?"}, +... {"role": "assistant", "content": "I'm doing great. How can I help you today?"}, +... {"role": "user", "content": "I'd like to show off how chat templating works!"}, +... ] + +>>> tokenizer.apply_chat_template(chat, tokenize=False) +" Hello, how are you? I'm doing great. How can I help you today? I'd like to show off how chat templating works!</s>" +``` + +لاحظ كيف تم ضغط الدردشة بأكملها في سلسلة واحدة. إذا استخدمنا `tokenize=True`، وهو الإعداد الافتراضي، فسيتم أيضًا تقسيم السلسلة إلى رموز نيابة عنا. ولكن، لنشاهد قالبًا أكثر تعقيدًا في العمل، دعونا نستخدم نموذج `mistralai/Mistral-7B-Instruct-v0.1`. + +```python +>>> from transformers import AutoTokenizer +>>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1") + +>>> chat = [ +... {"role": "user", "content": "Hello, how are you?"}, +... {"role": "assistant", "content": "I'm doing great. How can I help you today?"}, +... {"role": "user", "content": "I'd like to show off how chat templating works!"}, +... ] + +>>> tokenizer.apply_chat_template(chat, tokenize=False) +"<s>[INST] Hello, how are you? [/INST]I'm doing great. How can I help you today?</s> [INST] I'd like to show off how chat templating works! 
[/INST]" +``` + +لاحظ كيف أضاف المجزىء اللغوى tokenizer رموز التحكم `[INST]` و `[/INST]` للإشارة إلى بداية ونهاية رسائل المستخدم (ولكن ليس رسائل المساعد!) ، وتم تكثيف المحادثة بأكملها في سلسلة نصية واحدة. إذا استخدمنا `tokenize=True` ، وهو الإعداد الافتراضي ، فسيتم أيضًا تقسيم تلك السلسلة إلى رموز. + +حاول الآن استخدام نفس الشفرة، لكن مع استبدال النموذج بـ `HuggingFaceH4/zephyr-7b-beta` ، وستحصل على: +```text +<|user|> +Hello, how are you? +<|assistant|> +I'm doing great. How can I help you today? +<|user|> +I'd like to show off how chat templating works! +``` +تم ضبط كل من Zephyr و Mistral-Instruct من نفس النموذج الأصلي ، Mistral-7B-v0.1. ومع ذلك ، فقد تم تدريبهم بتنسيقات دردشة مختلفة تمامًا. بدون قوالب المحادثة، ستضطر إلى كتابة شفرة تنسيق يدويًا لكل نموذج ، ومن السهل جدًا ارتكاب أخطاء بسيطة تؤثر على الأداء! تُدير قوالب المحادثة تفاصيل التنسيق نيابةً عنك ، مما يُتيح لك كتابة شفرة عامة تعمل مع أي نموذج. + +## كيف أستخدم قوالب الدردشة؟ + +كما رأيت في المثال السابق، من السهل استخدام قوالب الدردشة. قم ببساطة بإنشاء قائمة من الرسائل، مع مفتاحي `role` و`content`، ثم قم بتمريرها إلى [`~PreTrainedTokenizer.apply_chat_template`] . بمجرد قيامك بذلك، ستحصل على مخرجات جاهزة للاستخدام! عند استخدام قوالب الدردشة كإدخال لتوليد نصوص بواسطة النموذج، فمن الجيد أيضًا استخدام `add_generation_prompt=True` لإضافة [مطالبات توليد النصوص](#what-are-generation-prompts). 
+ +فيما يلي مثال على إعداد الإدخال لـ `model.generate()`، باستخدام Zephyr مرة أخرى: + +```python +from transformers import AutoModelForCausalLM, AutoTokenizer + +checkpoint = "HuggingFaceH4/zephyr-7b-beta" +tokenizer = AutoTokenizer.from_pretrained(checkpoint) +model = AutoModelForCausalLM.from_pretrained(checkpoint) # قد ترغب في استخدام bfloat16 و/أو الانتقال إلى GPU هنا + +messages = [ + { + "role": "system", + "content": "You are a friendly chatbot who always responds in the style of a pirate", + }, + {"role": "user", "content": "How many helicopters can a human eat in one sitting?"}, + ] +tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt") +print(tokenizer.decode(tokenized_chat[0])) +``` +سيؤدي هذا إلى إنتاج سلسلة نصية بتنسيق الإدخال الذي يتوقعه Zephyr. + +```text +<|system|> +You are a friendly chatbot who always responds in the style of a pirate +<|user|> +How many helicopters can a human eat in one sitting? +<|assistant|> +``` + +الآن بعد أن تم تنسيق الإدخال بشكل صحيح لـ Zephyr، يمكننا استخدام النموذج لإنشاء رد على سؤال المستخدم: + +```python +outputs = model.generate(tokenized_chat, max_new_tokens=128) +print(tokenizer.decode(outputs[0])) +``` + +سيؤدي هذا إلى ما يلي: + +```text +<|system|> +You are a friendly chatbot who always responds in the style of a pirate +<|user|> +How many helicopters can a human eat in one sitting? +<|assistant|> +Matey, I'm afraid I must inform ye that humans cannot eat helicopters. Helicopters are not food, they are flying machines. Food is meant to be eaten, like a hearty plate o' grog, a savory bowl o' stew, or a delicious loaf o' bread. But helicopters, they be for transportin' and movin' around, not for eatin'. So, I'd say none, me hearties. None at all. +``` + +كان ذلك سهلاً بعد كل شيء ! + + + +## هل هناك قنوات معالجة أوتوماتيكية للدردشة؟ + +نعم يوجد ! تدعم قنوات المعالجة توليد النصوص مدخلات الدردشة ، مما يُسهّل استخدام نماذج الدردشة . 
في الماضي ، كنا نستخدم فئة "ConversationalPipeline" المُخصّصة ، ولكن تم الآن إيقافها وتم دمج وظائفها في [`TextGenerationPipeline`]. دعونا نجرّب مثال Zephyr مرة أخرى ، ولكن هذه المرة باستخدام قناة معالجة: + +```python +from transformers import pipeline + +pipe = pipeline("text-generation", "HuggingFaceH4/zephyr-7b-beta") +messages = [ + { + "role": "system", + "content": "You are a friendly chatbot who always responds in the style of a pirate", + }, + {"role": "user", "content": "How many helicopters can a human eat in one sitting?"}, +] +print(pipe(messages, max_new_tokens=128)[0]['generated_text'][-1]) # طباعة استجابة المساعد +``` + +```text +{'role': 'assistant', 'content': "Matey, I'm afraid I must inform ye that humans cannot eat helicopters. Helicopters are not food, they are flying machines. Food is meant to be eaten, like a hearty plate o' grog, a savory bowl o' stew, or a delicious loaf o' bread. But helicopters, they be for transportin' and movin' around, not for eatin'. So, I'd say none, me hearties. None at all."} +``` + +ستُراعي قناة المعالجة جميع تفاصيل تقسيم النص إلى رموز واستدعاء apply_chat_template نيابةً عنك - بمجرد أن يصبح لِدى النموذج قالب دردشة ، فكل ما تحتاج إلى القيام به هو تهيئة قناة معالجة وتمرير قائمة الرسائل إليها! + +## ما هي "مطالبات التوليد"؟ + +قد تلاحظ أن طريقة `apply_chat_template` لها معامل `add_generation_prompt`. يخبر هذا المعامل القالب بإضافة رموز تشير إلى بداية رد البوت. 
على سبيل المثال، ضع في اعتبارك الدردشة التالية: + +```python +messages = [ + {"role": "user", "content": "Hi there!"}, + {"role": "assistant", "content": "Nice to meet you!"}, + {"role": "user", "content": "Can I ask a question?"} +] +``` + +إليك كيف سيبدو ذلك بدون موجه توليد نصوص ، بالنسبة لنموذج يستخدم تنسيق "ChatML" القياسي : + +```python +tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False) +"""<|im_start|>user +Hi there!<|im_end|> +<|im_start|>assistant +Nice to meet you!<|im_end|> +<|im_start|>user +Can I ask a question?<|im_end|> +""" +``` + +وهكذا يبدو الأمر **مع** مطالبة التوليد: + +```python +tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) +"""<|im_start|>user +Hi there!<|im_end|> +<|im_start|>assistant +Nice to meet you!<|im_end|> +<|im_start|>user +Can I ask a question?<|im_end|> +<|im_start|>assistant +""" +``` + +لاحظ أننا أضفنا هذه المرة الرموز التي تشير إلى بداية رد البوت. يضمن هذا أنه عندما يُولّد النموذج نصًا فسيكتب رد البوت بدلاً من القيام بشيء غير متوقع، مثل الاستمرار في رسالة المستخدم. تذكر، أن نماذج الدردشة لا تزال مجرد نماذج للغة - فهي مدربة على متابعة النصوص، والدردشة هي مجرد نوع خاص من النصوص بالنسبة لها! يجب توجيهها برموز تحكم مناسبة، حتى تعرف ما الذي يجب عليها فعله. + +لا تتطلب جميع النماذج الرموز التحكمية لتوليد نصوص . بعض النماذج ، مثل LLaMA ، ليس لديها أي رموز خاصة قبل ردود البوت . في هذه الحالات ، لن يكون لمعامل `add_generation_prompt` أي تأثير. يعتمد التأثير الدقيق الذي تُحدثه `add_generation_prompt` على القالب المستخدم . + +## ما وظيفة "continue_final_message"؟ + +عند تمرير قائمة من الرسائل إلى `apply_chat_template` أو `TextGenerationPipeline` ، يمكنك اختيار تنسيق المحادثة بحيث يواصل النموذج الرسالة الأخيرة في المحادثة بدلاً من بدء رسالة جديدة. يتم ذلك عن طريق إزالة أي رموز نهاية التسلسل التي تشير إلى نهاية الرسالة الأخيرة ، بحيث يقوم النموذج ببساطة بتمديد الرسالة الأخيرة عندما يبدأ في توليد النص . يُعد هذا أمرًا مفيدًا "لِمَلء بداية" رد النموذج مُسبقًا. 
+ +وهنا مثال: +```python +chat = [ + {"role": "user", "content": "Can you format the answer in JSON?"}, + {"role": "assistant", "content": '{"name": "'}, +] + +formatted_chat = tokenizer.apply_chat_template(chat, tokenize=True, return_dict=True, continue_final_message=True) +model.generate(**formatted_chat) +``` +سيقوم النموذج بتوليد نص يكمل سلسلة JSON ، بدلاً من بدء رسالة جديدة . يمكن أن يكون هذا النهج مفيدًا جدًا لتحسين دقة اتباع النموذج للإرشادات عندما تعرف كيف تريد أن تبدأ ردوده . + + +نظرًا لأن `add_generation_prompt` تضيف الرموز التي تبدأ رسالة جديدة ، و `continue_final_message` تزيل أي رموز نهاية الرسالة من الرسالة الأخيرة ، فليس من المنطقي استخدامهما معًا . ونتيجة لذلك ، ستتلقّى خطأً إذا حاولت ذلك ! + +السلوك الافتراضي لِـ `TextGenerationPipeline` هو تعيين `add_generation_prompt=True` بحيث تبدأ رسالة جديدة . ومع ذلك ، إذا كانت الرسالة الأخيرة في المحادثة التي تم إدخالها لديها دور "assistant" ، فسوف تفترض أن هذه الرسالة هي "مَلء بداية" وتتحوّل إلى `continue_final_message=True` بدلاً من ذلك ، لأن مُعظم النماذج لا تدعم عدة رسائل متتالية للمساعد . يمكنك تجاوز هذا السلوك عن طريق تمرير معامل `continue_final_message` بشكل صريح عند استدعاء قناة المعالجة . + + + +## هل يمكنني استخدام قوالب الدردشة في التدريب؟ + +نعم ! تُعد هذه طريقة جيدة للتأكد من أن قالب الدردشة يتطابق مع الرموز التي يراها النموذج أثناء التدريب . نوصي بتطبيق قالب الدردشة كخطوة معالجة أولية لمجموعة بياناتك . بعد ذلك ، يمكنك ببساطة متابعة عملية التدريب كما هو الحال مع أي مهمة تدريب نماذج لغات أخرى . عند التدريب ، يجب أن تُعيّن عادةً `add_generation_prompt=False` ، لأنه لن تكون الرموز المُضافة لتحفيز رد المساعد مفيدة أثناء التدريب . 
دعونا نرى مثالاً : + +```python +from transformers import AutoTokenizer +from datasets import Dataset + +tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta") + +chat1 = [ + {"role": "user", "content": "Which is bigger, the moon or the sun?"}, + {"role": "assistant", "content": "The sun."} +] +chat2 = [ + {"role": "user", "content": "Which is bigger, a virus or a bacterium?"}, + {"role": "assistant", "content": "A bacterium."} +] + +dataset = Dataset.from_dict({"chat": [chat1, chat2]}) +dataset = dataset.map(lambda x: {"formatted_chat": tokenizer.apply_chat_template(x["chat"], tokenize=False, add_generation_prompt=False)}) +print(dataset['formatted_chat'][0]) +``` +ونحصل على: + +```text +<|user|> +Which is bigger, the moon or the sun?</s> +<|assistant|> +The sun.</s> +``` + +من هنا، استمر في التدريب كما تفعل مع مهمة نمذجة اللغة القياسية، باستخدام عمود `formatted_chat`. + + +بشكل افتراضي ، تضيف بعض *tokenizers* رموزًا خاصة مثل `<bos>` و `<eos>` إلى النص الذي تقوم بتقسيمه إلى رموز. يجب أن تتضمن قوالب المحادثة بالفعل جميع الرموز الخاصة التي تحتاجها ، وبالتالي فإن الرموز الخاصة الإضافية ستكون غالبًا غير صحيحة أو مُكررة ، مما سيؤثر سلبًا على أداء النموذج . + +لذلك ، إذا قمت بتنسيق النص باستخدام `apply_chat_template(tokenize=False)` ، فيجب تعيين المعامل `add_special_tokens=False` عندما تقوم بتقسيم ذلك النص إلى رموز لاحقًا . إذا كنت تستخدم `apply_chat_template(tokenize=True)` ، فلن تحتاج إلى القلق بشأن ذلك ! + + +## متقدّم: مدخلات إضافية لِقوالب الدردشة + + +المعامل الوحيد الذي تتطلبه طريقة `apply_chat_template` هو `messages`. ومع ذلك، يمكنك تمرير أي معامل ككلمة مفتاحية إلى `apply_chat_template` وسيكون متاحًا داخل القالب. يمنحك هذا الكثير من المرونة لاستخدام قوالب الدردشة للعديد من الأشياء. لا توجد قيود على أسماء هذه المعامﻻت أو تنسيقاتها - يمكنك تمرير سلاسل نصية أو قوائم أو قواميس أو أي شيء آخر تريده. + +ومع ذلك، هناك بعض الحالات الشائعة لاستخدام هذه المعامﻻت الإضافية، مثل تمرير أدوات لاستدعاء الوظائف، أو المستندات لإنشاء النصوص المُعزّزة بالاسترجاع. 
في هذه الحالات الشائعة، لدينا بعض التوصيات المُحدّدة حول أسماء هذه المعامﻻت وتنسيقاتها، والتي يتم وصفها في الأقسام التالية. نشجع مطوّري النماذج على جعل قوالب الدردشة الخاصة بهم متوافقة مع هذا التنسيق، لتسهيل نقل التعليمات البرمجية لاستدعاء الأدوات بين النماذج. + +## متقدم: استخدام الأداة / استدعاء الدالة + +يمكن لنماذج "استخدام الأداة" اختيار استدعاء الدوال كأدوات خارجية قبل توليد الإجابة. عند تمرير الأدوات إلى نموذج استخدام الأدوات، يمكنك ببساطة تمرير قائمة من الوظائف إلى معامل `tools`: + +```python +from datetime import datetime + +def current_time(): + """Get the current local time as a string.""" + return str(datetime.now()) + +def multiply(a: float, b: float): + """ + A function that multiplies two numbers + + Args: + a: The first number to multiply + b: The second number to multiply + """ + return a * b + +tools = [current_time, multiply] + +model_input = tokenizer.apply_chat_template( + messages, + tools=tools +) +``` + +لكي يعمل هذا بشكل صحيح، يجب عليك كتابة وظائفك بالتنسيق السابق، حتى يمكن تحليلها بشكل صحيح كأدوات. على وجه التحديد، يجب عليك اتباع هذه القواعد: + +- يجب أن يكون للدالة اسم وصفي. +- يجب أن يكون لكل معامل نوع للتلميح. +- يجب أن تحتوي الدالة على سلسلة مستندية بتنسيق Google القياسي (بمعنى وصف الدالة الأولي متبوعًا بكتلة `Args:` التي تصف المعامﻻت، ما لم تكن الدالة لا تحتوي على أي معامﻻت). +- لا تقم بتضمين الأنواع في كتلة `Args:` . بعبارة أخرى، اكتب `a: The first number to multiply`، وليس `a (int): The first number to multiply`. يجب أن تذهب تلميحات الأنواع في رأس الدالة بدلاً من ذلك. +- يمكن أن يكون للدالة نوع للإرجاع ومربع `Returns:` في السلسلة. ومع ذلك، فهذه اختيارية لأن معظم نماذج استخدام الأدوات تتجاهلها. + +### تمرير نتائج الأداة إلى النموذج + +يكفي الكود السابق لسرد الأدوات المتاحة لنموذجك، ولكن ماذا يحدث إذا أراد النموذج استخدام واحدة منها؟ إذا حدث ذلك، فيجب عليك: + +1. تحليل مخرجات النموذج للحصول على اسم (أسماء) الأدوات ومعامﻻتها. +2. أضف استدعاء (استدعاءات) النموذج لِلأدوات إلى المحادثة. +3. استدعاء الدالة (الدالات) المقابلة بتلك المعامﻻت. +4. 
أضف النتيجة (النتائج) إلى المحادثة + +### مثال كامل على استخدام الأداة + + +سنستعرض مثالاً على استخدام الأدوات خطوة بخطوة . في هذا المثال ، سنستخدم نموذج `Hermes-2-Pro` بحجم 8 مليارات معامل ، نظرًا لأنه أحد أعلى نماذج استخدام الأدوات أداءً في فئة حجمه وقت كتابة هذا النص . إذا كان لديك الذاكرة الكافية ، فيمكنك النظر في استخدام نموذج أكبر بدلاً من ذلك مثل `Command-R` أو `Mixtral-8x22B` ، وكلاهما يدعم استخدام الأدوات ويوفر أداءً أقوى . + + +أولاً ، لنقم بتحميل نموذجنا و tokenizer الخاص بنا: + +```python +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +checkpoint = "NousResearch/Hermes-2-Pro-Llama-3-8B" + +tokenizer = AutoTokenizer.from_pretrained(checkpoint) +model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.bfloat16, device_map="auto") +``` +```python +messages = [ + {"role": "system", "content": "You are a bot that responds to weather queries. You should reply with the unit used in the queried location."}, + {"role": "user", "content": "Hey, what's the temperature in Paris right now?"} +] +``` + +الآن، لنطبق قالب الدردشة ونولّد ردًا: + +```python +inputs = tokenizer.apply_chat_template(messages, chat_template="tool_use", tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt") +inputs = {k: v.to(model.device) for k, v in inputs.items()} +out = model.generate(**inputs, max_new_tokens=128) +print(tokenizer.decode(out[0][len(inputs["input_ids"][0]):])) +``` + +ونحصل على: + +```text +<tool_call> +{"arguments": {"location": "Paris, France", "unit": "celsius"}, "name": "get_current_temperature"} +</tool_call><|im_end|> +``` + +لقد قام النموذج باستدعاء الدالة مع معامﻻت صحيحة، بالصيغة التي طلبها توثيق الدالة. لقد استنتج أننا نشير على الأرجح إلى باريس في فرنسا، وتذكر أنه بكونها موطن وحدات القياس الدولية، يجب عرض درجة الحرارة في فرنسا بالدرجة المئوية. + +دعنا نضيف استدعاء الأداة الخاص بالنموذج إلى المحادثة. لاحظ أننا نولد معرف استدعاء أداة عشوائيًا هنا. 
لا تستخدم جميع النماذج هذه المعرفات، ولكنها تسمح للنماذج بإصدار عدة استدعاءات للأدوات في نفس الوقت وتتبع الاستجابة المقابلة لكل استدعاء. يمكنك توليد هذه المعرفات بأي طريقة تريدها، ولكن يجب أن تكون فريدة داخل كل محادثة. + +```python +tool_call_id = "vAHdf3" # Random ID, should be unique for each tool call +tool_call = {"name": "get_current_temperature", "arguments": {"location": "Paris, France", "unit": "celsius"}} +messages.append({"role": "assistant", "tool_calls": [{"id": tool_call_id, "type": "function", "function": tool_call}]}) +``` + +الآن بعد أن أضفنا استدعاء الأداة إلى المحادثة، يمكننا استدعاء الدالة وإضافة النتيجة إلى المحادثة. نظرًا لأننا نستخدم دالة وهمية لهذا المثال والتي تعيد دائمًا 22.0، فيمكننا ببساطة إضافة تلك النتيجة مباشرةً. لاحظ معرف استدعاء الأداة - يجب أن يتطابق مع المعرف المستخدم في استدعاء الأداة أعلاه. + +```python +messages.append({"role": "tool", "tool_call_id": tool_call_id, "name": "get_current_temperature", "content": "22.0"}) +``` + +أخيرًا، دعنا نجعل المساعد يقرأ مخرجات الدالة ويكمل الدردشة مع المستخدم: + +```python +inputs = tokenizer.apply_chat_template(messages, chat_template="tool_use", tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt") +inputs = {k: v.to(model.device) for k, v in inputs.items()} +out = model.generate(**inputs, max_new_tokens=128) +print(tokenizer.decode(out[0][len(inputs["input_ids"][0]):])) +``` + +ونحصل على: + +```text +The current temperature in Paris, France is 22.0 ° Celsius.<|im_end|> +``` + + +لا تستخدم جميع نماذج استخدام الأدوات جميع ميزات استدعاء الأدوات الموضحة أعلاه. يستخدم البعض معرفات استدعاء الأدوات، بينما يستخدم البعض الآخر ببساطة اسم الدالة ويقارن استدعاءات الأدوات بالنتائج باستخدام الترتيب، وهناك عدة نماذج لا تستخدم أيًا منهما ولا تصدر سوى استدعاء أداة واحد في كل مرة لتجنب الارتباك. إذا كنت تريد أن يكون رمزك متوافقًا مع أكبر عدد ممكن من النماذج، فإننا نوصي بهيكلة استدعاءات الأدوات الخاصة بك كما هو موضح هنا، وإعادة نتائج الأدوات بالترتيب الذي أصدرها النموذج. 
يجب أن تتعامل قوالب الدردشة على كل نموذج مع الباقي. + + +### فهم مخططات الأدوات + +يتم تحويل كل دالة تقوم بتمريرها إلى معامل `tools` في دالة `apply_chat_template` إلى [مخطط JSON](https://json-schema.org/learn/getting-started-step-by-step). يتم بعد ذلك تمرير هذه المخططات إلى قالب الدردشة النموذج. وبعبارة أخرى، فإن نماذج استخدام الأدوات لا ترى دوالك مباشرة، ولا ترى مطلقًا الكود الموجود بداخلها. ما يهمها هو**تعريفات** الدوال و**المعامﻻت** التي تحتاج إلى تمريرها إليها - فهي تهتم بما تفعله الأدوات وكيفية استخدامها، وليس بكيفية عملها! يقع على عاتقك قراءة مخرجاتها، والكشف عما إذا كانت قد طلبت استخدام أداة، وتمرير المعامﻻت إلى دالة الأداة، وإرجاع الرد في الدردشة. + +يجب أن يكون إنشاء مخططات JSON لتمريرها إلى القالب تلقائيًا وغير مرئي طالما أن دوالك تتبع المواصفات الموضحة أعلاه، ولكن إذا واجهت مشكلات، أو إذا كنت تريد ببساطة مزيدًا من التحكم في التحويل، فيمكنك التعامل مع التحويل يدويًا. فيما يلي مثال على تحويل مخطط يدوي: + +```python +from transformers.utils import get_json_schema + +def multiply(a: float, b: float): + """ + A function that multiplies two numbers + + Args: + a: The first number to multiply + b: The second number to multiply + """ + return a * b + +schema = get_json_schema(multiply) +print(schema) +``` + +سيؤدي هذا إلى ما يلي: + +```json +{ + "type": "function", + "function": { + "name": "multiply", + "description": "A function that multiplies two numbers", + "parameters": { + "type": "object", + "properties": { + "a": { + "type": "number", + "description": "The first number to multiply" + }, + "b": { + "type": "number", + "description": "The second number to multiply" + } + }, + "required": ["a", "b"] + } + } +} +``` + +إذا كنت ترغب في ذلك، يمكنك تحرير هذه المخططات، أو حتى كتابتها من البداية بنفسك دون استخدام `get_json_schema` على الإطلاق. يمكن تمرير مخططات JSON مباشرةً إلى معامل `tools` في `apply_chat_template` - يمنحك هذا الكثير من القوة لتعريف مخططات دقيقة لوظائف أكثر تعقيدًا. 
ولكن كن حذرًا - كلما زاد تعقيد مخططاتك، زاد احتمال ارتباك النموذج عند التعامل معها! نوصي بتوقيعات دوال بسيطة حيثما أمكن، مع تقليل المعامﻻت (وخاصة المعامﻻت المعقدة والمتداخلة) إلى الحد الأدنى. + +فيما يلي مثال على تعريف المخططات يدويًا، وتمريرها مباشرةً إلى `apply_chat_template`: + +```python +# A simple function that takes no arguments +current_time = { + "type": "function", + "function": { + "name": "current_time", + "description": "Get the current local time as a string.", + "parameters": { + 'type': 'object', + 'properties': {} + } + } +} + +# A more complete function that takes two numerical arguments +multiply = { + 'type': 'function', + 'function': { + 'name': 'multiply', + 'description': 'A function that multiplies two numbers', + 'parameters': { + 'type': 'object', + 'properties': { + 'a': { + 'type': 'number', + 'description': 'The first number to multiply' + }, + 'b': { + 'type': 'number', 'description': 'The second number to multiply' + } + }, + 'required': ['a', 'b'] + } + } +} + +model_input = tokenizer.apply_chat_template( + messages, + tools = [current_time, multiply] +) +``` + +## متقدم: توليد قائم على الاسترجاع +يمكن لنماذج اللغة الكبيرة من نوع "توليد قائم على الاسترجاع" أو "RAG" البحث في مجموعة نصوص عن معلومات قبل الرد على الاستعلام. يسمح هذا للنماذج بتوسيع قاعدة معارفها بشكل كبير إلى ما هو أبعد من حجم سياقها المحدود. توصيتنا لنماذج RAG هي أن يقبل قالبها وسيطة `documents`. يجب أن تكون هذه قائمة من المستندات، حيث يكون كل "مستند" عبارة عن قاموس واحد بمفاتيح `title` و `contents`، وكلاهما سلاسل نصية. نظرًا لأن هذا التنسيق أبسط بكثير من مخططات JSON المستخدمة للأدوات، فلا توجد حاجة إلى دوال مساعدة. 
+ +فيما يلي مثال على قالب RAG بالفعل: + +```python +from transformers import AutoTokenizer, AutoModelForCausalLM + +# تحميل النموذج والمجزىء اللغوي +model_id = "CohereForAI/c4ai-command-r-v01-4bit" +tokenizer = AutoTokenizer.from_pretrained(model_id) +model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto") +device = model.device # الحصول على الجهاز الذي تم تحميل النموذج عليه + +# تعريف مُدخلات المحادثة +conversation = [ + {"role": "user", "content": "What has Man always dreamed of?"} +] + +# تعريف المستندات لتوليد قائم على الاسترجاع +documents = [ + { + "title": "The Moon: Our Age-Old Foe", + "text": "Man has always dreamed of destroying the moon. In this essay, I shall..." + }, + { + "title": "The Sun: Our Age-Old Friend", + "text": "Although often underappreciated, the sun provides several notable benefits..." + } +] +# معالجة المحادثة والمستندات باستخدام قالب RAG، وإرجاع موترات PyTorch. +input_ids = tokenizer.apply_chat_template( + conversation=conversation, + documents=documents, + chat_template="rag", + tokenize=True, + add_generation_prompt=True, + return_tensors="pt").to(device) + +# توليد الرد +gen_tokens = model.generate( + input_ids, + max_new_tokens=100, + do_sample=True, + temperature=0.3, + ) + +# فك تشفير النص المُوَلّد وطباعته +gen_text = tokenizer.decode(gen_tokens[0]) +print(gen_text) +``` +إن مُدخل documents للتوليد القائم على الاسترجاع غير مدعوم على نطاق واسع، والعديد من النماذج لديها قوالب دردشة تتجاهل هذا المُدخل ببساطة. + +للتحقق مما إذا كان النموذج يدعم مُدخل `documents`، يمكنك قراءة بطاقة النموذج الخاصة به، أو `print(tokenizer.chat_template)` لمعرفة ما إذا كان مفتاح `documents` مستخدمًا في أي مكان. + +ومع ذلك، فإن إحدى فئات النماذج التي تدعمه هي [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-08-2024) و [Command-R+](https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024) من Cohere، من خلال قالب الدردشة rag الخاص بهم. يمكنك رؤية أمثلة إضافية على التوليد باستخدام هذه الميزة في بطاقات النموذج الخاصة بهم. 
+ + +## متقدم: كيف تعمل قوالب الدردشة؟ +يتم تخزين قالب الدردشة للنموذج في الخاصية `tokenizer.chat_template`. إذا لم يتم تعيين قالب دردشة، فسيتم استخدام القالب الافتراضي لفئة النموذج هذه بدلاً من ذلك. دعونا نلقي نظرة على قالب دردشة `Zephyr`، ولكن لاحظ أن هذا القالب مُبسّط قليلاً عن القالب الفعلي! + +``` +{%- for message in messages %} + {{- '<|' + message['role'] + '|>\n' }} + {{- message['content'] + eos_token }} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|assistant|>\n' }} +{%- endif %} +``` +إذا لم تكن قد رأيت أحد هذه القوالب من قبل، فهذا [قالب Jinja](https://jinja.palletsprojects.com/en/3.1.x/templates/). Jinja هي لغة قوالب تسمح لك بكتابة تعليمات برمجية بسيطة تُوَلّد نصًا. من نواحٍ عديدة، يُشبه الرمز والتركيب للغة Python. أما في لغة Python، سيبدو هذا القالب كما يلي: + +```python +for message in messages: + print(f'<|{message["role"]}|>') + print(message['content'] + eos_token) +if add_generation_prompt: + print('<|assistant|>') +``` +يقوم القالب بثلاثة أشياء بشكل فعال: + +- لكل رسالة، يطبع الدور مُحاطًا بـ `<|` و `|>`، مثل `<|user|>` أو `<|assistant|>`. +- بعد ذلك، يطبع محتوى الرسالة، متبوعًا برمز نهاية التسلسل `eos_token` . +- أخيرًا، إذا تم تعيين `add_generation_prompt` ، يطبع الرمز المساعد، حتى يعرف النموذج أنه يجب أن يبدأ في توليد استجابة المساعد. + +هذا قالب بسيط جدًا، لكن Jinja تمنحك الكثير من المرونة للقيام بأشياء أكثر تعقيدًا! دعونا نرى قالب Jinja يُمكنه تنسيق المُدخلات بطريقة تُشبه الطريقة التي تُنسّق بها LLaMA مُدخلاتها (لاحظ أن قالب LLaMA الحقيقي يتضمن معالجة لرسائل النظام الافتراضية ومعالجة رسائل النظام بشكل مختلف قليلاً بشكل عام - لا تستخدم هذا القالب في التعليمات البرمجية الفعلية الخاصة بك!) 
+``` +{%- for message in messages %} + {%- if message['role'] == 'user' %} + {{- bos_token + '[INST] ' + message['content'] + ' [/INST]' }} + {%- elif message['role'] == 'system' %} + {{- '<<SYS>>\\n' + message['content'] + '\\n<</SYS>>\\n\\n' }} + {%- elif message['role'] == 'assistant' %} + {{- ' ' + message['content'] + ' ' + eos_token }} + {%- endif %} +{%- endfor %} +``` +نأمل أنه إذا حدقت في هذا لفترة قصيرة، يمكنك أن ترى ما يفعله هذا القالب - فهو يُضيف رموزًا مُحددة مثل `[INST]` و `[/INST]` بناءً على دور كل رسالة. يمكن تمييز رسائل المستخدم والمساعد والنظام بوضوح للنموذج بسبب الرموز التي تُحيط بها. + +## متقدم: إضافة وتعديل قوالب الدردشة + +### كيف أنشئ قالب دردشة؟ +ببساطة، اكتب قالب Jinja واضبط `tokenizer.chat_template`. قد تجد أنه من الأسهل البدء بقالب موجود من نموذج آخر وتحريره ببساطة ليناسب احتياجاتك! على سبيل المثال، يمكننا أن نأخذ قالب LLaMA أعلاه ونضيف `[ASST]` و `[/ASST]` إلى رسائل المساعد: + +``` +{%- for message in messages %} + {%- if message['role'] == 'user' %} + {{- bos_token + '[INST] ' + message['content'].strip() + ' [/INST]' }} + {%- elif message['role'] == 'system' %} + {{- '<<SYS>>\\n' + message['content'].strip() + '\\n<</SYS>>\\n\\n' }} + {%- elif message['role'] == 'assistant' %} + {{- '[ASST] ' + message['content'] + ' [/ASST]' + eos_token }} + {%- endif %} +{%- endfor %} +``` + +الآن، اضبط ببساطة الخاصية `tokenizer.chat_template`. في المرة القادمة التي تستخدم فيها [`~PreTrainedTokenizer.apply_chat_template`] ، سيستخدم القالب الجديد الخاص بك! سيتم حفظ هذه الخاصية في ملف `tokenizer_config.json`، حتى تتمكن من استخدام [`~utils.PushToHubMixin.push_to_hub`] لتحميل قالبك الجديد إلى Hub والتأكد من أن الجميع يستخدم القالب الصحيح لنموذجك! + +```python +template = tokenizer.chat_template +template = template.replace("SYS", "SYSTEM") # تغيير رمز النظام +tokenizer.chat_template = template # تعيين القالب الجديد +tokenizer.push_to_hub("model_name") # تحميل القالب الجديد إلى Hub! 
+``` + +يتم استدعاء الدالة [`~PreTrainedTokenizer.apply_chat_template`] الذي نستخدم قالب الدردشة الخاص بك بواسطة فئة [`TextGenerationPipeline`] لذلك بمجرد تعيين قالب الدردشة الصحيح، سيصبح نموذجك متوافقًا تلقائيًا مع [`TextGenerationPipeline`]. + + +إذا كنت تُجري ضبطًا دقيقًا لنموذج للدردشة، بالإضافة إلى تعيين قالب دردشة، فربما يجب عليك إضافة أي رموز تحكم دردشة جديدة كرموز خاصة في المجزىء اللغوي. لا يتم تقسيم الرموز الخاصة أبدًا، مما يضمن معالجة رموز التحكم الخاصة بك دائمًا كرموز فردية بدلاً من تجزئتها إلى أجزاء. يجب عليك أيضًا تعيين خاصية `eos_token` للمجزىء اللغوي إلى الرمز الذي يُشير إلى نهاية توليدات المساعد في قالبك. سيضمن هذا أن أدوات توليد النصوص يمكنها تحديد وقت إيقاف توليد النص بشكل صحيح. + + +### لماذا تحتوي بعض النماذج على قوالب متعددة؟ +تستخدم بعض النماذج قوالب مختلفة لحالات استخدام مختلفة. على سبيل المثال، قد تستخدم قالبًا واحدًا للدردشة العادية وآخر لاستخدام الأدوات، أو التوليد القائم على الاسترجاع. في هذه الحالات، تكون `tokenizer.chat_template` قاموسًا. يمكن أن يتسبب هذا في بعض الارتباك، وحيثما أمكن، نوصي باستخدام قالب واحد لجميع حالات الاستخدام. يمكنك استخدام عبارات Jinja مثل `if tools is defined` وتعريفات `{% macro %}` لتضمين مسارات تعليمات برمجية متعددة بسهولة في قالب واحد. + +عندما يحتوي المعالج اللغوي على قوالب متعددة، ستكون `tokenizer.chat_template dict`، حيث يكون كل مفتاح هو اسم قالب. يحتوي أسلوب `apply_chat_template` على معالجة خاصة لأسماء قوالب مُعينة: على وجه التحديد، سيبحث عن قالب باسم `default` في معظم الحالات، وسيُثير خطأً إذا لم يتمكن من العثور على واحد. ومع ذلك، إذا كان هناك قالب باسم `tool_use` عندما قام المستخدم بتمرير وسيطة `tools`، فسيستخدم هذا القالب بدلاً من ذلك. للوصول إلى قوالب بأسماء أخرى، مرر اسم القالب الذي تُريده إلى وسيطة `chat_template` لـ `apply_chat_template()`. + +نجد أن هذا قد يكون مُربكًا بعض الشيء للمستخدمين - لذلك إذا كنت تكتب قالبًا بنفسك، فننصحك بمحاولة وضعه كله في قالب واحد حيثما أمكن! 
+ +## ما القالب الذي يجب أن أستخدمه؟ + +عند تعيين قالب لنموذج تم تدريبه بالفعل على الدردشة، يجب التأكد من أن القالب يتطابق تمامًا مع تنسيق الرسالة الذي شاهده النموذج أثناء التدريب، وإلا فمن المحتمل أن تواجه تدهورًا في الأداء. هذا صحيح حتى إذا كنت تدرب النموذج بشكل إضافي - فمن المحتمل أن تحصل على أفضل أداء إذا قمت بإبقاء رموز الدردشة ثابتة. يُشبه هذا إلى حد كبير عملية التجزئة - فأنت تحصل بشكل عام على أفضل أداء للاستدلال أو الضبط الدقيق عندما تتطابق بدقة مع التجزئة المستخدمة أثناء التدريب. + +من ناحية أخرى، إذا كنت تُدرّب نموذجًا من البداية، أو تقوم بضبط دقيق لنموذج لغة أساسي للدردشة، لديك حرية اختيار قالب مناسب! تتمتع LLMs بالذكاء الكافي للتعامل مع العديد من تنسيقات الإدخال المختلفة. أحد الخيارات الشائعة هو تنسيق "ChatML"، وهو خيار جيد ومرن للعديد من حالات الاستخدام. يبدو كالتالي: + +``` +{%- for message in messages %} + {{- '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }} +{%- endfor %} +``` + +إذا أعجبك هذا، فإليك نسخة جاهزة لوضعها في كودك. يتضمن الخط المفرد أيضًا دعمًا مفيدًا [لإرشادات التوليد](#what-are-generation-prompts)، ولكن لاحظ أنه لا يضيف رموز BOS أو EOS! إذا كان نموذجك يتوقع هذه الرموز، فلن يتم إضافتها تلقائيًا بواسطة "apply_chat_template" - بمعنى آخر، سيتم تجزئة النص باستخدام "add_special_tokens=False". هذا لتجنب التعارضات المحتملة بين القالب ومنطق "add_special_tokens". إذا كان نموذجك يتوقع رموزًا خاصة، فتأكد من إضافتها إلى القالب! + +```python +tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" +``` + +يُحيط هذا القالب كل رسالة بين الرمزين "<|im_start|>" و "<|im_end|>"، ويكتب ببساطة الدور كسلسلة نصية، مما يسمح بالمرونة في الأدوار التي تتدرب عليها. 
يبدو الناتج كما يلي: + +```text +<|im_start|>system +You are a helpful chatbot that will do its best not to say anything so stupid that people tweet about it.<|im_end|> +<|im_start|>user +How are you?<|im_end|> +<|im_start|>assistant +I'm doing great!<|im_end|> +``` + +تعد أدوار "user" و "system" و "assistant" هي الأدوار القياسية للدردشة، ونوصي باستخدامها عندما يكون ذلك منطقيًا، خاصة إذا كنت تريد أن يعمل نموذجك بشكل جيد مع [`TextGenerationPipeline`]. ومع ذلك، فأنت لست مقيدًا بهذه الأدوار - فإن القوالب مرنة للغاية، ويمكن أن تكون أي سلسلة نصية دورًا. + + +## أريد إضافة بعض قوالب الدردشة! كيف أبدأ؟ + +إذا كان لديك أي نماذج دردشة، فيجب عليك تعيين الخاصية "tokenizer.chat_template" الخاصة بها واختبارها باستخدام [`~PreTrainedTokenizer.apply_chat_template`]، ثم رفع المجزىء اللغوي المُحدّث إلى Hub. ينطبق هذا حتى إذا لم تكن مالك النموذج - إذا كنت تستخدم نموذجًا بقالب دردشة فارغ، أو لا يزال يستخدم قالب الفئة الافتراضية، فيرجى فتح [طلب سحب](https://huggingface.co/docs/hub/repositories-pull-requests-discussions) إلى مستودع النموذج حتى يمكن تعيين الخاصية بشكل صحيح! + +بمجرد تعيين الخاصية، هذا كل شيء، لقد انتهيت! ستعمل "tokenizer.apply_chat_template" الآن بشكل صحيح لهذا النموذج، مما يعني أنها مدعومة أيضًا بشكل تلقائي في أماكن مثل "TextGenerationPipeline"! + +من خلال ضمان امتلاك النماذج لهذه الخاصية، يُمكننا التأكد من أن المجتمع بأكمله يستخدم القوة الكاملة للنماذج مفتوحة المصدر. لقد كانت عدم تطابق التنسيق تطارد المجال وأضرت الأداء بصمت لفترة طويلة جدًا - لقد حان الوقت لوضع حد لها! + +## متقدم: نصائح لكتابة القوالب + + +أسهل طريقة للبدء في كتابة قوالب Jinja هي إلقاء نظرة على بعض القوالب الموجودة. يمكنك استخدام `print(tokenizer.chat_template)` لأي نموذج دردشة لمعرفة القالب الذي يستخدمه. بشكل عام، تحتوي النماذج التي تدعم استخدام الأدوات على قوالب أكثر تعقيدًا بكثير من النماذج الأخرى - لذلك عندما تبدأ للتو، فمن المحتمل أنها مثال سيئ للتعلم منه! 
يمكنك أيضًا إلقاء نظرة على [وثائق Jinja](https://jinja.palletsprojects.com/en/3.1.x/templates/#synopsis) للحصول على تفاصيل حول تنسيق Jinja العام وتركيبه. + + + +تُطابق قوالب Jinja في `transformers` قوالب Jinja في أي مكان آخر. الشيء الرئيسي الذي يجب معرفته هو أن سجل الدردشة سيكون متاحًا داخل قالبك كمتغير يسمى `messages`. ستتمكن من الوصول إلى `messages` في قالبك تمامًا كما يمكنك في Python، مما يعني أنه يمكنك التكرار خلاله باستخدام `{% for message in messages %}` أو الوصول إلى رسائل فردية باستخدام `{{ messages[0] }}`، على سبيل المثال. + +يمكنك أيضًا استخدام النصائح التالية لكتابة قوالب Jinja نظيفة وفعالة: + +### إقتطاع المسافات الفارغة + +بشكل افتراضي، ستطبع Jinja أي مسافات فارغة تأتي قبل أو بعد كتلة. يمكن أن يكون هذا مشكلة لقوالب الدردشة، والتي تريد عادةً أن تكون دقيقة جدًا مع المسافات! لتجنب ذلك، نوصي بشدة بكتابة قوالبك على النحو التالي: + +``` +{%- for message in messages %} + {{- message['role'] + message['content'] }} +{%- endfor %} +``` + +بدلاً من ذلك: + +``` +{% for message in messages %} + {{ message['role'] + message['content'] }} +{% endfor %} +``` + +سيؤدي إضافة "-" إلى إزالة أي مسافات تأتي قبل الكتلة. يبدو المثال الثاني عادية، ولكن قد يتم تضمين السطر الجديد والمسافة البادئة في المخرجات، وهو على الأرجح ليس ما تُريده! + + +### المتغيرات الخاصة + + داخل قالبك، سيكون لديك حق الوصول إلى العديد من المتغيرات الخاصة. أهمها هو `messages`، والذي يحتوي على سجل الدردشة كقائمة من قواميس الرسائل. ومع ذلك، هناك العديد من المتغيرات الأخرى. لن يتم استخدام كل متغير في كل قالب. المتغيرات الأكثر شيوعًا هي: + +- `tools` تحتوي على قائمة بالأدوات بتنسيق مخطط JSON. ستكون `None` أو غير مُعرّفة إذا لم يتم تمرير أي أدوات. +- `documents` تحتوي على قائمة من المستندات بالتنسيق `{"title": "العنوان", "contents": "المحتويات"}`، تُستخدم للتوليد المُعزز بالاسترجاع. ستكون `None` أو غير مُعرّفة إذا لم يتم تمرير أي مستندات. +- `add_generation_prompt` هي قيمة منطقية تكون `True` إذا طلب المستخدم مُطالبة توليد، و `False` بخلاف ذلك. 
إذا تم تعيين هذا، فيجب أن يُضيف قالبك رأس رسالة مساعد إلى نهاية المحادثة. إذا لم يكن لدى نموذجك رأس مُحدد لرسائل المساعد، فيمكنك تجاهل هذا العلم. +- **الرموز الخاصة** مثل `bos_token` و `eos_token`. يتم استخراجها من `tokenizer.special_tokens_map`. ستختلف الرموز الدقيقة المتاحة داخل كل قالب اعتمادًا على المجزىء اللغوي الأصلي. + + + + +يمكنك في الواقع تمرير أي `kwarg` إلى `apply_chat_template`، وستكون متاحة داخل القالب كمتغير. بشكل عام، نوصي بمحاولة الالتزام بالمتغيرات الأساسية المذكورة أعلاه، لأن ذلك سيجعل نموذجك أكثر صعوبة في الاستخدام إذا كان على المستخدمين كتابة تعليمات برمجية مخصصة لتمرير `kwargs` خاصة بالنموذج. ومع ذلك، فنحن نُدرك أن هذا المجال يتحرك بسرعة، لذلك إذا كانت لديك حالة استخدام جديدة لا تتناسب مع واجهة برمجة التطبيقات الأساسية، فلا تتردد في استخدام `kwarg` معامل جديد لها! إذا أصبح `kwarg` المعامل الجديد شائعًا، فقد نقوم بترقيته إلى واجهة برمجة التطبيقات الأساسية وإنشاء وتوثيق الخاص به. + + + +### دوال قابلة للاستدعاء + +هناك أيضًا قائمة قصيرة من الدوال القابلة للاستدعاء المتاحة لك داخل قوالبك. هذه هي: + +- `raise_exception(msg)`: تُثير `TemplateException`. هذا مفيد لتصحيح الأخطاء، ولإخبار المستخدمين عندما يفعلون شيئًا لا يدعمه قالبك. +- `strftime_now(format_str)`: تُكافئ `datetime.now().strftime(format_str)` في Python. يُستخدم هذا للحصول على التاريخ/الوقت الحالي بتنسيق مُحدد، والذي يتم تضمينه أحيانًا في رسائل النظام. + +### التوافق مع Jinja غير Python + +هناك تطبيقات متعددة لـ Jinja بلغات مختلفة. عادة ما يكون لها نفس التركيب، ولكن الاختلاف الرئيسي هو أنه عند كتابة قالبًا في Python، يمكنك استخدام أساليب Python، مثل ".lower()" على السلاسل أو ".items()" على القواميس. سيؤدي هذا إلى كسر إذا حاول شخص ما استخدام قالبك في تنفيذ غير Python لـ Jinja. تعد التطبيقات غير Python شائعة بشكل خاص في بيئات النشر، حيث تعد JS و Rust شائعة جدًا. + +لا تقلق، على الرغم من ذلك! هناك بعض التغييرات البسيطة التي يمكنك إجراؤها على قوالبك لضمان توافقها عبر جميع تطبيقات Jinja: + +- استبدل أساليب Python بمرشحات Jinja. 
عادة ما يكون لها نفس الاسم، على سبيل المثال، يصبح "string.lower()" عبارة عن "string|lower"، ويصبح "dict.items()" عبارة عن "dict|items". أحد التغييرات الملحوظة هو أن "string.strip()" يصبح "string|trim". راجع [قائمة المرشحات المدمجة](https://jinja.palletsprojects.com/en/3.1.x/templates/#builtin-filters) في وثائق Jinja لمزيد من المعلومات. +- استبدل "True" و "False" و "None"، وهي خاصة بـ Python، بـ "true" و "false" و "none". +- قد يؤدي عرض قاموس أو قائمة مباشرة إلى نتائج مختلفة في التطبيقات الأخرى (على سبيل المثال، قد تتغير مدخﻻت السلسلة النصية من علامات اقتباس مفردة ' إلى علامات اقتباس مزدوجة "). يمكن أن يساعد إضافة "tojson" في ضمان الاتساق هنا. + +## كتابة مطالبات التوليد +لقد ذكرنا أعلاه أن add_generation_prompt هو متغير خاص يمكن الوصول إليه داخل قالبك، ويتحكم فيه المستخدم من خلال تعيين معامل add_generation_prompt. إذا كان نموذجك يتوقع عنوان لرسائل المساعد، فيجب أن يدعم قالبك إضافة العنوان عند تعيين add_generation_prompt. + +فيما يلي مثال على قالب يُنسّق الرسائل بأسلوب ChatML، مع دعم مُطالبة التوليد: + +```text +{{- bos_token }} +{%- for message in messages %} + {{- '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} +``` +سيعتمد المحتوى الدقيق لعنوان المساعد على نموذجك المُحدد، ولكن يجب أن يكون دائمًا السلسلة النصية التي تُمثل بداية رسالة المساعد، بحيث إذا قام المستخدم بتطبيق قالبك باستخدام add_generation_prompt=True ثم قام بتوليد نص، سيكتب النموذج استجابة المساعد. لاحظ أيضًا أن بعض النماذج لا تحتاج إلى مُطالبة توليد، لأن رسائل المساعد تبدأ دائمًا فورًا بعد رسائل المستخدم. هذا شائع بشكل خاص لنماذج LLaMA و Mistral، حيث تبدأ رسائل المساعد فورًا بعد رمز [/INST] الذي ينهي رسائل المستخدم. في هذه الحالات، يمكن للقالب تجاهل معامل add_generation_prompt. + +مُطالبات التوليد مُهمة! 
إذا كان نموذجك يتطلب مُطالبة توليد ولكنها غير مُعيّنة في القالب، فمن المُحتمل أن تتدهور عمليات توليد النموذج بشدة، أو قد يُظهر النموذج سلوكًا غير عادي مثل متابعة رسالة المستخدم الأخيرة! + +### كتابة قوالب أكبر وتصحيحها +عندما تم تقديم هذه الميزة، كانت معظم القوالب صغيرة جدًا، أي ما يُعادل نص برمجي "من سطر واحد" في Jinja. ومع ذلك، مع النماذج والميزات الجديدة مثل استخدام الأدوات و RAG، يمكن أن يصل طول بعض القوالب إلى 100 سطر أو أكثر. عند كتابة قوالب كهذه، من الجيد كتابتها في ملف مُنفصل، باستخدام مُحرر نصوص. يمكنك بسهولة استخراج قالب دردشة إلى ملف: + +```python +open("template.jinja", "w").write(tokenizer.chat_template) +``` +أو تحميل القالب المُحرر مرة أخرى إلى المعالج اللغوي: + +```python +tokenizer.chat_template = open("template.jinja").read() +``` +كميزة إضافية، عندما تكتب قالبًا طويلاً متعدد الأسطر في ملف مُنفصل، ستتوافق أرقام الأسطر في هذا الملف تمامًا مع أرقام الأسطر في أخطاء تحليل القالب أو تنفيذه. سيُسهّل هذا كثيرًا تحديد مكان المشكلات. + +### كتابة قوالب للأدوات +على الرغم من أن قوالب الدردشة لا تفرض واجهة برمجة تطبيقات مُحددة للأدوات (أو لأي شيء حقًا)، فإننا نوصي مؤلفي القوالب بمحاولة الالتزام بواجهة برمجة تطبيقات قياسية حيثما أمكن. الهدف النهائي لقوالب الدردشة هو السماح بنقل التعليمات البرمجية عبر النماذج، لذا فإن الانحراف عن واجهة برمجة تطبيقات الأدوات القياسية يعني أن المستخدمين سيضطرون إلى كتابة تعليمات برمجية مخصصة لاستخدام الأدوات مع نموذجك. في بعض الأحيان يكون ذلك أمرًا لا مفر منه، ولكن غالبًا ما يكون من الممكن استخدام واجهة برمجة التطبيقات القياسية من خلال استخدام قوالب ذكية! + +أدناه، سنُدرج عناصر واجهة برمجة التطبيقات القياسية، ونقدم نصائح حول كتابة قوالب ستعمل بشكل جيد معها. + +#### تعريفات الأدوات +يجب أن يتوقع قالبك أن يكون المتغير tools إما فارغًا (إذا لم يتم تمرير أي أدوات)، أو قائمة من قواميس مخطط JSON. تسمح أساليب قالب الدردشة الخاصة بنا للمستخدمين بتمرير الأدوات إما كمخطط JSON أو كدوال Python، ولكن عندما يتم تمرير الدوال، فإننا نقوم تلقائيًا بإنشاء مخطط JSON وتمريره إلى قالبك. 
نتيجة لذلك، سيكون متغير tools الذي يستقبله قالبك دائمًا قائمة من مخططات JSON. هنا مخطط JSON أداة نموذجي: + +```json +{ + "type": "function", + "function": { + "name": "multiply", + "description": "دالة تضرب عددين", + "parameters": { + "type": "object", + "properties": { + "a": { + "type": "number", + "description": "الرقم الأول للضرب" + }, + "b": { + "type": "number", + "description": "الرقم الثاني للضرب" + } + }, + "required": ["a", "b"] + } + } +} +``` + +وهنا بعض الأمثلة البرمجية للتعامل مع الأدوات في قالب الدردشة الخاص بك. تذكر أن هذا مجرد مثال لتنسيق مُحدد - من المحتمل أن يحتاج نموذجك إلى تنسيق مختلف! +```text +{%- if tools %} + {%- for tool in tools %} + {{- '' + tool['function']['name'] + '\n' }} + {%- for argument in tool['function']['parameters']['properties'] %} + {{- argument + ': ' + tool['function']['parameters']['properties'][argument]['description'] + '\n' }} + {%- endfor %} + {{- '\n' }} + {%- endif %} +{%- endif %} +``` + +يجب بالطبع اختيار الرموز المحددة ووصف الأدوات التي يُعرضها قالبك لتتناسب مع تلك التي تم تدريب نموذجك عليها. لا يوجد شرط أن يفهم نموذجك مُدخلات مخطط JSON، فقط أن يتمكن قالبك من ترجمة مخطط JSON إلى تنسيق نموذجك. على سبيل المثال، تم تدريب Command-R باستخدام أدوات مُعرّفة باستخدام رؤوس دوال Python، ولكن يقبل قالب أداة Command-R مخطط JSON، ويُحوّل الأنواع داخليًا ويُعرض أدوات الإدخال كعناوين Python. يمكنك فعل الكثير باستخدام القوالب! + +#### استدعاءات الأدوات +استدعاءات الأدوات، إذا كانت موجودة، ستكون قائمة مُرفقة برسالة بدور "assistant". لاحظ أن tool_calls هي دائمًا قائمة، على الرغم من أن معظم نماذج استدعاء الأدوات تدعم فقط استدعاءات أدوات فردية في كل مرة، مما يعني أن القائمة ستحتوي عادةً على عنصر واحد فقط. 
هنا قاموس رسالة نموذجي يحتوي على استدعاء أداة: + +```json +{ + "role": "assistant", + "tool_calls": [ + { + "type": "function", + "function": { + "name": "multiply", + "arguments": { + "a": 5, + "b": 6 + } + } + } + ] +} +``` +والنمط الشائع للتعامل معها سيكون كهذا: + +```text +{%- if message['role'] == 'assistant' and 'tool_calls' in message %} + {%- for tool_call in message['tool_calls'] %} + {{- '' + tool_call['function']['name'] + '\n' + tool_call['function']['arguments']|tojson + '\n' }} + {%- endif %} + {%- endfor %} +{%- endif %} +``` + +مرة أخرى، يجب عليك عرض استدعاء الأداة بالتنسيق والرموز الخاصة التي يتوقعها نموذجك. + +#### استجابات الأدوات +استجابات الأدوات لها تنسيق بسيط: إنها قاموس رسالة بدور "tool"، ومفتاح "name" يُعطي اسم الدالة المُستدعاة، ومفتاح "content" يحتوي على نتيجة استدعاء الأداة. هنا استجابة أداة نموذجية: + +```json +{ + "role": "tool", + "name": "multiply", + "content": "30" +} +``` +لست بحاجة إلى استخدام جميع المفاتيح في استجابة الأداة. على سبيل المثال، إذا كان نموذجك لا يتوقع تضمين اسم الدالة في استجابة الأداة، فيمكن أن يكون عرضها بسيطًا مثل: + +```text +{%- if message['role'] == 'tool' %} + {{- "" + message['content'] + "" }} +{%- endif %} +``` + +مرة أخرى، تذكر أن التنسيق الفعلي والرموز الخاصة خاصة بالنموذج - يجب أن تُولي عناية كبيرة لضمان أن الرموز والمسافات الفارغة وكل شيء آخر يتطابق تمامًا مع التنسيق الذي تم تدريب نموذجك عليه! diff --git a/docs/source/ar/create_a_model.md b/docs/source/ar/create_a_model.md new file mode 100644 index 000000000000..6b511fe0de4a --- /dev/null +++ b/docs/source/ar/create_a_model.md @@ -0,0 +1,436 @@ +# إنشاء بنية مخصصة + +تحدد فئة [`AutoClass`](model_doc/auto) تلقائيًا بنية النموذج وتقوم بتنزيل تكوين وأوزان مسبقين للنموذج. بشكل عام، نوصي باستخدام `AutoClass` لإنتاج كود غير مرتبط بنسخة معينة. ولكن يمكن للمستخدمين الذين يريدون مزيدًا من التحكم في معلمات النموذج المحددة إنشاء نموذج مخصص من 🤗 Transformers من مجرد بضع فئات أساسية. 
قد يكون هذا مفيدًا بشكل خاص لأي شخص مهتم بدراسة نموذج 🤗 Transformers أو تدريبه أو إجراء تجارب عليه. في هذا الدليل، سنغوص بشكل أعمق في إنشاء نموذج مخصص بدون `AutoClass`. تعرف على كيفية: + +- تحميل تكوين النموذج وتخصيصه. +- إنشاء بنية نموذج. +- إنشاء مجزء لغوى سريع وبطيء للنص. +- إنشاء معالج صور لمهام الرؤية. +- إنشاء مستخرج ميزات لمهام الصوت. +- إنشاء معالج للمهام متعددة الوسائط. + +## التكوين + +يشير مصطلح [التكوين](main_classes/configuration) إلى الخصائص المحددة للنموذج. لكل تكوين نموذج خصائصه الخاصة؛ على سبيل المثال، تشترك جميع نماذج NLP في الخصائص `hidden_size` و`num_attention_heads` و`num_hidden_layers` و`vocab_size` المشتركة. تحدد هذه الخصائص عدد رؤوس الانتباه أو الطبقات المخفية لبناء نموذج بها. + +اطلع على [DistilBERT](model_doc/distilbert) من خلال [`DistilBertConfig`] لمعاينة خصائصه: + +```py +>>> from transformers import DistilBertConfig + +>>> config = DistilBertConfig() +>>> print(config) +DistilBertConfig { + "activation": "gelu", + "attention_dropout": 0.1, + "dim": 768, + "dropout": 0.1, + "hidden_dim": 3072, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "model_type": "distilbert", + "n_heads": 12, + "n_layers": 6, + "pad_token_id": 0, + "qa_dropout": 0.1, + "seq_classif_dropout": 0.2, + "sinusoidal_pos_embds": false, + "transformers_version": "4.16.2", + "vocab_size": 30522 +} +``` + +يعرض [`DistilBertConfig`] جميع الخصائص الافتراضية المستخدمة لبناء نموذج [`DistilBertModel`] أساسي. جميع الخصائص قابلة للتعديل، مما ييتيح مجالاً للتجريب. على سبيل المثال، يمكنك تعديل نموذج افتراضي لـ: + +- تجربة دالة تنشيط مختلفة باستخدام معامل `activation`. +- استخدام معدل إسقاط أعلى الاحتمالات الانتباه مع معامل `attention_dropout`. 
+ +```py +>>> my_config = DistilBertConfig(activation="relu", attention_dropout=0.4) +>>> print(my_config) +DistilBertConfig { + "activation": "relu", + "attention_dropout": 0.4, + +``` + +يمكن تعديل خصائص النموذج المدرب مسبقًا في دالة [`~PretrainedConfig.from_pretrained`] : + +```py +>>> my_config = DistilBertConfig.from_pretrained("distilbert/distilbert-base-uncased", activation="relu", attention_dropout=0.4) +``` + +بمجرد أن تصبح راضيًا عن تكوين نموذجك، يمكنك حفظه باستخدام [`~PretrainedConfig.save_pretrained`]. يتم تخزين ملف التكوين الخاص بك على أنه ملف JSON في دليل الحفظ المحدد: + +```py +>>> my_config.save_pretrained(save_directory="./your_model_save_path") +``` + +لإعادة استخدام ملف التكوين، قم بتحميله باستخدام [`~PretrainedConfig.from_pretrained`]: + +```py +>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/config.json") +``` + + +يمكنك أيضًا حفظ ملف التكوين كقاموس أو حتى كفرق بين خصائص التكوين المُعدّلة وخصائص التكوين الافتراضية! راجع وثائق [التكوين](main_classes/configuration) لمزيد من التفاصيل. + + + +## النموذج + +الخطوة التالية هي إنشاء [نموذج](main_classes/models). النموذج - ويُشار إليه أحيانًا باسم البنية - يُحدد وظيفة كل طبقة والعمليات الحسابية المُنفذة. تُستخدم خصائص مثل `num_hidden_layers` من التكوين لتحديد هذه البنية. تشترك جميع النماذج في فئة أساسية واحدة هي [`PreTrainedModel`] وبعض الوظائف المُشتركة مثل تغيير حجم مُدخلات الكلمات وتقليص رؤوس آلية الانتباه الذاتي. بالإضافة إلى ذلك، فإن جميع النماذج هي فئات فرعية إما من [`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html)، [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) أو [`flax.linen.Module`](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html). هذا يعني أن النماذج متوافقة مع كل استخدام لإطار العمل الخاص بها. 
+ + + +قم بتحميل خصائص التكوين المخصصة الخاصة بك في النموذج: + +```py +>>> from transformers import DistilBertModel + +>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/config.json") +>>> model = DistilBertModel(my_config) +``` + +هذا ينشئ نموذجًا بقيم عشوائية بدلاً من الأوزان المُدربة مسبقًا. لن يكون هذا النموذج مفيدًا حتى يتم تدريبه. تُعد عملية التدريب مكلفة وتستغرق وقتًا طويلاً. من الأفضل بشكل عام استخدام نموذج مُدرب مسبقًا للحصول على نتائج أفضل بشكل أسرع، مع استخدام جزء بسيط فقط من الموارد المطلوبة للتدريب. + +قم بإنشاء نموذج مُدرب مسبقًا باستخدام [`~PreTrainedModel.from_pretrained`]: + +```py +>>> model = DistilBertModel.from_pretrained("distilbert/distilbert-base-uncased") +``` + +عند تحميل الأوزان المُدربة مسبقًا، يتم تحميل تكوين النموذج الافتراضي تلقائيًا إذا كان النموذج من مكتبة 🤗 Transformers. ومع ذلك، يمكنك أيضًا استبدال - بعض أو كل - إعدادات النموذج الافتراضية بإعداداتك الخاصة: + +```py +>>> model = DistilBertModel.from_pretrained("distilbert/distilbert-base-uncased", config=my_config) +``` + + +قم بتحميل خصائص التكوين المُخصصة الخاصة بك في النموذج: + +```py +>>> from transformers import TFDistilBertModel + +>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") +>>> tf_model = TFDistilBertModel(my_config) +``` + +هذا ينشئ نموذجًا بقيم عشوائية بدلاً من الأوزان المُدربة مسبقًا. لن يكون هذا النموذج مفيدًا حتى يتم تدريبه. تُعد عملية التدريب مكلفة وتستغرق وقتًا طويلاً. من الأفضل بشكل عام استخدام نموذج مُدرب مسبقًا للحصول على نتائج أفضل بشكل أسرع، مع استخدام جزء بسيط فقط من الموارد المطلوبة للتدريب. + +قم بإنشاء نموذج مُدرب مسبقًا باستخدام [`~TFPreTrainedModel.from_pretrained`]: + +```py +>>> tf_model = TFDistilBertModel.from_pretrained("distilbert/distilbert-base-uncased") +``` + +عندما تقوم بتحميل الأوزان المُدربة مسبقًا، يتم تحميل إعدادات النموذج الافتراضي تلقائيًا إذا كان النموذج من مكتبة 🤗 Transformers. 
ومع ذلك، يمكنك أيضًا استبدال - بعض أو كل - إعدادات النموذج الافتراضية بإعداداتك الخاصة: + +```py +>>> tf_model = TFDistilBertModel.from_pretrained("distilbert/distilbert-base-uncased", config=my_config) +``` + + + +### رؤوس النموذج + +في هذه المرحلة، لديك نموذج DistilBERT الأساسي الذي يخرج *الحالات الكامنة*. تُمرَّر هذه الحالات الكامنة كمدخلات لرأس النموذج لإنتاج المخرجات النهائية. توفر مكتبة 🤗 Transformers رأس نموذج مختلف لكل مهمة طالما أن النموذج يدعم المهمة (أي لا يمكنك استخدام DistilBERT لمهمة تسلسل إلى تسلسل مثل الترجمة). + + + +على سبيل المثال، [`DistilBertForSequenceClassification`] هو نموذج DistilBERT الأساسي مزودًا برأس تصنيف تسلسلي. يُشكّل رأس التصنيف التسلسلي طبقة خطية فوق المخرجات المجمعة. + +```py +>>> from transformers import DistilBertForSequenceClassification + +>>> model = DistilBertForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased") +``` + +أعد استخدام نقطة التحقق هذه لمهمة أخرى بسهولة، وذلك بتغيير رأس النموذج. ففي مهمة الإجابة على الأسئلة، ستستخدم رأس النموذج [`DistilBertForQuestionAnswering`]. رأس الإجابة على الأسئلة مشابه لرأس التصنيف التسلسلي باستثناء أنه طبقة خطية فوق مخرجات الحالات الكامنة. + +```py +>>> from transformers import DistilBertForQuestionAnswering + +>>> model = DistilBertForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased") +``` + + +على سبيل المثال، [`TFDistilBertForSequenceClassification`] هو نموذج DistilBERT الأساسي برأس تصنيف تسلسل. رأس التصنيف التسلسلي هو طبقة خطية أعلى المخرجات المجمعة. + +```py +>>> from transformers import TFDistilBertForSequenceClassification + +>>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased") +``` + +أعد استخدام نقطة التحقق هذه لمهمة أخرى عن طريق التبديل إلى رأس نموذج مختلف. لمهمة الإجابة على الأسئلة، ستستخدم رأس النموذج [`TFDistilBertForQuestionAnswering`]. رأس الإجابة على الأسئلة مشابه لرأس التصنيف التسلسلي باستثناء أنه طبقة خطية أعلى حالات الإخراج المخفية. 
+ +```py +>>> from transformers import TFDistilBertForQuestionAnswering + +>>> tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased") +``` + + + +## مجزئ النصوص + +الفئة الأساسية الأخيرة التي تحتاجها قبل استخدام نموذج للبيانات النصية هي [مجزئ النصوص](main_classes/tokenizer) لتحويل النص الخام إلى تنسورات (tensors). هناك نوعان من مجزئات النصوص التي يمكنك استخدامها مع 🤗 Transformers: + +- [`PreTrainedTokenizer`]: تنفيذ Python لمجزئ النصوص. + - [`PreTrainedTokenizerFast`]: مجزئ النصوص من مكتبة [🤗 Tokenizer](https://huggingface.co/docs/tokenizers/python/latest/) المُبنية على لغة Rust. هذا النوع من المجزئات أسرع بكثير، خاصةً عند معالجة دفعات النصوص، وذلك بفضل تصميمه بلغة Rust. كما يوفر مجزئ النصوص السريع طرقًا إضافية مثل *مخطط الإزاحة* الذي يُطابق الرموز بكلماتها أو أحرفها الأصلية. + +يدعم كلا النوعين من المجزئات طرقًا شائعة مثل الترميز وفك الترميز، وإضافة رموز جديدة، وإدارة الرموز الخاصة. + + + +لا يدعم كل نموذج مجزئ نصوص سريعًا. ألقِ نظرة على هذا [الجدول](index#supported-frameworks) للتحقق مما إذا كان النموذج يحتوي على دعم لمجزئ نصوص سريع. + + + +إذا دربت مجزئ نصوص خاصًا بك، فيمكنك إنشاء واحد من *قاموسك*: + +```py +>>> from transformers import DistilBertTokenizer + +>>> my_tokenizer = DistilBertTokenizer(vocab_file="my_vocab_file.txt", do_lower_case=False, padding_side="left") +``` + +من المهم أن تتذكر أن قاموس مجزئ النصوص المُخصص سيكون مختلفًا عن قاموس مجزئ النصوص لنموذج مُدرّب مسبقًا. يجب عليك استخدام قاموس نموذج مُدرّب مسبقًا إذا كنت تستخدم نموذجًا مُدرّبًا مسبقًا، وإلا فلن تكون المدخلات ذات معنى. 
قم بإنشاء مجزئ النصوص باستخدام قاموس نموذج مُدرّب مسبقًا باستخدام فئة [`DistilBertTokenizer`]: + +```py +>>> from transformers import DistilBertTokenizer + +>>> slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert/distilbert-base-uncased") +``` + +قم بإنشاء مجزئ نصوص سريع باستخدام فئة [`DistilBertTokenizerFast`]: + +```py +>>> from transformers import DistilBertTokenizerFast + +>>> fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert/distilbert-base-uncased") +``` + + +افتراضيًا، سيحاول [`AutoTokenizer`] تحميل مجزئ نصوص سريع. يمكنك تعطيل هذا السلوك عن طريق تعيين `use_fast=False` في `from_pretrained`. + + +## معالج الصور + +يعالج معالج الصور بيانات الرؤية. وهو يرث من الفئة الأساسية [`~image_processing_utils.ImageProcessingMixin`]. + +لبناء معالج صور خاص بالنموذج المستخدم، أنشئ مثلاً مُعالج [`ViTImageProcessor`] افتراضيًا إذا كنت تستخدم [ViT](model_doc/vit) لتصنيف الصور: + +```py +>>> from transformers import ViTImageProcessor + +>>> vit_extractor = ViTImageProcessor() +>>> print(vit_extractor) +ViTImageProcessor { + "do_normalize": true, + "do_resize": true, + "image_processor_type": "ViTImageProcessor", + "image_mean": [ + 0.5, + 0.5, + 0.5 + ], + "image_std": [ + 0.5, + 0.5, + 0.5 + ], + "resample": 2, + "size": 224 +} +``` + + + +إذا كنت لا تبحث عن أي تخصيص، فما عليك سوى استخدام طريقة `from_pretrained` لتحميل معلمات معالج الصور الافتراضية للنموذج. + + + +عدل أيًا من معلمات [`ViTImageProcessor`] لإنشاء معالج الصور المخصص الخاص بك: + +```py +>>> from transformers import ViTImageProcessor + +>>> my_vit_extractor = ViTImageProcessor(resample="PIL.Image.BOX", do_normalize=False, image_mean=[0.3, 0.3, 0.3]) +>>> print(my_vit_extractor) +ViTImageProcessor { + "do_normalize": false, + "do_resize": true, + "image_processor_type": "ViTImageProcessor", + "image_mean": [ + 0.3, + 0.3, + 0.3 + ], + "image_std": [ + 0.5, + 0.5, + 0.5 + ], + "resample": "PIL.Image.BOX", + "size": 224 +} +``` +## العمود الفقري + +
+ +
+ +تتكون نماذج رؤية الحاسب من جزء أساسي، وجزء وسيط، وجزء معالجة نهائي. يستخرج الجزء الأساسي الميزات من صورة الإدخال، ويجمع الجزء الوسيط هذه الميزات المستخرجة ويعززها، ويُستخدم الجزء النهائي للمهمة الرئيسية (مثل اكتشاف الأجسام). ابدأ بتهيئة الجزء الأساسي في تكوين النموذج وحدد ما إذا كنت تريد تحميل أوزان مدربة مسبقًا أو أوزانًا عشوائية. بعد ذلك، يمكنك تمرير تكوين النموذج إلى جزء المعالجة النهائي. + +على سبيل المثال، لتحميل [ResNet](../model_doc/resnet) backbone في نموذج [MaskFormer](../model_doc/maskformer) مع رأس تجزئة مثيل: + + + + +قم بتعيين `use_pretrained_backbone=True` لتحميل الأوزان المسبقة التدريب لـ ResNet للعمود الفقري. + +```py +from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation + +config = MaskFormerConfig(backbone="microsoft/resnet-50", use_pretrained_backbone=True) # تكوين الجزء الأساسي والجزء الوسيط +model = MaskFormerForInstanceSegmentation(config) # جزء المعالجة النهائي +``` + + + + +قم بتعيين `use_pretrained_backbone=False` لتهيئة جزء ResNet الأساسي بشكل عشوائي. + +```py +from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation + +config = MaskFormerConfig(backbone="microsoft/resnet-50", use_pretrained_backbone=False) # تكوين الجزء الأساسي والجزء الوسيط +model = MaskFormerForInstanceSegmentation(config) # جزء المعالجة النهائي +``` + +يمكنك أيضًا تحميل تكوين الجزء الأساسي بشكل منفصل، ثم تمريره إلى تكوين النموذج. + +```py +from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation, ResNetConfig + +backbone_config = ResNetConfig() +config = MaskFormerConfig(backbone_config=backbone_config) +model = MaskFormerForInstanceSegmentation(config) +``` + + + + +يتم تحميل نماذج [timm](https://hf.co/docs/timm/index) داخل نموذج باستخدام `use_timm_backbone=True` أو باستخدام [`TimmBackbone`] و [`TimmBackboneConfig`]. + +استخدم `use_timm_backbone=True` و `use_pretrained_backbone=True` لتحميل أوزان timm المُدرّبة مسبقًا للجزء الأساسي. 
+ +```python +from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation + +config = MaskFormerConfig(backbone="resnet50", use_pretrained_backbone=True, use_timm_backbone=True) # تكوين الجزء الأساسي والجزء الوسيط +model = MaskFormerForInstanceSegmentation(config) # جزء المعالجة النهائي +``` + +قم بتعيين `use_timm_backbone=True` و `use_pretrained_backbone=False` لتحميل عمود فقري timm مبدئي عشوائي. + +```python +from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation + +config = MaskFormerConfig(backbone="resnet50", use_pretrained_backbone=False, use_timm_backbone=True) # تكوين الجزء الأساسي والجزء الوسيط +model = MaskFormerForInstanceSegmentation(config) # جزء المعالجة النهائي +``` + +يمكنك أيضًا تحميل تكوين الجزء الأساسي واستخدامه لإنشاء `TimmBackbone` أو تمريره إلى تكوين النموذج. سيتم تحميل أوزان الجزء الأساسي لـ Timm المُدرّبة مسبقًا افتراضيًا. عيّن `use_pretrained_backbone=False` لتحميل الأوزان المبدئية العشوائية. + +```python +from transformers import TimmBackboneConfig, TimmBackbone + +backbone_config = TimmBackboneConfig("resnet50", use_pretrained_backbone=False) + +# قم بإنشاء مثيل من العمود الفقري +backbone = TimmBackbone(config=backbone_config) + +# قم بإنشاء نموذج باستخدام عمود فقري timm +from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation + +config = MaskFormerConfig(backbone_config=backbone_config) +model = MaskFormerForInstanceSegmentation(config) +``` + +## مستخرج الميزات + +يقوم مُستخرج الميزات بمعالجة المدخلات الصوتية. يرث من فئة الأساس [`~feature_extraction_utils.FeatureExtractionMixin`]، وقد يرث أيضًا من فئة [`SequenceFeatureExtractor`] لمعالجة المدخلات الصوتية. + +للاستخدام، قم بإنشاء مستخرج ميزات مرتبط بالنموذج الذي تستخدمه. 
على سبيل المثال، قم بإنشاء مستخرج ميزات Wav2Vec2 الافتراضي إذا كنت تستخدم [Wav2Vec2](model_doc/wav2vec2) لتصنيف الصوت: + +```py +>>> from transformers import Wav2Vec2FeatureExtractor + +>>> w2v2_extractor = Wav2Vec2FeatureExtractor() +>>> print(w2v2_extractor) +Wav2Vec2FeatureExtractor { + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": false, + "sampling_rate": 16000 +} +``` + + +إذا لم تكن بحاجة لأي تخصيص، فاستخدم فقط طريقة `from_pretrained` لتحميل معلمات مستخرج الميزات الافتراضية للنموذج. + + +قم بتعديل أي من معلمات [`Wav2Vec2FeatureExtractor`] لإنشاء مستخرج ميزات مخصص: + +```py +>>> from transformers import Wav2Vec2FeatureExtractor + +>>> w2v2_extractor = Wav2Vec2FeatureExtractor(sampling_rate=8000, do_normalize=False) +>>> print(w2v2_extractor) +Wav2Vec2FeatureExtractor { + "do_normalize": false, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": false, + "sampling_rate": 8000 +} +``` + +## المعالج + +بالنسبة للنماذج التي تدعم مهام الوسائط المتعددة، توفر مكتبة 🤗 Transformers فئة معالج تجمع بفاعلية فئات المعالجة مثل مستخرج الميزات ومقسّم الرموز في كائن واحد. على سبيل المثال، دعنا نستخدم [`Wav2Vec2Processor`] لمهمة التعرف الآلي على الكلام (ASR). تقوم مهمة ASR بتحويل الصوت إلى نص، لذلك ستحتاج إلى مستخرج ميزات ومقسّم رموز. 
+ +قم بإنشاء مستخرج ميزات لمعالجة المدخلات الصوتية: + +```py +>>> from transformers import Wav2Vec2FeatureExtractor + +>>> feature_extractor = Wav2Vec2FeatureExtractor(padding_value=1.0, do_normalize=True) +``` + +قم بإنشاء مقسّم رموز لمعالجة المدخلات النصية: + +```py +>>> from transformers import Wav2Vec2CTCTokenizer + +>>> tokenizer = Wav2Vec2CTCTokenizer(vocab_file="my_vocab_file.txt") +``` + +قم بدمج مستخرج الميزات ومقسّم الرموز في [`Wav2Vec2Processor`]: + +```py +>>> from transformers import Wav2Vec2Processor + +>>> processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer) +``` + +باستخدام فئتين أساسيتين - التكوين والنموذج - بالإضافة إلى فئة معالجة مسبق (مقسّم رموز أو معالج صورة أو مستخرج ميزات أو معالج)، يمكنك إنشاء أي من النماذج التي تدعمها مكتبة 🤗 Transformers. يمكن تكوين كل من هذه الفئات الأساسية، مما يسمح لك باستخدام السمات المطلوبة. يمكنك بسهولة تهيئة نموذج للتدريب أو تعديل نموذج مدرب مسبقاً لإجراء ضبط دقيق. diff --git a/docs/source/ar/custom_models.md b/docs/source/ar/custom_models.md new file mode 100644 index 000000000000..daaba5e54ee2 --- /dev/null +++ b/docs/source/ar/custom_models.md @@ -0,0 +1,323 @@ +# بناء نماذج مخصصة + +تم تصميم مكتبة 🤗 Transformers لتكون قابلة للتوسيع بسهولة. كل نموذج مُشفّر بالكامل في مجلد فرعي معين بالمستودع، دون أي تجريد، لذلك يمكنك بسهولة نسخ ملف النمذجة وتعديله وفقًا لاحتياجاتك. + +إذا كنت تُنشئ نموذجًا جديدًا تمامًا، فقد يكون من الأسهل البدء من الصفر. في هذا البرنامج التعليمي، سنُرِيك كيفية كتابة نموذج مخصص وتكوينه ليُستخدم داخل Transformers، وكيفية مشاركته مع المجتمع (مع الكود الذي يعتمد عليه) بحيث يمكن لأي شخص استخدامه، حتى إذا لم يكن موجودًا في مكتبة 🤗 Transformers. سنرى كيفية البناء على المحولات ونوسّع الإطار باستخدام الأدوات التي يمكن استخدامها لتعديل سلوك الإطار (hooks) والتعليمات البرمجية المخصصة. + +سنوضح كل هذا من خلال نموذج ResNet، بتغليف فئة ResNet من +[مكتبة timm](https://github.com/rwightman/pytorch-image-models) داخل [`PreTrainedModel`]. 
+ +## كتابة إعدادات مخصصة + +لنبدأ بكتابة إعدادات النموذج. إعدادات النموذج هو كائنٌ يحتوي على جميع المعلومات اللازمة لبنائه. كما سنرى لاحقًا، يتطلب النموذج كائن `config` لتهيئته، لذا يجب أن يكون هذا الكائن كاملاً. + + + +تتبع النماذج في مكتبة `transformers` اتفاقية قبول كائن `config` في دالة `__init__` الخاصة بها، ثم تمرر كائن `config` بالكامل إلى الطبقات الفرعية في النموذج، بدلاً من تقسيمه إلى معامﻻت متعددة. يؤدي كتابة نموذجك بهذا الأسلوب إلى كود أبسط مع "مصدر حقيقة" واضح لأي فرط معلمات، كما يسهل إعادة استخدام الكود من نماذج أخرى في `transformers`. + + + +في مثالنا، سنعدّل بعض الوسائط في فئة ResNet التي قد نرغب في ضبطها. ستعطينا التكوينات المختلفة أنواع ResNets المختلفة الممكنة. سنقوم بتخزين هذه الوسائط بعد التحقق من صحته. + +```python +from transformers import PretrainedConfig +from typing import List + + +class ResnetConfig(PretrainedConfig): + model_type = "resnet" + + def __init__( + self, + block_type="bottleneck", + layers: List[int] = [3, 4, 6, 3], + num_classes: int = 1000, + input_channels: int = 3, + cardinality: int = 1, + base_width: int = 64, + stem_width: int = 64, + stem_type: str = "", + avg_down: bool = False, + **kwargs, + ): + if block_type not in ["basic", "bottleneck"]: + raise ValueError(f"`block_type` must be 'basic' or bottleneck', got {block_type}.") + if stem_type not in ["", "deep", "deep-tiered"]: + raise ValueError(f"`stem_type` must be '', 'deep' or 'deep-tiered', got {stem_type}.") + + self.block_type = block_type + self.layers = layers + self.num_classes = num_classes + self.input_channels = input_channels + self.cardinality = cardinality + self.base_width = base_width + self.stem_width = stem_width + self.stem_type = stem_type + self.avg_down = avg_down + super().__init__(**kwargs) +``` +الأشياء الثلاثة المهمة التي يجب تذكرها عند كتابة تكوينك الخاص هي: + +- يجب أن ترث من `PretrainedConfig`، +- يجب أن تقبل دالة `__init__` الخاصة بـ `PretrainedConfig` أي معامﻻت إضافية kwargs، +- يجب تمرير هذه المعامﻻت الإضافية إلى دالة `__init__` فى 
الفئة الأساسية الأعلى. + +يضمن الإرث حصولك على جميع الوظائف من مكتبة 🤗 Transformers، في حين أن القيدين الثاني والثالث يأتيان من حقيقة أن `PretrainedConfig` لديه المزيد من الحقول أكثر من تلك التي تقوم بتعيينها. عند إعادة تحميل تكوين باستخدام طريقة `from_pretrained`، يجب أن يقبل تكوينك هذه الحقول ثم إرسالها إلى الفئة الأساسية الأعلى. + +تحديد `model_type` لتكوينك (هنا `model_type="resnet"`) ليس إلزاميًا، ما لم ترغب في +تسجيل نموذجك باستخدام الفئات التلقائية (راجع القسم الأخير). + +مع القيام بذلك، يمكنك بسهولة إنشاء تكوينك وحفظه مثلما تفعل مع أي تكوين نموذج آخر في +المكتبة. إليك كيفية إنشاء تكوين resnet50d وحفظه: + +```py +resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True) +resnet50d_config.save_pretrained("custom-resnet") +``` + +سيؤدي هذا إلى حفظ ملف باسم `config.json` داخل مجلد `custom-resnet`. يمكنك بعد ذلك إعادة تحميل تكوينك باستخدام +طريقة `from_pretrained`: + +```py +resnet50d_config = ResnetConfig.from_pretrained("custom-resnet") +``` + +يمكنك أيضًا استخدام أي طريقة أخرى من فئة [`PretrainedConfig`]، مثل [`~PretrainedConfig.push_to_hub`] لتحميل تكوينك مباشرة إلى Hub. + +## كتابة نموذج مخصص + +الآن بعد أن أصبح لدينا تكوين ResNet، يمكننا المتابعة لإنشاء نموذجين: الأول يستخرج الميزات المخفية من دفعة من الصور (مثل [`BertModel`]) والآخر مناسب لتصنيف الصور (مثل [`BertForSequenceClassification`]). + + كما ذكرنا سابقًا، سنقوم ببناء نموذج مبسط لتسهيل الفهم في هذا المثال. الخطوة الوحيدة المطلوبة قبل كتابة هذه الفئة هي ربط أنواع وحدات البناء بفئات ذات وحدات بناء فعلية. 
بعد ذلك، يُعرّف النموذج من خلال التكوين عبر تمرير كل شيء إلى فئة `ResNet`: + +```py +from transformers import PreTrainedModel +from timm.models.resnet import BasicBlock, Bottleneck, ResNet +from .configuration_resnet import ResnetConfig + + +BLOCK_MAPPING = {"basic": BasicBlock, "bottleneck": Bottleneck} + + +class ResnetModel(PreTrainedModel): + config_class = ResnetConfig + + def __init__(self, config): + super().__init__(config) + block_layer = BLOCK_MAPPING[config.block_type] + self.model = ResNet( + block_layer, + config.layers, + num_classes=config.num_classes, + in_chans=config.input_channels, + cardinality=config.cardinality, + base_width=config.base_width, + stem_width=config.stem_width, + stem_type=config.stem_type, + avg_down=config.avg_down, + ) + + def forward(self, tensor): + return self.model.forward_features(tensor) +``` + +بالنسبة للنموذج الذي سيصنف الصور، فإننا نغير فقط طريقة التقديم: + +```py +import torch + + +class ResnetModelForImageClassification(PreTrainedModel): + config_class = ResnetConfig + + def __init__(self, config): + super().__init__(config) + block_layer = BLOCK_MAPPING[config.block_type] + self.model = ResNet( + block_layer, + config.layers, + num_classes=config.num_classes, + in_chans=config.input_channels, + cardinality=config.cardinality, + base_width=config.base_width, + stem_width=config.stem_width, + stem_type=config.stem_type, + avg_down=config.avg_down, + ) + + def forward(self, tensor, labels=None): + logits = self.model(tensor) + if labels is not None: + loss = torch.nn.functional.cross_entropy(logits, labels) + return {"loss": loss, "logits": logits} + return {"logits": logits} +``` +في كلتا الحالتين، لاحظ كيف نرث من `PreTrainedModel` ونستدعي مُهيئ الفئة الرئيسية باستخدام `config` (كما تفعل عند إنشاء وحدة `torch.nn.Module` عادية). ليس من الضروري تعريف `config_class` إلا إذا كنت ترغب في تسجيل نموذجك مع الفئات التلقائية (راجع القسم الأخير). 
+ + + +إذا كان نموذجك مشابهًا جدًا لنموذج داخل المكتبة، فيمكنك إعادة استخدام نفس التكوين مثل هذا النموذج. + + + +يمكن لنموذجك أن يعيد أي شيء تريده، ولكن إعادة قاموس مثلما فعلنا لـ +`ResnetModelForImageClassification`، مع تضمين الخسارة عند تمرير العلامات، سيجعل نموذجك قابلًا للاستخدام مباشرة داخل فئة [`Trainer`]. يعد استخدام تنسيق إخراج آخر أمرًا جيدًا طالما أنك تخطط لاستخدام حلقة تدريب خاصة بك أو مكتبة أخرى للتدريب. + +الآن بعد أن أصبح لدينا فئة النموذج، دعنا ننشئ واحدة: + +```py +resnet50d = ResnetModelForImageClassification(resnet50d_config) +``` + +يمكنك استخدام أي من طرق فئة [`PreTrainedModel`]، مثل [`~PreTrainedModel.save_pretrained`] أو +[`~PreTrainedModel.push_to_hub`]. سنستخدم الثاني في القسم التالي، وسنرى كيفية دفع أوزان النموذج مع كود نموذجنا. ولكن أولاً، دعنا نحمل بعض الأوزان المُعلمة مسبقًا داخل نموذجنا. + +في حالة الاستخدام الخاصة بك، فمن المحتمل أن تقوم بتدريب نموذجك المخصص على بياناتك الخاصة. للانتقال بسرعة خلال هذا البرنامج التعليمي، +سنستخدم الإصدار المُعلم مسبقًا من resnet50d. نظرًا لأن نموذجنا هو مجرد غلاف حوله، فمن السهل نقل هذه الأوزان: + +```py +import timm + +pretrained_model = timm.create_model("resnet50d", pretrained=True) +resnet50d.model.load_state_dict(pretrained_model.state_dict()) +``` + +الآن دعونا نرى كيفية التأكد من أنه عند قيامنا بـ [`~PreTrainedModel.save_pretrained`] أو [`~PreTrainedModel.push_to_hub`]، يتم حفظ كود النموذج. + +## تسجيل نموذج مع كود مخصص للفئات التلقائية + +إذا كنت تكتب مكتبة توسع 🤗 Transformers، فقد ترغب في توسيع الفئات التلقائية لتشمل نموذجك الخاص. يختلف هذا عن نشر الكود إلى Hub بمعنى أن المستخدمين سيحتاجون إلى استيراد مكتبتك للحصول على النماذج المخصصة (على عكس تنزيل كود النموذج تلقائيًا من Hub). 
+ +ما دام تكوينك يحتوي على معامل `model_type` مختلفة عن أنواع النماذج الحالية، وأن فئات نماذجك لديك لديها الخصائص الصحيحة `config_class`، فيمكنك ببساطة إضافتها إلى الفئات التلقائية مثل هذا: + +```py +from transformers import AutoConfig, AutoModel, AutoModelForImageClassification + +AutoConfig.register("resnet", ResnetConfig) +AutoModel.register(ResnetConfig, ResnetModel) +AutoModelForImageClassification.register(ResnetConfig, ResnetModelForImageClassification) +``` + +لاحظ أن الحجة الأولى المستخدمة عند تسجيل تكوينك المخصص لـ [`AutoConfig`] يجب أن تتطابق مع `model_type` +من تكوينك المخصص، والحجة الأولى المستخدمة عند تسجيل نماذجك المخصصة لأي فئة نموذج تلقائي يجب +أن تتطابق مع `config_class` من تلك النماذج. + +## إرسال الكود إلى Hub + + + +هذا API تجريبي وقد يكون له بعض التغييرات الطفيفة في الإصدارات القادمة. + + + +أولاً، تأكد من تعريف نموذجك بالكامل في ملف `.py`. يمكن أن يعتمد على الاستيراد النسبي لملفات أخرى طالما أن جميع الملفات موجودة في نفس الدليل (لا ندعم الوحدات الفرعية لهذه الميزة حتى الآن). في مثالنا، سنحدد ملف `modeling_resnet.py` وملف `configuration_resnet.py` في مجلد باسم "resnet_model" في دليل العمل الحالي. يحتوي ملف التكوين على كود لـ `ResnetConfig` ويحتوي ملف النمذجة على كود لـ `ResnetModel` و`ResnetModelForImageClassification`. + +``` +. +└── resnet_model + ├── __init__.py + ├── configuration_resnet.py + └── modeling_resnet.py +``` + +يمكن أن يكون ملف `__init__.py` فارغًا، فهو موجود فقط حتى يتمكن Python من اكتشاف أن `resnet_model` يمكن استخدامه كموديل. + + + +إذا كنت تقوم بنسخ ملفات النمذجة من المكتبة، فسوف تحتاج إلى استبدال جميع الواردات النسبية في أعلى الملف +لاستيرادها من حزمة `transformers`. + + + +لاحظ أنه يمكنك إعادة استخدام (أو توسيع) تكوين/نموذج موجود. 
+ +لمشاركة نموذجك مع المجتمع، اتبع الخطوات التالية: أولاً، قم باستيراد نموذج ResNet والتكوين من الملفات التي تم إنشاؤها حديثًا: + +```py +from resnet_model.configuration_resnet import ResnetConfig +from resnet_model.modeling_resnet import ResnetModel, ResnetModelForImageClassification +``` + +بعد ذلك، يجب عليك إخبار المكتبة بأنك تريد نسخ ملفات الكود الخاصة بهذه الكائنات عند استخدام طريقة `save_pretrained` +وتسجيلها بشكل صحيح باستخدام فئة تلقائية (خاصة للنماذج)، ما عليك سوى تشغيل: + +```py +ResnetConfig.register_for_auto_class() +ResnetModel.register_for_auto_class("AutoModel") +ResnetModelForImageClassification.register_for_auto_class("AutoModelForImageClassification") +``` + +لاحظ أنه لا توجد حاجة لتحديد فئة تلقائية للتكوين (هناك فئة تلقائية واحدة فقط لها، +[`AutoConfig`]) ولكن الأمر يختلف بالنسبة للنماذج. قد يكون نموذجك المخصص مناسبًا للعديد من المهام المختلفة، لذلك يجب +تحديد أي من الفئات التلقائية هو الصحيح لنموذجك. + + + +استخدم `register_for_auto_class()` إذا كنت تريد نسخ ملفات الكود. إذا كنت تفضل استخدام الكود على Hub من مستودع آخر، +فلا تحتاج إلى استدعائه. في الحالات التي يوجد فيها أكثر من فئة تلقائية واحدة، يمكنك تعديل ملف `config.json` مباشرة باستخدام +الهيكل التالي: + +```json +"auto_map": { + "AutoConfig": "--", + "AutoModel": "--", + "AutoModelFor": "--", +}, +``` + + + +بعد ذلك، دعنا نقوم بإنشاء التكوين والنماذج كما فعلنا من قبل: + +```py +resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True) +resnet50d = ResnetModelForImageClassification(resnet50d_config) + +pretrained_model = timm.create_model("resnet50d", pretrained=True) +resnet50d.model.load_state_dict(pretrained_model.state_dict()) +``` + +الآن لإرسال النموذج إلى Hub، تأكد من تسجيل الدخول. 
إما بتشغيل الأمر التالي في الطرفية الخاصة بك: + +```bash +huggingface-cli login +``` + +أو من دفتر ملاحظات: + +```py +from huggingface_hub import notebook_login + +notebook_login() +``` + +يمكنك بعد ذلك الدفع إلى مساحة الاسم الخاصة بك (أو منظمة أنت عضو فيها) مثل هذا: + +```py +resnet50d.push_to_hub("custom-resnet50d") +``` + +بالإضافة إلى أوزان النمذجة والتكوين بتنسيق json، فقد قام هذا أيضًا بنسخ ملفات النمذجة والتكوين `.py` في مجلد `custom-resnet50d` وتحميل النتيجة إلى Hub. يمكنك التحقق من النتيجة في هذا [مستودع النموذج](https://huggingface.co/sgugger/custom-resnet50d). + +راجع [البرنامج التعليمي للمشاركة](model_sharing) لمزيد من المعلومات حول طريقة الدفع إلى المحور. + +### استخدام نموذج مع كود مخصص + +يمكنك استخدام أي تكوين أو نموذج أو مقسم لغوي مع ملفات برمجة مخصصة في مستودعه باستخدام الفئات التلقائية ودالة `from_pretrained`. تُفحص جميع الملفات والرموز المرفوعة إلى Hub بحثًا عن البرامج الضارة (راجع وثائق [أمان Hub](https://huggingface.co/docs/hub/security#malware-scanning) لمزيد من المعلومات)، ولكن يجب عليك مراجعة كود النموذج والمؤلف لتجنب تنفيذ التعليمات البرمجية الضارة على جهازك. لتفعيل نموذج يحتوي على شفرة برمجية مخصصة، عيّن `trust_remote_code=True`: + +```py +from transformers import AutoModelForImageClassification + +model = AutoModelForImageClassification.from_pretrained("sgugger/custom-resnet50d", trust_remote_code=True) +``` + +يُنصح بشدة بتحديد رقم إصدار (commit hash) كـ `revision` للتأكد من عدم تعديل مؤلف النموذج للشفرة لاحقًا بإضافة أسطر ضارة (إلا إذا كنت تثق تمامًا بمؤلفي النموذج): + +```py +commit_hash = "ed94a7c6247d8aedce4647f00f20de6875b5b292" +model = AutoModelForImageClassification.from_pretrained( + "sgugger/custom-resnet50d", trust_remote_code=True, revision=commit_hash +) +``` + +لاحظ وجود زرّ لنسخ رقم إصدار بسهولة عند تصفح سجل التزامات مستودع النموذج على منصة Hugging Face. 
diff --git a/docs/source/ar/gguf.md b/docs/source/ar/gguf.md new file mode 100644 index 000000000000..cdb20c5640a6 --- /dev/null +++ b/docs/source/ar/gguf.md @@ -0,0 +1,89 @@ +# GGUF وتفاعلها مع المحولات + +تُستخدم صيغة ملف GGUF لتخزين النماذج للاستدلال باستخدام [GGML](https://github.com/ggerganov/ggml) والمكتبات الأخرى التي تعتمد عليه، مثل [llama.cpp](https://github.com/ggerganov/llama.cpp) أو [whisper.cpp](https://github.com/ggerganov/whisper.cpp) الشهيرة جدًا. + +إنها صيغة ملف [مدعومة من قبل Hugging Face Hub](https://huggingface.co/docs/hub/en/gguf) مع ميزات تسمح بالفحص السريع للموترات والبيانات الوصفية داخل الملف. + +تم تصميم تنسيق الملف هذا كـ "تنسيق ملف واحد" حيث يحتوي ملف واحد عادةً على كل من سمات التكوين ومفردات المجزىء اللغوي والخصائص الأخرى، بالإضافة إلى جميع الموترات التي سيتم تحميلها في النموذج. تأتي هذه الملفات بتنسيقات مختلفة وفقًا لنوع التكميم في الملف. نلقي نظرة موجزة على بعضها [هنا](https://huggingface.co/docs/hub/en/gguf#quantization-types). + +## الدعم داخل المحولات + +أضفنا القدرة على تحميل ملفات `gguf` داخل `المحولات` لتوفير قدرات تدريب/ضبط إضافية لنماذج gguf، قبل إعادة تحويل تلك النماذج إلى `gguf` لاستخدامها داخل نظام `ggml`. عند تحميل نموذج، نقوم أولاً بإلغاء تكميمه إلى fp32، قبل تحميل الأوزان لاستخدامها في PyTorch. + +> [!NOTE] +> لا يزال الدعم تجريبيًا للغاية ونرحب بالمساهمات من أجل ترسيخه عبر أنواع التكميم وبنى النماذج. + +فيما يلي، بنيات النماذج وأنواع التكميم المدعومة: + +### أنواع التكميم المدعومة + +تُحدد أنواع التكميم المدعومة مبدئيًا وفقًا لملفات التكميم الشائعة التي تمت مشاركتها على Hub. + +- F32 +- F16 +- BF16 +- Q4_0 +- Q4_1 +- Q5_0 +- Q5_1 +- Q8_0 +- Q2_K +- Q3_K +- Q4_K +- Q5_K +- Q6_K +- IQ1_S +- IQ1_M +- IQ2_XXS +- IQ2_XS +- IQ2_S +- IQ3_XXS +- IQ3_S +- IQ4_XS +- IQ4_NL + +> [!NOTE] +> لدعم إلغاء تكميم gguf، يلزم تثبيت `gguf>=0.10.0`. 
+ +### بنيات النماذج المدعومة + +في الوقت الحالي، بنيات النماذج المدعومة هي البنيات التي كانت شائعة جدًا على Hub، وهي: + +- LLaMa +- Mistral +- Qwen2 +- Qwen2Moe +- Phi3 +- Bloom +- Falcon +- StableLM +- GPT2 +- Starcoder2 +- T5 + +## مثال الاستخدام + +لتحميل ملفات `gguf` في `transformers`، يجب تحديد معامل `gguf_file` فى دالة `from_pretrained` لكل من المُجزّئ اللغوية والنموذج. فيما يلي كيفية تحميل المُجزّئ اللغوي ونموذج، يمكن تحميلهما من نفس الملف: + +```py +from transformers import AutoTokenizer, AutoModelForCausalLM + +model_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF" +filename = "tinyllama-1.1b-chat-v1.0.Q6_K.gguf" + +tokenizer = AutoTokenizer.from_pretrained(model_id, gguf_file=filename) +model = AutoModelForCausalLM.from_pretrained(model_id, gguf_file=filename) +``` + +الآن لديك إمكانية الوصول إلى النسخة الكامل غير المكممة للنموذج في بيئة PyTorch، حيث يمكنك دمجه مع مجموعة كبيرة من الأدوات الأخرى. + +لإعادة التحويل إلى ملف `gguf`، نوصي باستخدام ملف [`convert-hf-to-gguf.py`](https://github.com/ggerganov/llama.cpp/blob/master/convert-hf-to-gguf.py) من llama.cpp. + +فيما يلي كيفية إكمال البرنامج النصي أعلاه لحفظ النموذج وإعادة تصديره مرة أخرى إلى `gguf`: + +```py +tokenizer.save_pretrained('directory') +model.save_pretrained('directory') + +!python ${path_to_llama_cpp}/convert-hf-to-gguf.py ${directory} +``` diff --git a/docs/source/ar/multilingual.md b/docs/source/ar/multilingual.md new file mode 100644 index 000000000000..b4b2a94fd40a --- /dev/null +++ b/docs/source/ar/multilingual.md @@ -0,0 +1,160 @@ +# النماذج متعددة اللغات للاستدلال + +هناك العديد من النماذج متعددة اللغات في مكتبة 🤗 Transformers، وتختلف طريقة استخدامها للاستدلال عن النماذج أحادية اللغة. ولكن ليس كل استخدام النماذج متعددة اللغات مختلف. فبعض النماذج، مثل [google-bert/bert-base-multilingual-uncased](https://huggingface.co/google-bert/bert-base-multilingual-uncased)، يمكن استخدامها تمامًا مثل النموذج أحادي اللغة. 
سيوضح لك هذا الدليل كيفية استخدام النماذج متعددة اللغات التي تختلف طريقة استخدامها للاستدلال. + +## XLM + +يحتوي XLM على عشر نسخ مختلفة، واحدة منها فقط أحادية اللغة. ويمكن تقسيم نسخ النماذج التسع المتبقية إلى فئتين: نسخ التي تستخدم تضمينات اللغة (language embeddings) وتلك التي لا تستخدمها. + +### XLM مع تضمينات اللغة + +تستخدم النماذج التالية من XLM تضمينات اللغة لتحديد اللغة المستخدمة أثناء الاستدلال: + +- `FacebookAI/xlm-mlm-ende-1024` (نمذجة اللغة المقنعة، الإنجليزية-الألمانية) +- `FacebookAI/xlm-mlm-enfr-1024` (نمذجة اللغة المقنعة، الإنجليزية-الفرنسية) +- `FacebookAI/xlm-mlm-enro-1024` (نمذجة اللغة المقنعة، الإنجليزية-الرومانية) +- `FacebookAI/xlm-mlm-xnli15-1024` (نمذجة اللغة المقنعة، لغات XNLI) +- `FacebookAI/xlm-mlm-tlm-xnli15-1024` (نمذجة اللغة المقنعة + الترجمة، لغات XNLI) +- `FacebookAI/xlm-clm-enfr-1024` (نمذجة اللغة السببية، الإنجليزية-الفرنسية) +- `FacebookAI/xlm-clm-ende-1024` (نمذجة اللغة السببية، الإنجليزية-الألمانية) + +تُمثل تضمينات اللغة على شكل مصفوفة بنفس شكل `input_ids` التي يتم تمريره إلى النموذج. وتعتمد القيم في هذه المصفوفات على اللغة المستخدمة ويتم تحديدها بواسطة معاملى المجزىء `lang2id` و `id2lang`. + +في هذا المثال، قم بتحميل نسخة `FacebookAI/xlm-clm-enfr-1024` ( نمذجة اللغة السببية، الإنجليزية-الفرنسية): + +```py +>>> import torch +>>> from transformers import XLMTokenizer, XLMWithLMHeadModel + +>>> tokenizer = XLMTokenizer.from_pretrained("FacebookAI/xlm-clm-enfr-1024") +>>> model = XLMWithLMHeadModel.from_pretrained("FacebookAI/xlm-clm-enfr-1024") +``` + +تُظهر خاصية `lang2id` في المجزىء اللغات وأرقام تعريفها في هذا النموذج: + +```py +>>> print(tokenizer.lang2id) +{'en': 0, 'fr': 1} +``` + +بعد ذلك، قم بإنشاء مثال على المدخلات: + +```py +>>> input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1 +``` + +قم بتعيين معرف اللغة إلى `"en"` واستخدمه لتحديد تضمين اللغة. وتضمين اللغة عبارة عن مصفوفة مملوءة بـ `0` لأن هذا هو معرف اللغة الإنجليزية. يجب أن تكون هذه المصفوفة بنفس حجم `input_ids`. 
+ +```py +>>> language_id = tokenizer.lang2id["en"] # 0 +>>> langs = torch.tensor([language_id] * input_ids.shape[1]) # torch.tensor([0, 0, 0, ..., 0]) + +>>> # نقوم بإعادة تشكيلها لتكون بالحجم (batch_size، sequence_length) +>>> langs = langs.view(1, -1) # الآن بالحجم [1، sequence_length] (لدينا batch size تساوي 1) +``` + +الآن يمكنك تمرير `input_ids` وتضمين اللغة إلى النموذج: + +```py +>>> outputs = model(input_ids, langs=langs) +``` + +يمكن لنص البرنامج النصي [run_generation.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-generation/run_generation.py) توليد النص باستخدام تضمينات اللغة مع نقاط تفتيش `xlm-clm`. + +### XLM بدون تضمينات اللغة + +النماذج التالية من XLM لا تتطلب تضمينات اللغة أثناء الاستنتاج: + +- `FacebookAI/xlm-mlm-17-1280` (نمذجة اللغة المقنعة، 17 لغة) +- `FacebookAI/xlm-mlm-100-1280` (نمذجة اللغة المقنعة، 100 لغة) + +تُستخدم هذه النماذج لتمثيل الجمل العامة، على عكس نسخ XLM السابقة. + +## BERT + +يمكن استخدام النماذج التالية من BERT للمهام متعددة اللغات: + +- `google-bert/bert-base-multilingual-uncased` (نمذجة اللغة المقنعة + التنبؤ بالجملة التالية، 102 لغة) +- `google-bert/bert-base-multilingual-cased` (نمذجة اللغة المقنعة + التنبؤ بالجملة التالية، 104 لغات) + +لا تتطلب هذه النماذج تضمينات اللغة أثناء الاستدلال. يجب أن تُحدّد اللغة من السياق وتستنتج وفقاً لذلك. + +## XLM-RoBERTa + +يمكن استخدام النماذج التالية من XLM-RoBERTa للمهام متعددة اللغات: + +- `FacebookAI/xlm-roberta-base` (نمذجة اللغة المقنعة، 100 لغة) +- `FacebookAI/xlm-roberta-large` (نمذجة اللغة المقنعة، 100 لغة) + +تم تدريب XLM-RoBERTa على 2.5 تيرابايت من بيانات CommonCrawl الجديدة والمحسنة في 100 لغة. ويوفر مكاسب قوية على النماذج متعددة اللغات التي تم إصدارها سابقاً مثل mBERT أو XLM في مهام المصب مثل التصنيف، ووضع العلامات التسلسلية، والأسئلة والأجوبة. 
+ +## M2M100 + +يمكن استخدام النماذج التالية من M2M100 للترجمة متعددة اللغات: + +- `facebook/m2m100_418M` (الترجمة) +- `facebook/m2m100_1.2B` (الترجمة) + +في هذا المثال، قم بتحميل نسخة `facebook/m2m100_418M` لترجمة النص من الصينية إلى الإنجليزية. يمكنك تعيين اللغة المصدر في المجزىء اللغوي: + +```py +>>> from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer + +>>> en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger." +>>> chinese_text = "不要插手巫師的事務, 因為他們是微妙的, 很快就會發怒." + +>>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="zh") +>>> model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M") +``` + +تقسيم النّص إلى رموز: + +```py +>>> encoded_zh = tokenizer(chinese_text, return_tensors="pt") +``` + +يجبر M2M100 معرف اللغة الهدف كأول رمز مولد للترجمة إلى اللغة الهدف. قم بتعيين `forced_bos_token_id` إلى `en` في طريقة `generate` للترجمة إلى الإنجليزية: + +```py +>>> generated_tokens = model.generate(**encoded_zh, forced_bos_token_id=tokenizer.get_lang_id("en")) +>>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) +'Do not interfere with the matters of the witches, because they are delicate and will soon be angry.' +``` + +## MBart + +يمكن استخدام النماذج التالية من MBart للترجمة متعددة اللغات: + +- `facebook/mbart-large-50-one-to-many-mmt` (الترجمة الآلية متعددة اللغات من واحد إلى كثير، 50 لغة) +- `facebook/mbart-large-50-many-to-many-mmt` (الترجمة الآلية متعددة اللغات من كثير إلى كثير، 50 لغة) +- `facebook/mbart-large-50-many-to-one-mmt` (الترجمة الآلية متعددة اللغات من كثير إلى واحد، 50 لغة) +- `facebook/mbart-large-50` (الترجمة متعددة اللغات، 50 لغة) +- `facebook/mbart-large-cc25` + +في هذا المثال، قم بتحميل نسخة `facebook/mbart-large-50-many-to-many-mmt` لترجمة النص من الفنلندية إلى الإنجليزية. 
يمكنك تعيين اللغة المصدر في المجزىء: + +```py +>>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM + +>>> en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger." +>>> fi_text = "Älä sekaannu velhojen asioihin, sillä ne ovat hienovaraisia ja nopeasti vihaisia." + +>>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt", src_lang="fi_FI") +>>> model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50-many-to-many-mmt") +``` + +تقسيم النّص إلى رموز: + +```py +>>> encoded_en = tokenizer(en_text, return_tensors="pt") +``` + +يجبر MBart معرف لغة الهدف كأول رمز مولد للترجمة إلى اللغة الهدف. قم بتعيين `forced_bos_token_id` إلى `en` في طريقة `generate` للترجمة إلى الإنجليزية: + +```py +>>> generated_tokens = model.generate(**encoded_en, forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"]) +>>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) +"Don't interfere with the wizard's affairs, because they are subtle, will soon get angry." +``` + +إذا كنت تستخدم نسخة `facebook/mbart-large-50-many-to-one-mmt`، فلا تحتاج إلى إجبار معرف لغة الهدف كأول رمز مولد، وإلا فإن الاستخدام هو نفسه. \ No newline at end of file diff --git a/docs/source/ar/tflite.md b/docs/source/ar/tflite.md new file mode 100644 index 000000000000..5e75c7a10a3c --- /dev/null +++ b/docs/source/ar/tflite.md @@ -0,0 +1,40 @@ +# التصدير إلى TFLite + +[TensorFlow Lite](https://www.tensorflow.org/lite/guide) هو إطار عمل خفيف الوزن لنشر نماذج التعلم الآلي على الأجهزة المحدودة الموارد، مثل الهواتف المحمولة، والأنظمة المدمجة، وأجهزة إنترنت الأشياء (IoT). تم تصميم TFLite لتشغيل النماذج وتحسينها بكفاءة على هذه الأجهزة ذات الطاقة الحاسوبية والذاكرة واستهلاك الطاقة المحدودة. + +يُمثَّل نموذج TensorFlow Lite بتنسيق محمول فعال خاص يُعرَّف بامتداد الملف `.tflite`. + +🤗 Optimum يقدم وظيفة لتصدير نماذج 🤗 Transformers إلى TFLite من خلال الوحدة النمطية `exporters.tflite`. 
بالنسبة لقائمة هندسات النماذج المدعومة، يرجى الرجوع إلى [وثائق 🤗 Optimum](https://huggingface.co/docs/optimum/exporters/tflite/overview). + +لتصدير نموذج إلى TFLite، قم بتثبيت متطلبات البرنامج المطلوبة: + +```bash +pip install optimum[exporters-tf] +``` + +للاطلاع على جميع المعاملات المتاحة، راجع [وثائق 🤗 Optimum](https://huggingface.co/docs/optimum/main/en/exporters/tflite/usage_guides/export_a_model)، أو عرض المساعدة في سطر الأوامر: + +```bash +optimum-cli export tflite --help +``` + +لتصدير نسخة النموذج ل 🤗 Hub، على سبيل المثال، `google-bert/bert-base-uncased`، قم بتشغيل الأمر التالي: + +```bash +optimum-cli export tflite --model google-bert/bert-base-uncased --sequence_length 128 bert_tflite/ +``` + +ستظهر لك السجلات التي تُبيّن التقدم وموقع حفظ ملف `model.tflite` الناتج، كما في المثال التالي: + +```bash +Validating TFLite model... + -[✓] TFLite model output names match reference model (logits) + - Validating TFLite Model output "logits": + -[✓] (1, 128, 30522) matches (1, 128, 30522) + -[x] values not close enough, max diff: 5.817413330078125e-05 (atol: 1e-05) +The TensorFlow Lite export succeeded with the warning: The maximum absolute difference between the output of the reference model and the TFLite exported model is not within the set tolerance 1e-05: +- logits: max diff = 5.817413330078125e-05. + The exported model was saved at: bert_tflite +``` + +يُبيّن المثال أعلاه كيفية تصدير نسخة من النموذج ل 🤗 Hub. عند تصدير نموذج محلي، تأكد أولاً من حفظ ملفات أوزان النموذج والمجزئ اللغوي في نفس المسار (`local_path`). عند استخدام CLI، قم بتمرير `local_path` إلى معامل `model` بدلاً من اسم النسخة على 🤗 Hub. 
\ No newline at end of file diff --git a/docs/source/en/gguf.md b/docs/source/en/gguf.md index 20531b990bc3..2da721b28986 100644 --- a/docs/source/en/gguf.md +++ b/docs/source/en/gguf.md @@ -86,6 +86,7 @@ For now the supported model architectures are the architectures that have been v - GPT2 - Starcoder2 - T5 +- Mamba ## Example usage diff --git a/docs/source/en/main_classes/image_processor.md b/docs/source/en/main_classes/image_processor.md index 59a78e68214d..320916f1ce94 100644 --- a/docs/source/en/main_classes/image_processor.md +++ b/docs/source/en/main_classes/image_processor.md @@ -18,6 +18,49 @@ rendered properly in your Markdown viewer. An image processor is in charge of preparing input features for vision models and post processing their outputs. This includes transformations such as resizing, normalization, and conversion to PyTorch, TensorFlow, Flax and Numpy tensors. It may also include model specific post-processing such as converting logits to segmentation masks. +Fast image processors are available for a few models and more will be added in the future. They are based on the [torchvision](https://pytorch.org/vision/stable/index.html) library and provide a significant speed-up, especially when processing on GPU. +They have the same API as the base image processors and can be used as drop-in replacements. +To use a fast image processor, you need to install the `torchvision` library, and set the `use_fast` argument to `True` when instantiating the image processor: + +```python +from transformers import AutoImageProcessor + +processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50", use_fast=True) +``` + +When using a fast image processor, you can also set the `device` argument to specify the device on which the processing should be done. By default, the processing is done on the same device as the inputs if the inputs are tensors, or on the CPU otherwise. 
+ +```python +from torchvision.io import read_image +from transformers import DetrImageProcessorFast + +images = read_image("image.jpg") +processor = DetrImageProcessorFast.from_pretrained("facebook/detr-resnet-50") +images_processed = processor(images, return_tensors="pt", device="cuda") +``` + +Here are some speed comparisons between the base and fast image processors for the `DETR` and `RT-DETR` models, and how they impact overall inference time: + +
+
+ +
+
+ +
+
+ +
+
+ +
+
+ +
+
+These benchmarks were run on an [AWS EC2 g5.2xlarge instance](https://aws.amazon.com/ec2/instance-types/g5/), utilizing an NVIDIA A10G Tensor Core GPU. + ## ImageProcessingMixin diff --git a/docs/source/en/main_classes/pipelines.md b/docs/source/en/main_classes/pipelines.md index d5d132aaaba5..59e474fcc49f 100644 --- a/docs/source/en/main_classes/pipelines.md +++ b/docs/source/en/main_classes/pipelines.md @@ -478,6 +478,12 @@ Pipelines available for multimodal tasks include the following. - __call__ - all +### ImageTextToTextPipeline + +[[autodoc]] ImageTextToTextPipeline + - __call__ + - all + ### MaskGenerationPipeline [[autodoc]] MaskGenerationPipeline diff --git a/docs/source/en/main_classes/tokenizer.md b/docs/source/en/main_classes/tokenizer.md index 2ad7e450404e..83d2ae5df6a7 100644 --- a/docs/source/en/main_classes/tokenizer.md +++ b/docs/source/en/main_classes/tokenizer.md @@ -51,6 +51,25 @@ token space (e.g., getting the index of the token comprising a given character o to a given token). +# Multimodal Tokenizer + +Apart from that, each tokenizer can be a "multimodal" tokenizer, which means that the tokenizer will hold all relevant special tokens +as part of tokenizer attributes for easier access. For example, if the tokenizer is loaded from a vision-language model like LLaVA, you will +be able to access `tokenizer.image_token_id` to obtain the special image token used as a placeholder. + +To enable extra special tokens for any type of tokenizer, you have to add the following lines and save the tokenizer. Extra special tokens do not +have to be modality related and can be anything that the model often needs access to. In the below code, tokenizer at `output_dir` will have direct access +to three more special tokens. 
+ +```python +vision_tokenizer = AutoTokenizer.from_pretrained( + "llava-hf/llava-1.5-7b-hf", + extra_special_tokens={"image_token": "<image>", "boi_token": "<image_start>", "eoi_token": "<image_end>"} +) +print(vision_tokenizer.image_token, vision_tokenizer.image_token_id) +("<image>", 32000) +``` + ## PreTrainedTokenizer [[autodoc]] PreTrainedTokenizer diff --git a/docs/source/en/model_doc/mllama.md b/docs/source/en/model_doc/mllama.md index 9cb038ed2e34..4a6080ea2ce0 100644 --- a/docs/source/en/model_doc/mllama.md +++ b/docs/source/en/model_doc/mllama.md @@ -30,6 +30,25 @@ The Llama 3.2-Vision collection of multimodal large language models (LLMs) is a - The text passed to the processor should have the `"<|image|>"` tokens where the images should be inserted. - The processor has its own `apply_chat_template` method to convert chat messages to text that can then be passed as text to the processor. + + + +Mllama has an extra token used as a placeholder for image positions in the text. It means that input ids and an input embedding layer will have an extra token. But since the weights for input and output embeddings are not tied, the `lm_head` layer has one less token and will fail if you want to calculate loss on image tokens or apply some logit processors. In case you are training, make sure to mask out special `"<|image|>"` tokens in the `labels` as the model should not be trained on predicting them. + +Otherwise, if you see CUDA-side index errors when generating, use the below code to expand the `lm_head` by one more token. 
+ + +```python +old_embeddings = model.get_output_embeddings() + +num_tokens = model.vocab_size + 1 +resized_embeddings = model._get_resized_lm_head(old_embeddings, new_num_tokens=num_tokens, mean_resizing=True) +resized_embeddings.requires_grad_(old_embeddings.weight.requires_grad) +model.set_output_embeddings(resized_embeddings) +``` + + + ## Usage Example #### Instruct model diff --git a/docs/source/en/model_doc/rt_detr.md b/docs/source/en/model_doc/rt_detr.md index 5540266c6215..8ad220dc4bd1 100644 --- a/docs/source/en/model_doc/rt_detr.md +++ b/docs/source/en/model_doc/rt_detr.md @@ -46,7 +46,7 @@ Initially, an image is processed using a pre-trained convolutional neural networ >>> from PIL import Image >>> from transformers import RTDetrForObjectDetection, RTDetrImageProcessor ->>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg' +>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg' >>> image = Image.open(requests.get(url, stream=True).raw) >>> image_processor = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd") @@ -95,6 +95,12 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h - preprocess - post_process_object_detection +## RTDetrImageProcessorFast + +[[autodoc]] RTDetrImageProcessorFast + - preprocess + - post_process_object_detection + ## RTDetrModel [[autodoc]] RTDetrModel diff --git a/docs/source/en/model_doc/superpoint.md b/docs/source/en/model_doc/superpoint.md index b9aab2f1b929..59e451adceb8 100644 --- a/docs/source/en/model_doc/superpoint.md +++ b/docs/source/en/model_doc/superpoint.md @@ -86,24 +86,32 @@ model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/sup inputs = processor(images, return_tensors="pt") outputs = model(**inputs) - -for i in range(len(images)): - image_mask = outputs.mask[i] - image_indices = torch.nonzero(image_mask).squeeze() - image_keypoints = outputs.keypoints[i][image_indices] - image_scores = outputs.scores[i][image_indices] - 
image_descriptors = outputs.descriptors[i][image_indices] +image_sizes = [(image.height, image.width) for image in images] +outputs = processor.post_process_keypoint_detection(outputs, image_sizes) + +for output in outputs: + for keypoints, scores, descriptors in zip(output["keypoints"], output["scores"], output["descriptors"]): + print(f"Keypoints: {keypoints}") + print(f"Scores: {scores}") + print(f"Descriptors: {descriptors}") ``` -You can then print the keypoints on the image to visualize the result : +You can then print the keypoints on the image of your choice to visualize the result: ```python -import cv2 -for keypoint, score in zip(image_keypoints, image_scores): - keypoint_x, keypoint_y = int(keypoint[0].item()), int(keypoint[1].item()) - color = tuple([score.item() * 255] * 3) - image = cv2.circle(image, (keypoint_x, keypoint_y), 2, color) -cv2.imwrite("output_image.png", image) +import matplotlib.pyplot as plt + +plt.axis("off") +plt.imshow(image_1) +plt.scatter( + outputs[0]["keypoints"][:, 0], + outputs[0]["keypoints"][:, 1], + c=outputs[0]["scores"] * 100, + s=outputs[0]["scores"] * 50, + alpha=0.8 +) +plt.savefig(f"output_image.png") ``` +![image/png](https://cdn-uploads.huggingface.co/production/uploads/632885ba1558dac67c440aa8/ZtFmphEhx8tcbEQqOolyE.png) This model was contributed by [stevenbucaille](https://huggingface.co/stevenbucaille). The original code can be found [here](https://github.com/magicleap/SuperPointPretrainedNetwork). @@ -123,6 +131,7 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] SuperPointImageProcessor - preprocess +- post_process_keypoint_detection ## SuperPointForKeypointDetection diff --git a/docs/source/en/perf_train_cpu.md b/docs/source/en/perf_train_cpu.md index 14a52792d1f7..7ef98932d537 100644 --- a/docs/source/en/perf_train_cpu.md +++ b/docs/source/en/perf_train_cpu.md @@ -18,11 +18,11 @@ rendered properly in your Markdown viewer. 
This guide focuses on training large models efficiently on CPU. ## Mixed precision with IPEX -Mixed precision uses single (fp32) and half-precision (bf16/fp16) data types in a model to accelerate training or inference while still preserving much of the single-precision accuracy. Modern CPUs such as 3rd and 4th Gen Intel® Xeon® Scalable processors natively support bf16, so you should get more performance out of the box by enabling mixed precision training with bf16. +Mixed precision uses single (fp32) and half-precision (bf16/fp16) data types in a model to accelerate training or inference while still preserving much of the single-precision accuracy. Modern CPUs such as 3rd, 4th, and 5th Gen Intel® Xeon® Scalable processors natively support bf16. 6th Gen Intel® Xeon® Scalable processors natively support bf16 and fp16. You should get more performance out of the box by enabling mixed precision training with bf16 or fp16. To further maximize training performance, you can use Intel® Extension for PyTorch (IPEX), which is a library built on PyTorch and adds additional CPU instruction level architecture (ISA) level support such as Intel® Advanced Vector Extensions 512 Vector Neural Network Instructions (Intel® AVX512-VNNI), and Intel® Advanced Matrix Extensions (Intel® AMX) for an extra performance boost on Intel CPUs. However, CPUs with only AVX2 (e.g., AMD or older Intel CPUs) are not guaranteed to have better performance under IPEX. -Auto Mixed Precision (AMP) for CPU backends has been enabled since PyTorch 1.10. AMP support for bf16 on CPUs and bf16 operator optimization is also supported in IPEX and partially upstreamed to the main PyTorch branch. You can get better performance and user experience with IPEX AMP. +Auto Mixed Precision (AMP) for CPU backends has been enabled since PyTorch 1.10. AMP support for bf16/fp16 on CPUs and bf16/fp16 operator optimization is also supported in IPEX and partially upstreamed to the main PyTorch branch. 
You can get better performance and user experience with IPEX AMP. Check more detailed information for [Auto Mixed Precision](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/features/amp.html). @@ -32,10 +32,10 @@ IPEX release is following PyTorch, to install via pip: | PyTorch Version | IPEX version | | :---------------: | :----------: | -| 2.1.x | 2.1.100+cpu | -| 2.0.x | 2.0.100+cpu | -| 1.13 | 1.13.0+cpu | -| 1.12 | 1.12.300+cpu | +| 2.5.0 | 2.5.0+cpu | +| 2.4.0 | 2.4.0+cpu | +| 2.3.0 | 2.3.0+cpu | +| 2.2.0 | 2.2.0+cpu | Please run `pip list | grep torch` to get your `pytorch_version`, so you can get the `IPEX version_name`. ```bash @@ -46,7 +46,7 @@ You can check the latest versions in [ipex-whl-stable-cpu](https://developer.int Check more approaches for [IPEX installation](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/installation.html). ### Usage in Trainer -To enable auto mixed precision with IPEX in Trainer, users should add `use_ipex`, `bf16` and `no_cuda` in training command arguments. +To enable auto mixed precision with IPEX in Trainer, users should add `use_ipex`, `bf16` or `fp16`, and `no_cuda` in training command arguments. 
Take an example of the use cases on [Transformers question-answering](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) diff --git a/docs/source/en/perf_train_cpu_many.md b/docs/source/en/perf_train_cpu_many.md index f528378bd1b8..ed782caca3b1 100644 --- a/docs/source/en/perf_train_cpu_many.md +++ b/docs/source/en/perf_train_cpu_many.md @@ -30,46 +30,32 @@ Check more detailed information for [oneccl_bind_pt](https://github.com/intel/to Wheel files are available for the following Python versions: -| Extension Version | Python 3.6 | Python 3.7 | Python 3.8 | Python 3.9 | Python 3.10 | -| :---------------: | :--------: | :--------: | :--------: | :--------: | :---------: | -| 2.1.0 | | √ | √ | √ | √ | -| 2.0.0 | | √ | √ | √ | √ | -| 1.13.0 | | √ | √ | √ | √ | -| 1.12.100 | | √ | √ | √ | √ | -| 1.12.0 | | √ | √ | √ | √ | +| Extension Version | Python 3.7 | Python 3.8 | Python 3.9 | Python 3.10 | Python 3.11 | +| :---------------: | :--------: | :--------: | :--------: | :---------: | :---------: | +| 2.5.0 | | √ | √ | √ | √ | +| 2.4.0 | | √ | √ | √ | √ | +| 2.3.0 | | √ | √ | √ | √ | +| 2.2.0 | | √ | √ | √ | √ | Please run `pip list | grep torch` to get your `pytorch_version`. ```bash pip install oneccl_bind_pt=={pytorch_version} -f https://developer.intel.com/ipex-whl-stable-cpu ``` -where `{pytorch_version}` should be your PyTorch version, for instance 2.1.0. +where `{pytorch_version}` should be your PyTorch version, for instance 2.4.0. Check more approaches for [oneccl_bind_pt installation](https://github.com/intel/torch-ccl). Versions of oneCCL and PyTorch must match. - - -oneccl_bindings_for_pytorch 1.12.0 prebuilt wheel does not work with PyTorch 1.12.1 (it is for PyTorch 1.12.0) -PyTorch 1.12.1 should work with oneccl_bindings_for_pytorch 1.12.100 - - ## Intel® MPI library Use this standards-based MPI implementation to deliver flexible, efficient, scalable cluster messaging on Intel® architecture. 
This component is part of the Intel® oneAPI HPC Toolkit. oneccl_bindings_for_pytorch is installed along with the MPI tool set. Need to source the environment before using it. -for Intel® oneCCL >= 1.12.0 ```bash oneccl_bindings_for_pytorch_path=$(python -c "from oneccl_bindings_for_pytorch import cwd; print(cwd)") source $oneccl_bindings_for_pytorch_path/env/setvars.sh ``` -for Intel® oneCCL whose version < 1.12.0 -```bash -torch_ccl_path=$(python -c "import torch; import torch_ccl; import os; print(os.path.abspath(os.path.dirname(torch_ccl.__file__)))") -source $torch_ccl_path/env/setvars.sh -``` - #### Intel® Extension for PyTorch installation Intel Extension for PyTorch (IPEX) provides performance optimizations for CPU training with both Float32 and BFloat16 (refer to the [single CPU section](./perf_train_cpu) to learn more). @@ -155,7 +141,7 @@ This example assumes that you have: The snippet below is an example of a Dockerfile that uses a base image that supports distributed CPU training and then extracts a Transformers release to the `/workspace` directory, so that the example scripts are included in the image: ```dockerfile -FROM intel/intel-optimized-pytorch:2.3.0-pip-multinode +FROM intel/intel-optimized-pytorch:2.4.0-pip-multinode RUN apt-get update -y && \ apt-get install -y --no-install-recommends --fix-missing \ @@ -165,7 +151,7 @@ RUN apt-get update -y && \ WORKDIR /workspace # Download and extract the transformers code -ARG HF_TRANSFORMERS_VER="4.44.0" +ARG HF_TRANSFORMERS_VER="4.46.0" RUN pip install --no-cache-dir \ transformers==${HF_TRANSFORMERS_VER} && \ mkdir transformers && \ @@ -319,4 +305,4 @@ with the job, the PyTorchJob resource can be deleted from the cluster using `kub This guide covered running distributed PyTorch training jobs using multiple CPUs on bare metal and on a Kubernetes cluster. 
Both cases utilize Intel Extension for PyTorch and Intel oneCCL Bindings for PyTorch for optimal training -performance, and can be used as a template to run your own workload on multiple nodes. \ No newline at end of file +performance, and can be used as a template to run your own workload on multiple nodes. diff --git a/docs/source/en/perplexity.md b/docs/source/en/perplexity.md index 7555619fe488..ac7ef8504e72 100644 --- a/docs/source/en/perplexity.md +++ b/docs/source/en/perplexity.md @@ -107,7 +107,8 @@ max_length = model.config.n_positions stride = 512 seq_len = encodings.input_ids.size(1) -nlls = [] +nll_sum = 0.0 +n_tokens = 0 prev_end_loc = 0 for begin_loc in tqdm(range(0, seq_len, stride)): end_loc = min(begin_loc + max_length, seq_len) @@ -124,13 +125,19 @@ for begin_loc in tqdm(range(0, seq_len, stride)): # to the left by 1. neg_log_likelihood = outputs.loss - nlls.append(neg_log_likelihood) + # Accumulate the total negative log-likelihood and the total number of tokens + num_valid_tokens = (target_ids != -100).sum().item() # number of valid tokens in target_ids + batch_size = target_ids.size(0) + num_loss_tokens = num_valid_tokens - batch_size # subtract batch_size due to internal label shift + nll_sum += neg_log_likelihood * num_loss_tokens + n_tokens += num_loss_tokens prev_end_loc = end_loc if end_loc == seq_len: break -ppl = torch.exp(torch.stack(nlls).mean()) +avg_nll = nll_sum / n_tokens # average negative log-likelihood per token +ppl = torch.exp(avg_nll) ``` Running this with the stride length equal to the max input length is equivalent to the suboptimal, non-sliding-window @@ -139,5 +146,5 @@ and the better the reported perplexity will typically be. When we run the above with `stride = 1024`, i.e. no overlap, the resulting PPL is `19.44`, which is about the same as the `19.93` reported in the GPT-2 paper. By using `stride = 512` and thereby employing our striding window -strategy, this jumps down to `16.45`. 
This is not only a more favorable score, but is calculated in a way that is +strategy, this jumps down to `16.44`. This is not only a more favorable score, but is calculated in a way that is closer to the true autoregressive decomposition of a sequence likelihood. diff --git a/docs/source/en/trainer.md b/docs/source/en/trainer.md index f9ea33376994..7bee34728927 100644 --- a/docs/source/en/trainer.md +++ b/docs/source/en/trainer.md @@ -252,7 +252,70 @@ trainer = Trainer(..., args=training_args) NEFTune is disabled after training to restore the original embedding layer to avoid any unexpected behavior. -## GaLore +## Liger Kernel + +[Liger-Kernel](https://github.com/linkedin/Liger-Kernel) is a collection of Triton kernels developed by LinkedIn designed specifically for LLM training. We have implemented Hugging Face Compatible RMSNorm, RoPE, SwiGLU, CrossEntropy, FusedLinearCrossEntropy, and more to come. It can effectively increase multi-GPU training throughput by 20% and reduce memory usage by 60%. The kernel works out of the box with flash attention, PyTorch FSDP, and Microsoft DeepSpeed. + + +Gain +20% throughput and reduce memory usage by 60% on LLaMA 3-8B model training. Achieve longer context lengths and larger batch sizes. It’s also useful if you want to scale up your model to multi-head training or large vocabulary sizes. Unleash multi-head training (medusa) and more. 
See details and examples in [Liger](https://github.com/linkedin/Liger-Kernel/tree/main/examples) + + +First make sure to install Liger official repository: +```bash +pip install liger-kernel +``` + +You should pass `use_liger_kernel=True` to apply liger kernel on your model, for example: + +```py +from transformers import TrainingArguments + +training_args = TrainingArguments( + output_dir="your-model", + learning_rate=2e-5, + per_device_train_batch_size=16, + per_device_eval_batch_size=16, + num_train_epochs=2, + weight_decay=0.01, + eval_strategy="epoch", + save_strategy="epoch", + load_best_model_at_end=True, + push_to_hub=True, + use_liger_kernel=True +) +``` + +The kernel supports the Llama, Gemma, Mistral, and Mixtral model architectures. The most up-to-date list of supported models can be found [here](https://github.com/linkedin/Liger-Kernel). When `use_liger_kernel` is set to `True`, the corresponding layers in the original model will be patched with Liger's efficient implementation, so you don't need to do anything extra other than setting the argument value. + + +## Optimizers + +You can choose a built-in optimizer for training using: + +```python +from transformers import TrainingArguments +training_args = TrainingArguments(..., optim="adamw_torch") +``` + +See [`OptimizerNames`](https://github.com/huggingface/transformers/blob/main/src/transformers/training_args.py) for a full list of choices. We include advanced examples in the sections below. 
+ +You can also use an arbitrary PyTorch optimizer via: + +```python +import torch + +optimizer_cls = torch.optim.AdamW +optimizer_kwargs = { + "lr": 4e-3, + "betas": (0.9, 0.999), + "weight_decay": 0.05, +} + +from transformers import Trainer +trainer = Trainer(..., optimizer_cls_and_kwargs=(optimizer_cls, optimizer_kwargs)) +``` + +### GaLore Gradient Low-Rank Projection (GaLore) is a memory-efficient low-rank training strategy that allows full-parameter learning but is more memory-efficient than common low-rank adaptation methods, such as LoRA. @@ -382,42 +445,7 @@ trainer.train() Note layerwise optimization is a bit experimental and does not support DDP (Distributed Data Parallel), thus you can run the training script only on a single GPU. Please see [this appropriate section](https://github.com/jiaweizzhao/GaLore?tab=readme-ov-file#train-7b-model-with-a-single-gpu-with-24gb-memory) for more details. Other features such as gradient clipping, DeepSpeed, etc might not be supported out of the box. Please [raise an issue on GitHub](https://github.com/huggingface/transformers/issues) if you encounter such issue. -## Liger Kernel - -[Liger-Kernel](https://github.com/linkedin/Liger-Kernel) Kernel is a collection of Triton kernels developed by Linkedin designed specifically for LLM training. We have implemented Hugging Face Compatible RMSNorm, RoPE, SwiGLU, CrossEntropy, FusedLinearCrossEntropy, and more to come. It can effectively increase multi-GPU training throughput by 20% and reduces memory usage by 60%. The kernel works out of the box with flash attention, PyTorch FSDP, and Microsoft DeepSpeed. - - -Gain +20% throughput and reduce memory usage by 60% on LLaMA 3-8B model training. Achieve longer context lengths and larger batch sizes. It’s also useful if you want to scale up your model to multi-head training or large vocabulary sizes. Unleash multi-head training (medusa) and more. 
See details and examples in [Liger](https://github.com/linkedin/Liger-Kernel/tree/main/examples) - - -First make sure to install Liger official repository: -```bash -pip install liger-kernel -``` - -You should pass `use_liger_kernel=True` to apply liger kernel on your model, for example: - -```py -from transformers import TrainingArguments - -training_args = TrainingArguments( - output_dir="your-model", - learning_rate=2e-5, - per_device_train_batch_size=16, - per_device_eval_batch_size=16, - num_train_epochs=2, - weight_decay=0.01, - eval_strategy="epoch", - save_strategy="epoch", - load_best_model_at_end=True, - push_to_hub=True, - use_liger_kernel=True -) -``` - -The kernel supports the Llama, Gemma, Mistral, and Mixtral model architectures. The most up-to-date list of supported models can be found [here](https://github.com/linkedin/Liger-Kernel). When `use_liger_kernel` is set to `True`, the corresponding layers in the original model will be patched with Liger's efficient implementation, so you don't need to do anything extra other than setting the argument value. - -## LOMO optimizer +### LOMO optimizer The LOMO optimizers have been introduced in [Full Parameter Fine-Tuning for Large Language Models with Limited Resources](https://hf.co/papers/2306.09782) and [AdaLomo: Low-memory Optimization with Adaptive Learning Rate](https://hf.co/papers/2310.10195). They both consist of an efficient full-parameter fine-tuning method. These optimizers fuse the gradient computation and the parameter update in one step to reduce memory usage. Supported optimizers for LOMO are `"lomo"` and `"adalomo"`. First either install LOMO from pypi `pip install lomo-optim` or install it from source with `pip install git+https://github.com/OpenLMLab/LOMO.git`. 
@@ -467,7 +495,7 @@ trainer = trl.SFTTrainer( trainer.train() ``` -## GrokAdamW optimizer +### GrokAdamW optimizer The GrokAdamW optimizer is designed to enhance training performance and stability, particularly for models that benefit from grokking signal functions. To use GrokAdamW, first install the optimizer package with `pip install grokadamw`. @@ -518,7 +546,7 @@ trainer.train() This script demonstrates how to fine-tune the `google/gemma-2b` model on the IMDB dataset using the GrokAdamW optimizer. The `TrainingArguments` are configured to use GrokAdamW, and the dataset is passed to the `Trainer` for training. -## Schedule Free Optimizer +### Schedule Free Optimizer The Schedule Free optimizers have been introduced in [The Road Less Scheduled](https://hf.co/papers/2405.15682). Schedule-Free learning replaces the momentum of the base optimizer with a combination of averaging and interpolation, to completely remove the need to anneal the learning rate with a traditional schedule. diff --git a/docs/source/hi/_toctree.yml b/docs/source/hi/_toctree.yml index 546a8663cc4d..72759457a5c8 100644 --- a/docs/source/hi/_toctree.yml +++ b/docs/source/hi/_toctree.yml @@ -1,3 +1,7 @@ - sections: - local: pipeline_tutorial - title: पाइपलाइनों के साथ अनुमान चलाएँ \ No newline at end of file + title: पाइपलाइनों के साथ अनुमान चलाएँ + - local: accelerate + title: 🤗 Accelerate के साथ वितरित प्रशिक्षण सेट करें + - local: tflite + title: TFLite में निर्यात करें \ No newline at end of file diff --git a/docs/source/hi/accelerate.md b/docs/source/hi/accelerate.md new file mode 100644 index 000000000000..3d568217a129 --- /dev/null +++ b/docs/source/hi/accelerate.md @@ -0,0 +1,136 @@ + + +# वितरित प्रशिक्षण के साथ 🤗 Accelerate + +जैसे-जैसे मॉडल बड़े होते हैं, समानांतरता सीमित हार्डवेयर पर बड़े मॉडल को प्रशिक्षित करने और प्रशिक्षण की गति को कई आदेशों के आकार में तेज करने के लिए एक रणनीति के रूप में उभरी है। हगिंग फेस में, हमने उपयोगकर्ताओं को किसी भी प्रकार के वितरित सेटअप पर 🤗 
ट्रांसफार्मर्स मॉडल को आसानी से प्रशिक्षित करने में मदद करने के लिए [🤗 Accelerate](https://huggingface.co/docs/accelerate) पुस्तकालय बनाया है, चाहे वह एक मशीन पर कई GPU हों या कई मशीनों में कई GPU। इस ट्यूटोरियल में, जानें कि अपने मूल PyTorch प्रशिक्षण लूप को कैसे अनुकूलित किया जाए ताकि वितरित वातावरण में प्रशिक्षण सक्षम हो सके। + +## सेटअप + +🤗 Accelerate स्थापित करके शुरू करें: + +```bash +pip install accelerate +``` + +फिर एक [`~accelerate.Accelerator`] ऑब्जेक्ट आयात करें और बनाएं। [`~accelerate.Accelerator`] स्वचालित रूप से आपके वितरित सेटअप के प्रकार का पता लगाएगा और प्रशिक्षण के लिए सभी आवश्यक घटकों को प्रारंभ करेगा। आपको अपने मॉडल को किसी डिवाइस पर स्पष्ट रूप से रखने की आवश्यकता नहीं है। + +```py +>>> from accelerate import Accelerator + +>>> accelerator = Accelerator() +``` + +## तेजी लाने की तैयारी + +अगला कदम सभी प्रासंगिक प्रशिक्षण वस्तुओं को [`~accelerate.Accelerator.prepare`] विधि में पास करना है। इसमें आपके प्रशिक्षण और मूल्यांकन DataLoaders, एक मॉडल और एक ऑप्टिमाइज़र शामिल हैं: + +```py +>>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( +... train_dataloader, eval_dataloader, model, optimizer +... ) +``` + +## बैकवर्ड + +अंतिम जोड़ यह है कि आपके प्रशिक्षण लूप में सामान्य `loss.backward()` को 🤗 Accelerate के [`~accelerate.Accelerator.backward`] विधि से बदलें: + +```py +>>> for epoch in range(num_epochs): +... for batch in train_dataloader: +... outputs = model(**batch) +... loss = outputs.loss +... accelerator.backward(loss) + +... optimizer.step() +... lr_scheduler.step() +... optimizer.zero_grad() +... progress_bar.update(1) +``` + +जैसा कि आप निम्नलिखित कोड में देख सकते हैं, आपको वितरित प्रशिक्षण सक्षम करने के लिए अपने प्रशिक्षण लूप में केवल चार अतिरिक्त कोड की पंक्तियाँ जोड़ने की आवश्यकता है! 
+ +```diff ++ from accelerate import Accelerator + from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler + ++ accelerator = Accelerator() + + model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) + optimizer = AdamW(model.parameters(), lr=3e-5) + +- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") +- model.to(device) + ++ train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( ++ train_dataloader, eval_dataloader, model, optimizer ++ ) + + num_epochs = 3 + num_training_steps = num_epochs * len(train_dataloader) + lr_scheduler = get_scheduler( + "linear", + optimizer=optimizer, + num_warmup_steps=0, + num_training_steps=num_training_steps + ) + + progress_bar = tqdm(range(num_training_steps)) + + model.train() + for epoch in range(num_epochs): + for batch in train_dataloader: +- batch = {k: v.to(device) for k, v in batch.items()} + outputs = model(**batch) + loss = outputs.loss +- loss.backward() ++ accelerator.backward(loss) + + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + progress_bar.update(1) +``` + +## प्रशिक्षण + +एक बार जब आपने प्रासंगिक कोड की पंक्तियाँ जोड़ दी हैं, तो अपने प्रशिक्षण को स्क्रिप्ट या कोलैबोरेटरी जैसे नोटबुक में लॉन्च करें। + +### स्क्रिप्ट के साथ प्रशिक्षण + +यदि आप स्क्रिप्ट से अपना प्रशिक्षण चला रहे हैं, तो एक कॉन्फ़िगरेशन फ़ाइल बनाने और सहेजने के लिए निम्नलिखित कमांड चलाएँ: + +```bash +accelerate config +``` + +फिर अपने प्रशिक्षण को इस तरह लॉन्च करें: + +```bash +accelerate launch train.py +``` + +### नोटबुक के साथ प्रशिक्षण + +🤗 Accelerate एक नोटबुक में भी चल सकता है यदि आप Colaboratory के TPU का उपयोग करने की योजना बना रहे हैं। प्रशिक्षण के लिए जिम्मेदार सभी कोड को एक फ़ंक्शन में लपेटें, और इसे [`~accelerate.notebook_launcher`] में पास करें: + +```py +>>> from accelerate import notebook_launcher + +>>> notebook_launcher(training_function) +``` + +🤗 Accelerate और इसकी समृद्ध सुविधाओं के बारे में अधिक जानकारी 
के लिए, [दस्तावेज़ीकरण](https://huggingface.co/docs/accelerate) देखें। diff --git a/docs/source/hi/tflite.md b/docs/source/hi/tflite.md new file mode 100644 index 000000000000..5a84bed94266 --- /dev/null +++ b/docs/source/hi/tflite.md @@ -0,0 +1,55 @@ + + +# TFLite में निर्यात करें + +[TensorFlow Lite](https://www.tensorflow.org/lite/guide) एक हल्का ढांचा है जो मशीन लर्निंग मॉडल को संसाधन-सीमित उपकरणों, जैसे मोबाइल फोन, एम्बेडेड सिस्टम और इंटरनेट ऑफ थिंग्स (IoT) उपकरणों पर तैनात करने के लिए है। TFLite को इन उपकरणों पर सीमित गणनात्मक शक्ति, मेमोरी और ऊर्जा खपत के साथ मॉडल को कुशलता से ऑप्टिमाइज़ और चलाने के लिए डिज़ाइन किया गया है। एक TensorFlow Lite मॉडल को एक विशेष कुशल पोर्टेबल प्रारूप में दर्शाया जाता है जिसे `.tflite` फ़ाइल एक्सटेंशन द्वारा पहचाना जाता है। + +🤗 Optimum में `exporters.tflite` मॉड्यूल के माध्यम से 🤗 Transformers मॉडल को TFLite में निर्यात करने की कार्यक्षमता है। समर्थित मॉडल आर्किटेक्चर की सूची के लिए, कृपया [🤗 Optimum दस्तावेज़](https://huggingface.co/docs/optimum/exporters/tflite/overview) देखें। + +TFLite में एक मॉडल निर्यात करने के लिए, आवश्यक निर्भरताएँ स्थापित करें: + +```bash +pip install optimum[exporters-tf] +``` + +सभी उपलब्ध तर्कों की जांच करने के लिए, [🤗 Optimum दस्तावेज़](https://huggingface.co/docs/optimum/main/en/exporters/tflite/usage_guides/export_a_model) देखें, +या कमांड लाइन में मदद देखें: + +```bash +optimum-cli export tflite --help +``` + +यदि आप 🤗 Hub से एक मॉडल का चेकपॉइंट निर्यात करना चाहते हैं, उदाहरण के लिए, `google-bert/bert-base-uncased`, निम्नलिखित कमांड चलाएँ: + +```bash +optimum-cli export tflite --model google-bert/bert-base-uncased --sequence_length 128 bert_tflite/ +``` + +आपको प्रगति को दर्शाते हुए लॉग दिखाई देंगे और यह दिखाएंगे कि परिणामस्वरूप `model.tflite` कहाँ सहेजा गया है, जैसे: + +```bash +Validating TFLite model... 
+ -[✓] TFLite model output names match reference model (logits) + - Validating TFLite Model output "logits": + -[✓] (1, 128, 30522) matches (1, 128, 30522) + -[x] values not close enough, max diff: 5.817413330078125e-05 (atol: 1e-05) +The TensorFlow Lite export succeeded with the warning: The maximum absolute difference between the output of the reference model and the TFLite exported model is not within the set tolerance 1e-05: +- logits: max diff = 5.817413330078125e-05. + The exported model was saved at: bert_tflite +``` + +उपरोक्त उदाहरण 🤗 Hub से एक चेकपॉइंट निर्यात करने को दर्शाता है। जब एक स्थानीय मॉडल निर्यात करते हैं, तो पहले सुनिश्चित करें कि आपने मॉडल के वज़न और टोकनाइज़र फ़ाइलों को एक ही निर्देशिका (`local_path`) में सहेजा है। CLI का उपयोग करते समय, चेकपॉइंट नाम के बजाय `model` तर्क में `local_path` पास करें। diff --git a/docs/source/ja/main_classes/pipelines.md b/docs/source/ja/main_classes/pipelines.md index bfb9922057d3..3980becebbde 100644 --- a/docs/source/ja/main_classes/pipelines.md +++ b/docs/source/ja/main_classes/pipelines.md @@ -481,6 +481,12 @@ my_pipeline = pipeline(model="xxxx", pipeline_class=MyPipeline) - __call__ - all +### ImageTextToTextPipeline + +[[autodoc]] ImageTextToTextPipeline + - __call__ + - all + ### VisualQuestionAnsweringPipeline [[autodoc]] VisualQuestionAnsweringPipeline diff --git a/docs/source/ko/_toctree.yml b/docs/source/ko/_toctree.yml index 51d54b697b2d..20519157eddc 100644 --- a/docs/source/ko/_toctree.yml +++ b/docs/source/ko/_toctree.yml @@ -206,8 +206,8 @@ title: 다중 CPU에서 훈련하기 - local: perf_train_tpu_tf title: TensorFlow로 TPU에서 훈련하기 - - local: in_translation - title: (번역중) PyTorch training on Apple silicon + - local: perf_train_special + title: Apple 실리콘에서 PyTorch 학습 - local: perf_hardware title: 훈련용 사용자 맞춤형 하드웨어 - local: hpo_train diff --git a/docs/source/ko/perf_train_special.md b/docs/source/ko/perf_train_special.md new file mode 100644 index 000000000000..188db542f7c0 --- /dev/null +++ 
b/docs/source/ko/perf_train_special.md @@ -0,0 +1,63 @@ + + +# Apple 실리콘에서 PyTorch 학습 [[PyTorch training on Apple silicon]] + +이전에는 Mac에서 모델을 학습할 때 CPU만 사용할 수 있었습니다. 그러나 이제 PyTorch v1.12의 출시로 Apple의 실리콘 GPU를 사용하여 훨씬 더 빠른 성능으로 모델을 학습할 수 있게 되었습니다. 이는 PyTorch에서 Apple의 Metal Performance Shaders (MPS)를 백엔드로 통합하면서 가능해졌습니다. [MPS 백엔드](https://pytorch.org/docs/stable/notes/mps.html)는 PyTorch 연산을 Metal 셰이더로 구현하고 이 모듈들을 mps 장치에서 실행할 수 있도록 지원합니다. + + + +일부 PyTorch 연산들은 아직 MPS에서 지원되지 않아 오류가 발생할 수 있습니다. 이를 방지하려면 환경 변수 `PYTORCH_ENABLE_MPS_FALLBACK=1`를 설정하여 CPU 커널을 대신 사용하도록 해야 합니다(이때 `UserWarning`이 여전히 표시될 수 있습니다). + +
+ +다른 오류가 발생할 경우 [PyTorch](https://github.com/pytorch/pytorch/issues) 리포지토리에 이슈를 등록해주세요. 현재 [`Trainer`]는 MPS 백엔드만 통합하고 있습니다. + +
+ +`mps` 장치를 이용하면 다음과 같은 이점들을 얻을 수 있습니다: + +* 로컬에서 더 큰 네트워크나 배치 크기로 학습 가능 +* GPU의 통합 메모리 아키텍처로 인해 메모리에 직접 접근할 수 있어 데이터 로딩 지연 감소 +* 클라우드 기반 GPU나 추가 GPU가 필요 없으므로 비용 절감 가능 + +PyTorch가 설치되어 있는지 확인하고 시작하세요. MPS 가속은 macOS 12.3 이상에서 지원됩니다. + +```bash +pip install torch torchvision torchaudio +``` + +[`TrainingArguments`]는 `mps` 장치가 사용 가능한 경우 이를 기본적으로 사용하므로 장치를 따로 설정할 필요가 없습니다. 예를 들어, MPS 백엔드를 자동으로 활성화하여 [run_glue.py](https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py) 스크립트를 아무 수정 없이 실행할 수 있습니다. + +```diff +export TASK_NAME=mrpc + +python examples/pytorch/text-classification/run_glue.py \ + --model_name_or_path google-bert/bert-base-cased \ + --task_name $TASK_NAME \ +- --use_mps_device \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size 32 \ + --learning_rate 2e-5 \ + --num_train_epochs 3 \ + --output_dir /tmp/$TASK_NAME/ \ + --overwrite_output_dir +``` + +`gloo`와 `nccl`과 같은 [분산 학습 백엔드](https://pytorch.org/docs/stable/distributed.html#backends)는 `mps` 장치에서 지원되지 않으므로, MPS 백엔드에서는 단일 GPU로만 학습이 가능합니다. + +Mac에서 가속된 PyTorch 학습에 대한 더 자세한 내용은 [Introducing Accelerated PyTorch Training on Mac](https://pytorch.org/blog/introducing-accelerated-pytorch-training-on-mac/) 블로그 게시물에서 확인할 수 있습니다. diff --git a/docs/source/zh/main_classes/pipelines.md b/docs/source/zh/main_classes/pipelines.md index 370b50d24696..bc16709d8b48 100644 --- a/docs/source/zh/main_classes/pipelines.md +++ b/docs/source/zh/main_classes/pipelines.md @@ -455,6 +455,12 @@ See [`TokenClassificationPipeline`] for all details. 
- __call__ - all +### ImageTextToTextPipeline + +[[autodoc]] ImageTextToTextPipeline + - __call__ + - all + ### MaskGenerationPipeline [[autodoc]] MaskGenerationPipeline diff --git a/examples/modular-transformers/configuration_my_new_model.py b/examples/modular-transformers/configuration_my_new_model.py index 3c7848e69569..aa0aac55ba91 100644 --- a/examples/modular-transformers/configuration_my_new_model.py +++ b/examples/modular-transformers/configuration_my_new_model.py @@ -1,9 +1,9 @@ -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 -# This file was automatically generated from . -# Do NOT edit this file manually as any edits will be overwritten by the generation of -# the file from the modular. If any change should be done, please apply the change to the -# modular_xxx.py file directly. One of our CI enforces this -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from examples/modular-transformers/modular_my_new_model.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_my_new_model.py file directly. One of our CI enforces this. 
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 from ...configuration_utils import PretrainedConfig from ...modeling_rope_utils import rope_config_validation @@ -158,6 +158,13 @@ def __init__( new_param=0, **kwargs, ): + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -187,11 +194,3 @@ def __init__( self.rope_scaling["rope_type"] = self.rope_scaling["type"] rope_config_validation(self) self.new_param = new_param - - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) diff --git a/examples/modular-transformers/configuration_my_new_model2.py b/examples/modular-transformers/configuration_my_new_model2.py index 5fef1cecc702..f05ace94b622 100644 --- a/examples/modular-transformers/configuration_my_new_model2.py +++ b/examples/modular-transformers/configuration_my_new_model2.py @@ -1,9 +1,9 @@ -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 -# This file was automatically generated from . -# Do NOT edit this file manually as any edits will be overwritten by the generation of -# the file from the modular. If any change should be done, please apply the change to the -# modular_xxx.py file directly. One of our CI enforces this -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from examples/modular-transformers/modular_my_new_model2.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_my_new_model2.py file directly. One of our CI enforces this. 
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 from ...configuration_utils import PretrainedConfig from ...modeling_rope_utils import rope_config_validation @@ -11,106 +11,6 @@ class MyNewModel2Config(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`MyNewModel2Model`]. It is used to instantiate an MyNewModel2 - model according to the specified arguments, defining the model architecture. Instantiating a configuration with the - defaults will yield a similar configuration to that of the MyNewModel2-7B. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 32000): - Vocabulary size of the MyNewModel2 model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`MyNewModel2Model`] - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 11008): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer decoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer decoder. - num_key_value_heads (`int`, *optional*): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. 
For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to - `num_attention_heads`. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 2048): - The maximum sequence length that this model might ever be used with. MyNewModel2 1 supports up to 2048 tokens, - MyNewModel2 2 up to 4096, CodeMyNewModel2 up to 16384. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - pad_token_id (`int`, *optional*): - Padding token id. - bos_token_id (`int`, *optional*, defaults to 1): - Beginning of stream token id. - eos_token_id (`int`, *optional*, defaults to 2): - End of stream token id. - pretraining_tp (`int`, *optional*, defaults to 1): - Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this - document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to - understand more about it. This value is necessary to ensure exact reproducibility of the pretraining - results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232). - tie_word_embeddings (`bool`, *optional*, defaults to `False`): - Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. 
NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'my_new_model23'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'my_new_model23'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). 
Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'my_new_model23'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'my_new_model23'. Scaling factor applied to high frequency components of the RoPE - attention_bias (`bool`, *optional*, defaults to `False`): - Whether to use a bias in the query, key, value and output projection layers during self-attention. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - mlp_bias (`bool`, *optional*, defaults to `False`): - Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers. - head_dim (`int`, *optional*): - The attention head dimension. If None, it will default to hidden_size // num_heads This is the configuration class to store the configuration of a [`GemmaModel`]. It is used to instantiate an Gemma model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the Gemma-7B. @@ -121,7 +21,6 @@ class MyNewModel2Config(PretrainedConfig): vocab_size (`int`, *optional*, defaults to 256000): Vocabulary size of the Gemma model. 
Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [`GemmaModel`] - ```python >>> from transformers import GemmaModel, GemmaConfig >>> # Initializing a Gemma gemma-7b style configuration diff --git a/examples/modular-transformers/configuration_new_model.py b/examples/modular-transformers/configuration_new_model.py index 8bc8ef52cee6..4d164fe3e75f 100644 --- a/examples/modular-transformers/configuration_new_model.py +++ b/examples/modular-transformers/configuration_new_model.py @@ -1,9 +1,9 @@ -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 -# This file was automatically generated from . -# Do NOT edit this file manually as any edits will be overwritten by the generation of -# the file from the modular. If any change should be done, please apply the change to the -# modular_xxx.py file directly. One of our CI enforces this -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from examples/modular-transformers/modular_new_model.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_new_model.py file directly. One of our CI enforces this. 
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # Example where we only want to overwrite the defaults of an init from ...configuration_utils import PretrainedConfig @@ -104,6 +104,13 @@ def __init__( attention_dropout=0.0, **kwargs, ): + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -121,14 +128,6 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - @property def num_heads(self): return self.num_attention_heads diff --git a/examples/modular-transformers/modeling_dummy.py b/examples/modular-transformers/modeling_dummy.py index b5b1fc6aec85..ed7e3c64d7a8 100644 --- a/examples/modular-transformers/modeling_dummy.py +++ b/examples/modular-transformers/modeling_dummy.py @@ -1,26 +1,24 @@ -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 -# This file was automatically generated from . -# Do NOT edit this file manually as any edits will be overwritten by the generation of -# the file from the modular. If any change should be done, please apply the change to the -# modular_xxx.py file directly. One of our CI enforces this -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from examples/modular-transformers/modular_dummy.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_dummy.py file directly. One of our CI enforces this. 
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 import math from typing import List, Optional, Tuple, Union import torch import torch.nn.functional as F -import torch.utils.checkpoint from torch import nn from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache, StaticCache from ...modeling_attn_mask_utils import AttentionMaskConverter -from ...modeling_flash_attention_utils import _flash_attention_forward -from ...modeling_outputs import ( - BaseModelOutputWithPast, -) +from ...modeling_flash_attention_utils import FlashAttentionKwargs, _flash_attention_forward +from ...modeling_outputs import BaseModelOutputWithPast from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS from ...modeling_utils import PreTrainedModel +from ...processing_utils import Unpack from ...utils import ( add_start_docstrings, add_start_docstrings_to_model_forward, @@ -33,59 +31,6 @@ logger = logging.get_logger(__name__) -def _prepare_4d_causal_attention_mask_with_cache_position( - attention_mask: torch.Tensor, - sequence_length: int, - target_length: int, - dtype: torch.dtype, - device: torch.device, - min_dtype: float, - cache_position: torch.Tensor, - batch_size: int, -): - """ - Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape - `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. - - Args: - attention_mask (`torch.Tensor`): - A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`. - sequence_length (`int`): - The sequence length being processed. - target_length (`int`): - The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet. - dtype (`torch.dtype`): - The dtype to use for the 4D attention mask. 
- device (`torch.device`): - The device to plcae the 4D attention mask on. - min_dtype (`float`): - The minimum value representable with the dtype `dtype`. - cache_position (`torch.Tensor`): - Indices depicting the position of the input sequence tokens in the sequence. - batch_size (`torch.Tensor`): - Batch size. - """ - if attention_mask is not None and attention_mask.dim() == 4: - # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. - causal_mask = attention_mask - else: - causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device) - if sequence_length != 1: - causal_mask = torch.triu(causal_mask, diagonal=1) - causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) - causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) - if attention_mask is not None: - causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit - mask_length = attention_mask.shape[-1] - padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] - padding_mask = padding_mask == 0 - causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( - padding_mask, min_dtype - ) - - return causal_mask - - class DummyRMSNorm(nn.Module): def __init__(self, hidden_size, eps=1e-6): """ @@ -193,40 +138,6 @@ def forward(self, x, position_ids): return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 4] - x2 = x[..., x.shape[-1] // 4 :] - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. 
- sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`, *optional*): - Deprecated and unused. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. - """ - cos = cos.unsqueeze(unsqueeze_dim) - sin = sin.unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - class DummyMLP(nn.Module): def __init__(self, config): super().__init__() @@ -261,6 +172,40 @@ def forward(self, x): return down_proj +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 4] + x2 = x[..., x.shape[-1] // 4 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. 
+ unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: """ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). 
The hidden states go from (batch, @@ -423,6 +368,7 @@ def forward( use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46 + **kwargs: Unpack[FlashAttentionKwargs], ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: if isinstance(past_key_value, StaticCache): raise ValueError( @@ -507,6 +453,7 @@ def forward( sliding_window=getattr(self, "sliding_window", None), use_top_left_mask=self._flash_attn_uses_top_left_mask, is_causal=self.is_causal, + **kwargs, ) attn_output = attn_output.reshape(bsz, q_len, -1).contiguous() @@ -871,6 +818,7 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, + **flash_attn_kwargs: Unpack[FlashAttentionKwargs], ) -> Union[Tuple, BaseModelOutputWithPast]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -952,6 +900,7 @@ def forward( use_cache=use_cache, cache_position=cache_position, position_embeddings=position_embeddings, + **flash_attn_kwargs, ) hidden_states = layer_outputs[0] @@ -1011,10 +960,9 @@ def _update_causal_mask( return None dtype, device = input_tensor.dtype, input_tensor.device - min_dtype = torch.finfo(dtype).min sequence_length = input_tensor.shape[1] if using_static_cache: - target_length = past_key_values.get_max_length() + target_length = past_key_values.get_max_cache_shape() else: target_length = ( attention_mask.shape[-1] @@ -1023,13 +971,12 @@ def _update_causal_mask( ) # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). 
- causal_mask = _prepare_4d_causal_attention_mask_with_cache_position( + causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( attention_mask, sequence_length=sequence_length, target_length=target_length, dtype=dtype, device=device, - min_dtype=min_dtype, cache_position=cache_position, batch_size=input_tensor.shape[0], ) @@ -1043,6 +990,63 @@ def _update_causal_mask( # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. # Details: https://github.com/pytorch/pytorch/issues/110213 + min_dtype = torch.finfo(dtype).min causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) return causal_mask + + @staticmethod + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + device: torch.device, + cache_position: torch.Tensor, + batch_size: int, + **kwargs, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, + to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + device (`torch.device`): + The device to plcae the 4D attention mask on. 
+ cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. + causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask diff --git a/examples/modular-transformers/modeling_dummy_bert.py b/examples/modular-transformers/modeling_dummy_bert.py index 611d7be961f7..e18e6a19e8a3 100644 --- a/examples/modular-transformers/modeling_dummy_bert.py +++ b/examples/modular-transformers/modeling_dummy_bert.py @@ -1,27 +1,20 @@ -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 -# This file was automatically generated from . -# Do NOT edit this file manually as any edits will be overwritten by the generation of -# the file from the modular. If any change should be done, please apply the change to the -# modular_xxx.py file directly. 
One of our CI enforces this -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from examples/modular-transformers/modular_dummy_bert.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_dummy_bert.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 import math import os from typing import List, Optional, Tuple, Union import torch -import torch.utils.checkpoint from packaging import version from torch import nn from ...activations import ACT2FN -from ...modeling_attn_mask_utils import ( - _prepare_4d_attention_mask_for_sdpa, - _prepare_4d_causal_attention_mask_for_sdpa, -) -from ...modeling_outputs import ( - BaseModelOutputWithPastAndCrossAttentions, - BaseModelOutputWithPoolingAndCrossAttentions, -) +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa, _prepare_4d_causal_attention_mask_for_sdpa +from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, BaseModelOutputWithPoolingAndCrossAttentions from ...modeling_utils import PreTrainedModel from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( @@ -40,79 +33,6 @@ _CONFIG_FOR_DOC = "DummyBertConfig" -def load_tf_weights_in_dummy_bert(model, config, tf_checkpoint_path): - """Load tf checkpoints in a pytorch model.""" - try: - import re - - import numpy as np - import tensorflow as tf - except ImportError: - logger.error( - "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " - "https://www.tensorflow.org/install/ for installation instructions." 
- ) - raise - tf_path = os.path.abspath(tf_checkpoint_path) - logger.info(f"Converting TensorFlow checkpoint from {tf_path}") - # Load weights from TF model - init_vars = tf.train.list_variables(tf_path) - names = [] - arrays = [] - for name, shape in init_vars: - logger.info(f"Loading TF weight {name} with shape {shape}") - array = tf.train.load_variable(tf_path, name) - names.append(name) - arrays.append(array) - - for name, array in zip(names, arrays): - name = name.split("/") - # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v - # which are not required for using pretrained model - if any( - n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] - for n in name - ): - logger.info(f"Skipping {'/'.join(name)}") - continue - pointer = model - for m_name in name: - if re.fullmatch(r"[A-Za-z]+_\d+", m_name): - scope_names = re.split(r"_(\d+)", m_name) - else: - scope_names = [m_name] - if scope_names[0] == "kernel" or scope_names[0] == "gamma": - pointer = getattr(pointer, "weight") - elif scope_names[0] == "output_bias" or scope_names[0] == "beta": - pointer = getattr(pointer, "bias") - elif scope_names[0] == "output_weights": - pointer = getattr(pointer, "weight") - elif scope_names[0] == "squad": - pointer = getattr(pointer, "classifier") - else: - try: - pointer = getattr(pointer, scope_names[0]) - except AttributeError: - logger.info(f"Skipping {'/'.join(name)}") - continue - if len(scope_names) >= 2: - num = int(scope_names[1]) - pointer = pointer[num] - if m_name[-11:] == "_embeddings": - pointer = getattr(pointer, "weight") - elif m_name == "kernel": - array = np.transpose(array) - try: - if pointer.shape != array.shape: - raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched") - except ValueError as e: - e.args += (pointer.shape, array.shape) - raise - logger.info(f"Initialize PyTorch weight {name}") - pointer.data = torch.from_numpy(array) 
- return model - - class DummyBertEmbeddings(nn.Module): """Construct the embeddings from word, position and token_type embeddings.""" @@ -706,6 +626,79 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return pooled_output +def load_tf_weights_in_dummy_bert(model, config, tf_checkpoint_path): + """Load tf checkpoints in a pytorch model.""" + try: + import re + + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info(f"Converting TensorFlow checkpoint from {tf_path}") + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info(f"Loading TF weight {name} with shape {shape}") + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + name = name.split("/") + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any( + n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] + for n in name + ): + logger.info(f"Skipping {'/'.join(name)}") + continue + pointer = model + for m_name in name: + if re.fullmatch(r"[A-Za-z]+_\d+", m_name): + scope_names = re.split(r"_(\d+)", m_name) + else: + scope_names = [m_name] + if scope_names[0] == "kernel" or scope_names[0] == "gamma": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "output_bias" or scope_names[0] == "beta": + pointer = getattr(pointer, "bias") + elif scope_names[0] == "output_weights": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "squad": + pointer = getattr(pointer, "classifier") + else: + try: + pointer = 
getattr(pointer, scope_names[0]) + except AttributeError: + logger.info(f"Skipping {'/'.join(name)}") + continue + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + if m_name[-11:] == "_embeddings": + pointer = getattr(pointer, "weight") + elif m_name == "kernel": + array = np.transpose(array) + try: + if pointer.shape != array.shape: + raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched") + except ValueError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info(f"Initialize PyTorch weight {name}") + pointer.data = torch.from_numpy(array) + return model + + class DummyBertPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained @@ -871,26 +864,6 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: - r""" - encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if - the model is configured as a decoder. - encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, target_length)`, *optional*): - Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in - the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): - Contains precomputed key and value hidden states of the attention blocks. 
Can be used to speed up decoding. - - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that - don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all - `decoder_input_ids` of shape `(batch_size, sequence_length)`. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - """ r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -1027,7 +1000,6 @@ def forward( if not return_dict: return (sequence_output, pooled_output) + encoder_outputs[1:] - return super().forward(input_ids) return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, diff --git a/examples/modular-transformers/modeling_my_new_model2.py b/examples/modular-transformers/modeling_my_new_model2.py index 49cdd2741620..16f9e525a05e 100644 --- a/examples/modular-transformers/modeling_my_new_model2.py +++ b/examples/modular-transformers/modeling_my_new_model2.py @@ -1,25 +1,20 @@ -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 -# This file was automatically generated from . -# Do NOT edit this file manually as any edits will be overwritten by the generation of -# the file from the modular. If any change should be done, please apply the change to the -# modular_xxx.py file directly. One of our CI enforces this -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from examples/modular-transformers/modular_my_new_model2.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. 
If any change should be done, please apply the change to the +# modular_my_new_model2.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 import math from typing import List, Optional, Tuple, Union import torch -import torch.utils.checkpoint from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache, StaticCache from ...modeling_attn_mask_utils import AttentionMaskConverter from ...modeling_flash_attention_utils import _flash_attention_forward -from ...modeling_outputs import ( - BaseModelOutputWithPast, - SequenceClassifierOutputWithPast, -) +from ...modeling_outputs import BaseModelOutputWithPast, SequenceClassifierOutputWithPast from ...modeling_utils import PreTrainedModel from ...utils import ( add_start_docstrings, @@ -30,6 +25,9 @@ from .configuration_my_new_model2 import MyNewModel2Config +logger = logging.get_logger(__name__) + + class MyNewModel2RMSNorm(nn.Module): def __init__(self, dim: int, eps: float = 1e-6): super().__init__() @@ -50,9 +48,6 @@ def extra_repr(self): return f"{tuple(self.weight.shape)}, eps={self.eps}" -logger = logging.get_logger(__name__) - - class MyNewModel2RotaryEmbedding(nn.Module): def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): super().__init__() @@ -448,59 +443,6 @@ def forward( return attn_output, attn_weights, past_key_value -def _prepare_4d_causal_attention_mask_with_cache_position( - attention_mask: torch.Tensor, - sequence_length: int, - target_length: int, - dtype: torch.dtype, - device: torch.device, - min_dtype: float, - cache_position: torch.Tensor, - batch_size: int, -): - """ - Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape - `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. 
- - Args: - attention_mask (`torch.Tensor`): - A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`. - sequence_length (`int`): - The sequence length being processed. - target_length (`int`): - The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet. - dtype (`torch.dtype`): - The dtype to use for the 4D attention mask. - device (`torch.device`): - The device to plcae the 4D attention mask on. - min_dtype (`float`): - The minimum value representable with the dtype `dtype`. - cache_position (`torch.Tensor`): - Indices depicting the position of the input sequence tokens in the sequence. - batch_size (`torch.Tensor`): - Batch size. - """ - if attention_mask is not None and attention_mask.dim() == 4: - # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. 
- causal_mask = attention_mask - else: - causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device) - if sequence_length != 1: - causal_mask = torch.triu(causal_mask, diagonal=1) - causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) - causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) - if attention_mask is not None: - causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit - mask_length = attention_mask.shape[-1] - padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] - padding_mask = padding_mask == 0 - causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( - padding_mask, min_dtype - ) - - return causal_mask - - MY_NEW_MODEL2_ATTENTION_CLASSES = { "eager": MyNewModel2Attention, "flash_attention_2": MyNewModel2FlashAttention2, @@ -893,10 +835,9 @@ def _update_causal_mask( return None dtype, device = input_tensor.dtype, input_tensor.device - min_dtype = torch.finfo(dtype).min sequence_length = input_tensor.shape[1] if using_static_cache: - target_length = past_key_values.get_max_length() + target_length = past_key_values.get_max_cache_shape() else: target_length = ( attention_mask.shape[-1] @@ -905,13 +846,12 @@ def _update_causal_mask( ) # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). - causal_mask = _prepare_4d_causal_attention_mask_with_cache_position( + causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( attention_mask, sequence_length=sequence_length, target_length=target_length, dtype=dtype, device=device, - min_dtype=min_dtype, cache_position=cache_position, batch_size=input_tensor.shape[0], ) @@ -925,10 +865,67 @@ def _update_causal_mask( # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when # using left padding. 
This is required by F.scaled_dot_product_attention memory-efficient attention path. # Details: https://github.com/pytorch/pytorch/issues/110213 + min_dtype = torch.finfo(dtype).min causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) return causal_mask + @staticmethod + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + device: torch.device, + cache_position: torch.Tensor, + batch_size: int, + **kwargs, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, + to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + device (`torch.device`): + The device to plcae the 4D attention mask on. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. 
+ causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask + @add_start_docstrings( """ @@ -1019,27 +1016,8 @@ def forward( loss = None if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) + loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config) + if not return_dict: output = (pooled_logits,) + 
transformer_outputs[1:] return ((loss,) + output) if loss is not None else output diff --git a/examples/modular-transformers/modeling_new_task_model.py b/examples/modular-transformers/modeling_new_task_model.py index 640331ace1d5..4556308f1ea0 100644 --- a/examples/modular-transformers/modeling_new_task_model.py +++ b/examples/modular-transformers/modeling_new_task_model.py @@ -8,7 +8,6 @@ from typing import ClassVar, List, Optional, Tuple, Union import torch -import torch.utils.checkpoint from torch import nn from ...cache_utils import Cache, StaticCache @@ -18,92 +17,15 @@ ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, - is_flash_attn_2_available, - logging, replace_return_docstrings, ) -from .configuration_new_task_model import NewTaskModelConfig - - -if is_flash_attn_2_available(): - from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa - from ..auto import AutoModel, AutoModelForCausalLM +from .configuration_new_task_model import NewTaskModelConfig -logger = logging.get_logger(__name__) - _CONFIG_FOR_DOC = "NewTaskModelConfig" -# Adapted from transformers.models.llama.modeling_llama.LlamaModel._prepare_4d_causal_attention_mask_with_cache_position -# But NewTaskModel has no causal mask on prefix -def _prepare_4d_causal_attention_mask_with_cache_position( - attention_mask: torch.Tensor, - sequence_length: int, - target_length: int, - dtype: torch.dtype, - device: torch.device, - min_dtype: float, - cache_position: torch.Tensor, - batch_size: int, - is_training: bool = False, - token_type_ids: torch.Tensor = None, -): - """ - Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape - `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. 
- - Args: - attention_mask (`torch.Tensor`): - A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`. - sequence_length (`int`): - The sequence length being processed. - target_length (`int`): - The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet. - dtype (`torch.dtype`): - The dtype to use for the 4D attention mask. - device (`torch.device`): - The device to plcae the 4D attention mask on. - min_dtype (`float`): - The minimum value representable with the dtype `dtype`. - cache_position (`torch.Tensor`): - Indices depicting the position of the input sequence tokens in the sequence. - batch_size (`torch.Tensor`): - Batch size. - is_training (`bool`): - Whether the model is in training mode or in inference. The condition is checked by presence/absence of `token_type_ids/labels` - """ - if attention_mask is not None and attention_mask.dim() == 4: - # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. - causal_mask = attention_mask - else: - causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device) - # Causal diagonal mask only if training, otherwise attend to the whole prefix. 
Training-specific attn for prefix is handled below - if sequence_length != 1: - if is_training: - causal_mask = torch.triu(causal_mask, diagonal=1) - else: - causal_mask[:, :sequence_length] = 0.0 - - causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1) - causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) - if attention_mask is not None: - causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit - mask_length = attention_mask.shape[-1] - padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(causal_mask.device) - padding_mask = padding_mask == 0 - causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( - padding_mask, min_dtype - ) - # we are training thus we need to create a full mask on the image + prefix but causal on suffix - if is_training: - causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( - token_type_ids[:, None, None, :].to(causal_mask.device) == 0, 0 - ) - return causal_mask - - @dataclass class NewTaskModelCausalLMOutputWithPast(ModelOutput): """ @@ -182,12 +104,12 @@ class NewTaskModelPreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _no_split_modules = ["NewTaskModelMultiModalProjector"] _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = False _supports_cache_class = True _supports_quantized_cache = True _supports_static_cache = True - _supports_sdpa = True _supports_cache_class = True + _supports_flash_attn_2 = True + _supports_sdpa = True def _init_weights(self, module): # important: this ported version of NewTaskModelisn't meant for training from scratch - only @@ -210,14 +132,6 @@ def _init_weights(self, module): if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() - @property - def _supports_sdpa(self): - """ - Retrieve language_model's attribute to check whether the model 
supports - SDPA or not. - """ - return self.language_model._supports_sdpa - NEW_TASK_MODEL_INPUTS_DOCSTRING = r""" Args: @@ -301,11 +215,8 @@ def __init__(self, config): self.vision_tower = AutoModel.from_config(config=config.vision_config) self.multi_modal_projector = NewTaskModelMultiModalProjector(config) self.vocab_size = config.text_config.vocab_size - self._attn_implementation = config._attn_implementation - language_model = AutoModelForCausalLM.from_config( - config=config.text_config, attn_implementation=self._attn_implementation - ) + language_model = AutoModelForCausalLM.from_config(config=config.text_config) if language_model._tied_weights_keys is not None: self._tied_weights_keys = [f"language_model.{k}" for k in language_model._tied_weights_keys] @@ -344,6 +255,11 @@ def tie_weights(self): def _update_causal_mask( self, attention_mask, token_type_ids, inputs_embeds, past_key_values, cache_position, is_training: bool = False ): + if self.config.text_config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + using_static_cache = isinstance(past_key_values, StaticCache) dtype = inputs_embeds.dtype min_dtype = torch.finfo(dtype).min @@ -388,6 +304,22 @@ def _update_causal_mask( ) return causal_mask + def get_image_features(self, pixel_values: torch.FloatTensor): + """ + Obtains image last hidden states from the vision tower and apply multimodal projection. + + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`) + The tensors corresponding to the input images. + Returns: + image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`). 
+ """ + image_outputs = self.vision_tower(pixel_values) + selected_image_feature = image_outputs.last_hidden_state + image_features = self.multi_modal_projector(selected_image_feature) + image_features = image_features / (self.config.hidden_size**0.5) + return image_features + @add_start_docstrings_to_model_forward(NEW_TASK_MODEL_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=NewTaskModelCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) def forward( @@ -426,9 +358,9 @@ def forward( ```python >>> from PIL import Image >>> import requests - >>> from transformers import AutoProcessor, NewTaskModelForNewTask + >>> from transformers import AutoProcessor, NewTaskModelForConditionalGeneration - >>> model = NewTaskModelForNewTask.from_pretrained("google/NewTaskModel-test-224px-hf") + >>> model = NewTaskModelForConditionalGeneration.from_pretrained("google/NewTaskModel-test-224px-hf") >>> processor = AutoProcessor.from_pretrained("google/NewTaskModel-test-224px-hf") >>> prompt = "answer en Where is the cow standing?" 
@@ -484,6 +416,7 @@ def prepare_inputs_for_generation( num_logits_to_keep=None, **kwargs, ): + # Overwritten -- custom `position_ids` and `pixel_values` handling model_inputs = self.language_model.prepare_inputs_for_generation( input_ids, past_key_values=past_key_values, @@ -493,33 +426,10 @@ def prepare_inputs_for_generation( cache_position=cache_position, use_cache=use_cache, num_logits_to_keep=num_logits_to_keep, + token_type_ids=token_type_ids, **kwargs, ) - if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2: - if model_inputs["inputs_embeds"] is not None: - batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape - device = model_inputs["inputs_embeds"].device - else: - batch_size, sequence_length = model_inputs["input_ids"].shape - device = model_inputs["input_ids"].device - - dtype = self.get_output_embeddings().weight.dtype - min_dtype = torch.finfo(dtype).min - - model_inputs["attention_mask"] = _prepare_4d_causal_attention_mask_with_cache_position( - attention_mask, - sequence_length=sequence_length, - target_length=past_key_values.get_max_length(), - dtype=dtype, - device=device, - min_dtype=min_dtype, - cache_position=cache_position, - batch_size=batch_size, - ) - - model_inputs["token_type_ids"] = token_type_ids - # position_ids in NewTaskModel are 1-indexed if model_inputs.get("position_ids") is not None: model_inputs["position_ids"] += 1 diff --git a/examples/modular-transformers/modeling_roberta.py b/examples/modular-transformers/modeling_roberta.py new file mode 100644 index 000000000000..e50cf60c3a4e --- /dev/null +++ b/examples/modular-transformers/modeling_roberta.py @@ -0,0 +1,1014 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from examples/modular-transformers/modular_roberta.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. 
If any change should be done, please apply the change to the +# modular_roberta.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +import math +import os +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn as nn +from packaging import version + +from ...activations import ACT2FN +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa, _prepare_4d_causal_attention_mask_for_sdpa +from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, BaseModelOutputWithPoolingAndCrossAttentions +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + get_torch_version, + logging, +) +from .configuration_roberta import RobertaConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "google-roberta/roberta-base-uncased" +_CONFIG_FOR_DOC = "RobertaConfig" + + +class RobertaEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, config.hidden_size, config.pad_token_id + ) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = 
getattr(config, "position_embedding_type", "absolute") + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.register_buffer( + "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False + ) + self.pad_token_id = config.pad_token_id + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + past_key_values_length: int = 0, + ) -> torch.Tensor: + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + 
+class RobertaSelfAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + mixed_query_layer = self.query(hidden_states) + + # If this is 
instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + use_cache = past_key_value is not None + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + query_length, key_length = query_layer.shape[2], key_layer.shape[2] + if use_cache: + position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( + -1, 1 + ) + else: + position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in RobertaModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + +class RobertaSdpaSelfAttention(RobertaSelfAttention): + def __init__(self, config, position_embedding_type=None): + super().__init__(config, position_embedding_type=position_embedding_type) + self.dropout_prob = config.attention_probs_dropout_prob + self.require_contiguous_qkv = version.parse(get_torch_version()) < version.parse("2.2.0") + + # Adapted from RobertaSelfAttention + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + if self.position_embedding_type != "absolute" or output_attentions or head_mask is not None: + # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once implemented. + logger.warning_once( + "RobertaSdpaSelfAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support " + "non-absolute `position_embedding_type` or `output_attentions=True` or `head_mask`. Falling back to " + "the manual attention implementation, but specifying the manual implementation will be required from " + "Transformers version v5.0.0 onwards. 
This warning can be removed using the argument " + '`attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + bsz, tgt_len, _ = hidden_states.size() + + query_layer = self.transpose_for_scores(self.query(hidden_states)) + + # If this is instantiated as a cross-attention module, the keys and values come from an encoder; the attention + # mask needs to be such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + current_states = encoder_hidden_states if is_cross_attention else hidden_states + attention_mask = encoder_attention_mask if is_cross_attention else attention_mask + + # Check `seq_length` of `past_key_value` == `len(current_states)` to support prefix tuning + if is_cross_attention and past_key_value and past_key_value[0].shape[2] == current_states.shape[1]: + key_layer, value_layer = past_key_value + else: + key_layer = self.transpose_for_scores(self.key(current_states)) + value_layer = self.transpose_for_scores(self.value(current_states)) + if past_key_value is not None and not is_cross_attention: + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # SDPA with memory-efficient backend is broken in torch==2.1.2 when using non-contiguous inputs and a custom + # attn_mask, so we need to call `.contiguous()` here. This was fixed in torch==2.2.0. + # Reference: https://github.com/pytorch/pytorch/issues/112577 + if self.require_contiguous_qkv and query_layer.device.type == "cuda" and attention_mask is not None: + query_layer = query_layer.contiguous() + key_layer = key_layer.contiguous() + value_layer = value_layer.contiguous() + + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create + # a causal mask in case tgt_len == 1. 
+ is_causal = ( + True if self.is_decoder and not is_cross_attention and attention_mask is None and tgt_len > 1 else False + ) + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_layer, + key_layer, + value_layer, + attn_mask=attention_mask, + dropout_p=self.dropout_prob if self.training else 0.0, + is_causal=is_causal, + ) + + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, self.all_head_size) + + outputs = (attn_output,) + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + +class RobertaSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +ROBERTA_SELF_ATTENTION_CLASSES = { + "eager": RobertaSelfAttention, + "sdpa": RobertaSdpaSelfAttention, +} + + +class RobertaAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self = ROBERTA_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) + self.output = RobertaSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + 
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class RobertaIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class RobertaOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> 
torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class RobertaLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = RobertaAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = RobertaAttention(config, position_embedding_type="absolute") + self.intermediate = RobertaIntermediate(config) + self.output = RobertaOutput(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and 
encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class RobertaEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([RobertaLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + 
encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not 
return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class RobertaPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +def load_tf_weights_in_roberta(model, config, tf_checkpoint_path): + """Load tf checkpoints in a pytorch model.""" + try: + import re + + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." 
+ ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info(f"Converting TensorFlow checkpoint from {tf_path}") + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info(f"Loading TF weight {name} with shape {shape}") + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + name = name.split("/") + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any( + n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] + for n in name + ): + logger.info(f"Skipping {'/'.join(name)}") + continue + pointer = model + for m_name in name: + if re.fullmatch(r"[A-Za-z]+_\d+", m_name): + scope_names = re.split(r"_(\d+)", m_name) + else: + scope_names = [m_name] + if scope_names[0] == "kernel" or scope_names[0] == "gamma": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "output_bias" or scope_names[0] == "beta": + pointer = getattr(pointer, "bias") + elif scope_names[0] == "output_weights": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "squad": + pointer = getattr(pointer, "classifier") + else: + try: + pointer = getattr(pointer, scope_names[0]) + except AttributeError: + logger.info(f"Skipping {'/'.join(name)}") + continue + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + if m_name[-11:] == "_embeddings": + pointer = getattr(pointer, "weight") + elif m_name == "kernel": + array = np.transpose(array) + try: + if pointer.shape != array.shape: + raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched") + except ValueError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info(f"Initialize PyTorch weight {name}") + pointer.data = torch.from_numpy(array) 
+ return model + + +class RobertaPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = RobertaConfig + load_tf_weights = load_tf_weights_in_roberta + base_model_prefix = "roberta" + supports_gradient_checkpointing = True + _supports_sdpa = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +ROBERTA_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`RobertaConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+""" + +ROBERTA_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `({0})`or `(batch_size, sequence_length, target_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. 
+ output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Roberta Model transformer outputting raw hidden-states without any specific head on top.", + ROBERTA_START_DOCSTRING, +) +class RobertaModel(RobertaPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in [Attention is + all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set + to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and + `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. 
+ """ + + _no_split_modules = ["RobertaEmbeddings", "RobertaLayer"] + + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = RobertaEmbeddings(config) + self.encoder = RobertaEncoder(config) + + self.pooler = RobertaPooler(config) if add_pooling_layer else None + + self.attn_implementation = config._attn_implementation + self.position_embedding_type = config.position_embedding_type + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPoolingAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + 
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, target_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + + if attention_mask is None: + attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device) + + use_sdpa_attention_masks = ( + self.attn_implementation == "sdpa" + and self.position_embedding_type == "absolute" + 
and head_mask is None + and not output_attentions + ) + + # Expand the attention mask + if use_sdpa_attention_masks and attention_mask.dim() == 2: + # Expand the attention mask for SDPA. + # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len] + if self.config.is_decoder: + extended_attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + input_shape, + embedding_output, + past_key_values_length, + ) + else: + extended_attention_mask = _prepare_4d_attention_mask_for_sdpa( + attention_mask, embedding_output.dtype, tgt_len=seq_length + ) + else: + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + + if use_sdpa_attention_masks and encoder_attention_mask.dim() == 2: + # Expand the attention mask for SDPA. 
+ # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len] + encoder_extended_attention_mask = _prepare_4d_attention_mask_for_sdpa( + encoder_attention_mask, embedding_output.dtype, tgt_len=seq_length + ) + else: + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) diff --git a/examples/modular-transformers/modeling_super.py b/examples/modular-transformers/modeling_super.py index d91bdb1820c2..7df04bcc2a99 100644 --- a/examples/modular-transformers/modeling_super.py +++ b/examples/modular-transformers/modeling_super.py @@ -1,26 +1,24 @@ -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 -# This file 
was automatically generated from . -# Do NOT edit this file manually as any edits will be overwritten by the generation of -# the file from the diff. If any change should be done, please apply the change to the -# diff.py file directly. One of our CI enforces this -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from examples/modular-transformers/modular_super.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_super.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 import math from typing import List, Optional, Tuple, Union import torch import torch.nn.functional as F -import torch.utils.checkpoint from torch import nn from ...activations import ACT2FN from ...cache_utils import Cache, StaticCache from ...modeling_attn_mask_utils import AttentionMaskConverter -from ...modeling_flash_attention_utils import _flash_attention_forward -from ...modeling_outputs import ( - BaseModelOutputWithPast, -) +from ...modeling_flash_attention_utils import FlashAttentionKwargs, _flash_attention_forward +from ...modeling_outputs import BaseModelOutputWithPast from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS from ...modeling_utils import PreTrainedModel +from ...processing_utils import Unpack from ...utils import ( add_start_docstrings, add_start_docstrings_to_model_forward, @@ -33,59 +31,6 @@ logger = logging.get_logger(__name__) -def _prepare_4d_causal_attention_mask_with_cache_position( - attention_mask: torch.Tensor, - sequence_length: int, - target_length: int, - dtype: torch.dtype, - device: torch.device, - min_dtype: float, - cache_position: torch.Tensor, - batch_size: int, -): - """ - Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape 
- `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. - - Args: - attention_mask (`torch.Tensor`): - A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`. - sequence_length (`int`): - The sequence length being processed. - target_length (`int`): - The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet. - dtype (`torch.dtype`): - The dtype to use for the 4D attention mask. - device (`torch.device`): - The device to plcae the 4D attention mask on. - min_dtype (`float`): - The minimum value representable with the dtype `dtype`. - cache_position (`torch.Tensor`): - Indices depicting the position of the input sequence tokens in the sequence. - batch_size (`torch.Tensor`): - Batch size. - """ - if attention_mask is not None and attention_mask.dim() == 4: - # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. 
- causal_mask = attention_mask - else: - causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device) - if sequence_length != 1: - causal_mask = torch.triu(causal_mask, diagonal=1) - causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) - causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) - if attention_mask is not None: - causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit - mask_length = attention_mask.shape[-1] - padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] - padding_mask = padding_mask == 0 - causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( - padding_mask, min_dtype - ) - - return causal_mask - - class SuperRMSNorm(nn.Module): def __init__(self, hidden_size, eps=1e-6): """ @@ -123,7 +68,7 @@ def __init__( if config is None: logger.warning_once( "`SuperRotaryEmbedding` can now be fully parameterized by passing the model config through the " - "`config` argument. All other arguments will be removed in v4.45" + "`config` argument. All other arguments will be removed in v4.46" ) self.rope_kwargs = { "rope_type": rope_type, @@ -193,40 +138,6 @@ def forward(self, x, position_ids): return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`, *optional*): - Deprecated and unused. 
- unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. - """ - cos = cos.unsqueeze(unsqueeze_dim) - sin = sin.unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - class SuperMLP(nn.Module): def __init__(self, config): super().__init__() @@ -261,6 +172,40 @@ def forward(self, x): return down_proj +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. 
For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: """ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, @@ -302,7 +247,7 @@ def __init__(self, config: SuperConfig, layer_idx: Optional[int] = None): self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias) - # TODO (joao): remove in v4.45 (RoPE is computed in the model, not in the decoder layers) + # TODO (joao): remove in v4.46 (RoPE is computed in the model, not in the decoder layers) self.rotary_emb = SuperRotaryEmbedding(config=self.config) def forward( @@ -314,7 +259,7 @@ def forward( output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45 + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46 **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() @@ -349,7 +294,7 @@ def 
forward( logger.warning_once( "The attention layers in this model are transitioning from computing the RoPE embeddings internally " "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " - "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be " + "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be " "removed and `position_embeddings` will be mandatory." ) cos, sin = self.rotary_emb(value_states, position_ids) @@ -422,7 +367,8 @@ def forward( output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45 + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46 + **kwargs: Unpack[FlashAttentionKwargs], ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: if isinstance(past_key_value, StaticCache): raise ValueError( @@ -449,7 +395,7 @@ def forward( logger.warning_once( "The attention layers in this model are transitioning from computing the RoPE embeddings internally " "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " - "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be " + "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be " "removed and `position_embeddings` will be mandatory." 
) cos, sin = self.rotary_emb(value_states, position_ids) @@ -507,6 +453,7 @@ def forward( sliding_window=getattr(self, "sliding_window", None), use_top_left_mask=self._flash_attn_uses_top_left_mask, is_causal=self.is_causal, + **kwargs, ) attn_output = attn_output.reshape(bsz, q_len, -1).contiguous() @@ -535,7 +482,7 @@ def forward( output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45 + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46 **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: if output_attentions: @@ -569,7 +516,7 @@ def forward( logger.warning_once( "The attention layers in this model are transitioning from computing the RoPE embeddings internally " "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " - "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be " + "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be " "removed and `position_embeddings` will be mandatory." ) cos, sin = self.rotary_emb(value_states, position_ids) @@ -644,7 +591,7 @@ def forward( output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45 + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46 **kwargs, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: """ @@ -790,7 +737,8 @@ def _init_weights(self, module): returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. 
Two formats are allowed: - - a [`~cache_utils.Cache`] instance; + - a [`~cache_utils.Cache`] instance, see our + [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache); - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy cache format. @@ -916,10 +864,9 @@ def _update_causal_mask( return None dtype, device = input_tensor.dtype, input_tensor.device - min_dtype = torch.finfo(dtype).min sequence_length = input_tensor.shape[1] if using_static_cache: - target_length = past_key_values.get_max_length() + target_length = past_key_values.get_max_cache_shape() else: target_length = ( attention_mask.shape[-1] @@ -928,13 +875,12 @@ def _update_causal_mask( ) # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). - causal_mask = _prepare_4d_causal_attention_mask_with_cache_position( + causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( attention_mask, sequence_length=sequence_length, target_length=target_length, dtype=dtype, device=device, - min_dtype=min_dtype, cache_position=cache_position, batch_size=input_tensor.shape[0], ) @@ -948,6 +894,63 @@ def _update_causal_mask( # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. 
# Details: https://github.com/pytorch/pytorch/issues/110213 + min_dtype = torch.finfo(dtype).min causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) return causal_mask + + @staticmethod + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + device: torch.device, + cache_position: torch.Tensor, + batch_size: int, + **kwargs, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, + to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + device (`torch.device`): + The device to plcae the 4D attention mask on. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. 
+ causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask diff --git a/examples/modular-transformers/modular_roberta.py b/examples/modular-transformers/modular_roberta.py index a3e0218f9320..13dca4845c13 100644 --- a/examples/modular-transformers/modular_roberta.py +++ b/examples/modular-transformers/modular_roberta.py @@ -13,8 +13,5 @@ def __init__(self, config): class RobertaModel(BertModel): - def __init__(self, config): + def __init__(self, config, add_pooling_layer=True): super().__init__(self, config) - # Error out here. Why? Because `RobertaEmbeddings` is defined but not used. - # no, because it's defined, and RobertaModel should use RobertaEmbedding - # here if initialized that way it won't use the new embedding. 
diff --git a/examples/research_projects/decision_transformer/requirements.txt b/examples/research_projects/decision_transformer/requirements.txt index a54f3d03cab2..6d42c3256a83 100644 --- a/examples/research_projects/decision_transformer/requirements.txt +++ b/examples/research_projects/decision_transformer/requirements.txt @@ -233,7 +233,7 @@ urllib3==1.26.19 wasabi==0.9.0 wcwidth==0.2.5 websocket-client==1.3.1 -Werkzeug==3.0.3 +Werkzeug==3.0.6 wrapt==1.14.0 xxhash==3.0.0 yarl==1.7.2 diff --git a/i18n/README_ko.md b/i18n/README_ko.md index e2a9b80d0d3e..b9502db5dda8 100644 --- a/i18n/README_ko.md +++ b/i18n/README_ko.md @@ -15,10 +15,15 @@ limitations under the License. -->

-
- -
+ + + + Hugging Face Transformers Library + +
+

+

Build GitHub @@ -45,19 +50,25 @@ limitations under the License. Tiếng Việt | العربية | اردو | -

-

Jax, Pytorch, TensorFlow를 위한 최첨단 자연어처리

+

Jax, Pytorch, TensorFlow를 위한 최첨단 머신러닝

-🤗 Transformers는 분류, 정보 추출, 질문 답변, 요약, 번역, 문장 생성 등을 100개 이상의 언어로 수행할 수 있는 수천개의 사전학습된 모델을 제공합니다. 우리의 목표는 모두가 최첨단의 NLP 기술을 쉽게 사용하는 것입니다. +🤗 Transformers는 텍스트, 비전, 오디오와 같은 다양한 분야에서 여러 과제를 수행하는 수천 개의 사전 학습된 모델을 제공합니다. + +제공되는 모델을 통해 다음 과제를 수행할 수 있습니다. +- 📝 텍스트: 100개 이상의 언어들로, 텍스트 분류, 정보 추출, 질문 답변, 요약, 번역 및 문장 생성 +- 🖼️ 이미지: 이미지 분류(Image Classification), 객체 탐지(Object Detection) 및 분할(Segmentation) +- 🗣️ 오디오: 음성 인식(Speech Recognition) 및 오디오 분류(Audio Classification) + +Transformer의 모델은 표를 통한 질의응답(Table QA), 광학 문자 인식(Optical Character Recognition), 스캔 한 문서에서 정보 추출, 비디오 분류 및 시각적 질의응답과 같은 **여러 분야가 결합된** 과제 또한 수행할 수 있습니다. 🤗 Transformers는 이러한 사전학습 모델을 빠르게 다운로드해 특정 텍스트에 사용하고, 원하는 데이터로 fine-tuning해 커뮤니티나 우리의 [모델 허브](https://huggingface.co/models)에 공유할 수 있도록 API를 제공합니다. 또한, 모델 구조를 정의하는 각 파이썬 모듈은 완전히 독립적이여서 연구 실험을 위해 손쉽게 수정할 수 있습니다. @@ -65,9 +76,11 @@ limitations under the License. ## 온라인 데모 -대부분의 모델을 [모델 허브](https://huggingface.co/models) 페이지에서 바로 테스트해볼 수 있습니다. 공개 및 비공개 모델을 위한 [비공개 모델 호스팅, 버전 관리, 추론 API](https://huggingface.co/pricing)도 제공합니다. +대부분의 모델을 [모델 허브](https://huggingface.co/models) 페이지에서 바로 테스트해 볼 수 있습니다. 공개 및 비공개 모델을 위한 [비공개 모델 호스팅, 버전 관리, 추론 API](https://huggingface.co/pricing)도 제공합니다. -예시: +아래 몇 가지 예시가 있습니다: + +자연어 처리: - [BERT로 마스킹된 단어 완성하기](https://huggingface.co/google-bert/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France) - [Electra를 이용한 개체명 인식](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city) - [GPT-2로 텍스트 생성하기](https://huggingface.co/openai-community/gpt2?text=A+long+time+ago%2C+) @@ -76,45 +89,100 @@ limitations under the License. 
- [DistilBERT를 이용한 질문 답변](https://huggingface.co/distilbert/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species) - [T5로 번역하기](https://huggingface.co/google-t5/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin) +컴퓨터 비전: +- [ViT와 함께하는 이미지 분류](https://huggingface.co/google/vit-base-patch16-224) +- [DETR로 객체 탐지하기](https://huggingface.co/facebook/detr-resnet-50) +- [SegFormer로 의미적 분할(semantic segmentation)하기](https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512) +- [Mask2Former로 판옵틱 분할(panoptic segmentation)하기](https://huggingface.co/facebook/mask2former-swin-large-coco-panoptic) +- [Depth Anything으로 깊이 추정(depth estimation)하기](https://huggingface.co/docs/transformers/main/model_doc/depth_anything) +- [VideoMAE와 함께하는 비디오 
분류](https://huggingface.co/docs/transformers/model_doc/videomae) +- [OneFormer로 유니버설 분할(universal segmentation)하기](https://huggingface.co/shi-labs/oneformer_ade20k_dinat_large) + +오디오: +- [Whisper와 함께하는 자동 음성 인식](https://huggingface.co/openai/whisper-large-v3) +- [Wav2Vec2로 키워드 검출(keyword spotting)하기](https://huggingface.co/superb/wav2vec2-base-superb-ks) +- [Audio Spectrogram Transformer로 오디오 분류하기](https://huggingface.co/MIT/ast-finetuned-audioset-10-10-0.4593) + +멀티 모달(Multimodal Task): +- [TAPAS로 표 안에서 질문 답변하기](https://huggingface.co/google/tapas-base-finetuned-wtq) +- [ViLT와 함께하는 시각적 질의응답](https://huggingface.co/dandelin/vilt-b32-finetuned-vqa) +- [LLaVa로 이미지에 설명 넣기](https://huggingface.co/llava-hf/llava-1.5-7b-hf) +- [SigLIP와 함께하는 제로 샷(zero-shot) 이미지 분류](https://huggingface.co/google/siglip-so400m-patch14-384) +- [LayoutLM으로 문서 안에서 질문 답변하기](https://huggingface.co/impira/layoutlm-document-qa) +- [X-CLIP과 함께하는 제로 샷(zero-shot) 비디오 분류](https://huggingface.co/docs/transformers/model_doc/xclip) +- [OWLv2로 진행하는 제로 샷(zero-shot) 객체 탐지](https://huggingface.co/docs/transformers/en/model_doc/owlv2) +- [CLIPSeg로 진행하는 제로 샷(zero-shot) 이미지 분할](https://huggingface.co/docs/transformers/model_doc/clipseg) +- [SAM과 함께하는 자동 마스크 생성](https://huggingface.co/docs/transformers/model_doc/sam) + **[Transformer와 글쓰기](https://transformer.huggingface.co)** 는 이 저장소의 텍스트 생성 능력에 관한 Hugging Face 팀의 공식 데모입니다. -## Hugging Face 팀의 커스텀 지원을 원한다면 +## Transformers를 사용한 100개의 프로젝트 + +Transformers는 사전 학습된 모델들을 이용하는 도구를 넘어 Transformers와 함께 빌드 된 프로젝트 및 Hugging Face Hub를 위한 하나의 커뮤니티입니다. 우리는 Transformers를 통해 개발자, 연구자, 학생, 교수, 엔지니어 및 모든 이들이 꿈을 품은 프로젝트(Dream Project)를 빌드 할 수 있길 바랍니다. - - HuggingFace Expert Acceleration Program +Transformers에 달린 100,000개의 별을 축하하기 위해, 우리는 커뮤니티를 주목하고자 Transformers를 품고 빌드 된 100개의 어마어마한 프로젝트들을 선별하여 [awesome-transformers](https://github.com/huggingface/transformers/blob/main/awesome-transformers.md) 페이지에 나열하였습니다. + +만일 소유한 혹은 사용하고 계신 프로젝트가 이 리스트에 꼭 등재되어야 한다고 믿으신다면, PR을 열고 추가하여 주세요! 
+ +## 조직 안에서 AI 사용에 대해 진지하게 고민 중이신가요? Hugging Face Enterprise Hub을 통해 더 빨리 구축해 보세요. + + + Hugging Face Enterprise Hub
## 퀵 투어 -원하는 텍스트에 바로 모델을 사용할 수 있도록, 우리는 `pipeline` API를 제공합니다. Pipeline은 사전학습 모델과 그 모델을 학습할 때 적용한 전처리 방식을 하나로 합칩니다. 다음은 긍정적인 텍스트와 부정적인 텍스트를 분류하기 위해 pipeline을 사용한 간단한 예시입니다: +주어진 입력(텍스트, 이미지, 오디오, ...)에 바로 모델을 사용할 수 있도록, 우리는 `pipeline` API를 제공합니다. Pipeline은 사전학습 모델과 그 모델을 학습할 때 적용한 전처리 방식을 하나로 합칩니다. 다음은 긍정적인 텍스트와 부정적인 텍스트를 분류하기 위해 pipeline을 사용한 간단한 예시입니다: ```python >>> from transformers import pipeline -# Allocate a pipeline for sentiment-analysis +# 감정 분석 파이프라인을 할당하세요 >>> classifier = pipeline('sentiment-analysis') >>> classifier('We are very happy to introduce pipeline to the transformers repository.') [{'label': 'POSITIVE', 'score': 0.9996980428695679}] ``` -코드의 두번째 줄은 pipeline이 사용하는 사전학습 모델을 다운로드하고 캐시로 저장합니다. 세번째 줄에선 그 모델이 주어진 텍스트를 평가합니다. 여기서 모델은 99.97%의 확률로 텍스트가 긍정적이라고 평가했습니다. +코드의 두 번째 줄은 pipeline이 사용하는 사전학습 모델을 다운로드하고 캐시로 저장합니다. 세 번째 줄에선 그 모델이 주어진 텍스트를 평가합니다. 여기서 모델은 99.97%의 확률로 텍스트가 긍정적이라고 평가했습니다. -많은 NLP 과제들을 `pipeline`으로 바로 수행할 수 있습니다. 예를 들어, 질문과 문맥이 주어지면 손쉽게 답변을 추출할 수 있습니다: +자연어 처리(NLP) 뿐만 아니라 컴퓨터 비전, 발화(Speech) 과제들을 사전 학습된 `pipeline`으로 바로 수행할 수 있습니다. 예를 들어, 사진에서 손쉽게 객체들을 탐지할 수 있습니다.: ``` python +>>> import requests +>>> from PIL import Image >>> from transformers import pipeline -# Allocate a pipeline for question-answering ->>> question_answerer = pipeline('question-answering') ->>> question_answerer({ -... 'question': 'What is the name of the repository ?', -... 'context': 'Pipeline has been included in the huggingface/transformers repository' -... 
}) -{'score': 0.30970096588134766, 'start': 34, 'end': 58, 'answer': 'huggingface/transformers'} - +# 귀여운 고양이가 있는 이미지를 다운로드하세요 +>>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png" +>>> image_data = requests.get(url, stream=True).raw +>>> image = Image.open(image_data) + +# 객체 감지를 위한 파이프라인을 할당하세요 +>>> object_detector = pipeline('object-detection') +>>> object_detector(image) +[{'score': 0.9982201457023621, + 'label': 'remote', + 'box': {'xmin': 40, 'ymin': 70, 'xmax': 175, 'ymax': 117}}, + {'score': 0.9960021376609802, + 'label': 'remote', + 'box': {'xmin': 333, 'ymin': 72, 'xmax': 368, 'ymax': 187}}, + {'score': 0.9954745173454285, + 'label': 'couch', + 'box': {'xmin': 0, 'ymin': 1, 'xmax': 639, 'ymax': 473}}, + {'score': 0.9988006353378296, + 'label': 'cat', + 'box': {'xmin': 13, 'ymin': 52, 'xmax': 314, 'ymax': 470}}, + {'score': 0.9986783862113953, + 'label': 'cat', + 'box': {'xmin': 345, 'ymin': 23, 'xmax': 640, 'ymax': 368}}] ``` +위와 같이, 우리는 이미지에서 탐지된 객체들에 대하여 객체를 감싸는 박스와 확률 리스트를 얻을 수 있습니다. 왼쪽이 원본 이미지이며 오른쪽은 해당 이미지에 탐지된 결과를 표시하였습니다. +

+ + +

-답변뿐만 아니라, 여기에 사용된 사전학습 모델은 확신도와 토크나이즈된 문장 속 답변의 시작점, 끝점까지 반환합니다. [이 튜토리얼](https://huggingface.co/docs/transformers/task_summary)에서 `pipeline` API가 지원하는 다양한 과제를 확인할 수 있습니다. +[이 튜토리얼](https://huggingface.co/docs/transformers/ko/task_summary)에서 `pipeline` API가 지원하는 다양한 과제를 확인할 수 있습니다. 코드 3줄로 원하는 과제에 맞게 사전학습 모델을 다운로드 받고 사용할 수 있습니다. 다음은 PyTorch 버전입니다: ```python @@ -139,24 +207,24 @@ limitations under the License. 토크나이저는 사전학습 모델의 모든 전처리를 책임집니다. 그리고 (위의 예시처럼) 1개의 스트링이나 리스트도 처리할 수 있습니다. 토크나이저는 딕셔너리를 반환하는데, 이는 다운스트림 코드에 사용하거나 언패킹 연산자 ** 를 이용해 모델에 바로 전달할 수도 있습니다. -모델 자체는 일반적으로 사용되는 [Pytorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)나 [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model)입니다. [이 튜토리얼](https://huggingface.co/transformers/training.html)은 이러한 모델을 표준적인 PyTorch나 TensorFlow 학습 과정에서 사용하는 방법, 또는 새로운 데이터로 fine-tune하기 위해 `Trainer` API를 사용하는 방법을 설명해줍니다. +모델 자체는 일반적으로 사용되는 [Pytorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)이나 [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model)입니다. [이 튜토리얼](https://huggingface.co/docs/transformers/ko/training)은 이러한 모델을 표준적인 PyTorch나 TensorFlow 학습 과정에서 사용하는 방법, 또는 새로운 데이터로 파인 튜닝하기 위해 `Trainer` API를 사용하는 방법을 설명해 줍니다. ## 왜 transformers를 사용해야 할까요? 1. 손쉽게 사용할 수 있는 최첨단 모델: - - NLU와 NLG 과제에서 뛰어난 성능을 보입니다. - - 교육자 실무자에게 진입 장벽이 낮습니다. + - 자연어 이해(NLU)와 생성(NLG), 컴퓨터 비전, 오디오 과제에서 뛰어난 성능을 보입니다. + - 교육자와 실무자에게 진입 장벽이 낮습니다. - 3개의 클래스만 배우면 바로 사용할 수 있습니다. - 하나의 API로 모든 사전학습 모델을 사용할 수 있습니다. 1. 더 적은 계산 비용, 더 적은 탄소 발자국: - 연구자들은 모델을 계속 다시 학습시키는 대신 학습된 모델을 공유할 수 있습니다. - 실무자들은 학습에 필요한 시간과 비용을 절약할 수 있습니다. - - 수십개의 모델 구조, 2,000개 이상의 사전학습 모델, 100개 이상의 언어로 학습된 모델 등. + - 모든 분야를 통틀어서 400,000개 이상의 사전 학습된 모델이 있는 수십 개의 아키텍처. 1. 모델의 각 생애주기에 적합한 프레임워크: - 코드 3줄로 최첨단 모델을 학습하세요. - - 자유롭게 모델을 TF2.0나 PyTorch 프레임워크로 변환하세요. + - 목적에 알맞게 모델을 TF2.0/Pytorch/Jax 프레임 워크 중 하나로 이동시키세요. - 학습, 평가, 공개 등 각 단계에 맞는 프레임워크를 원하는대로 선택하세요. 1. 
필요한 대로 모델이나 예시를 커스터마이즈하세요: @@ -167,14 +235,14 @@ limitations under the License. ## 왜 transformers를 사용하지 말아야 할까요? - 이 라이브러리는 신경망 블록을 만들기 위한 모듈이 아닙니다. 연구자들이 여러 파일을 살펴보지 않고 바로 각 모델을 사용할 수 있도록, 모델 파일 코드의 추상화 수준을 적정하게 유지했습니다. -- 학습 API는 모든 모델에 적용할 수 있도록 만들어지진 않았지만, 라이브러리가 제공하는 모델들에 적용할 수 있도록 최적화되었습니다. 일반적인 머신 러닝을 위해선, 다른 라이브러리를 사용하세요. +- 학습 API는 모든 모델에 적용할 수 있도록 만들어지진 않았지만, 라이브러리가 제공하는 모델들에 적용할 수 있도록 최적화되었습니다. 일반적인 머신 러닝을 위해선, 다른 라이브러리를 사용하세요(예를 들면, [Accelerate](https://huggingface.co/docs/accelerate/index)). - 가능한 많은 사용 예시를 보여드리고 싶어서, [예시 폴더](https://github.com/huggingface/transformers/tree/main/examples)의 스크립트를 준비했습니다. 이 스크립트들을 수정 없이 특정한 문제에 바로 적용하지 못할 수 있습니다. 필요에 맞게 일부 코드를 수정해야 할 수 있습니다. ## 설치 ### pip로 설치하기 -이 저장소는 Python 3.8+, Flax 0.4.1+, PyTorch 1.11+, TensorFlow 2.6+에서 테스트 되었습니다. +이 저장소는 Python 3.9+, Flax 0.4.1+, PyTorch 1.11+, TensorFlow 2.6+에서 테스트 되었습니다. [가상 환경](https://docs.python.org/3/library/venv.html)에 🤗 Transformers를 설치하세요. Python 가상 환경에 익숙하지 않다면, [사용자 가이드](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/)를 확인하세요. @@ -189,7 +257,7 @@ limitations under the License. pip install transformers ``` -예시들을 체험해보고 싶거나, 최최최첨단 코드를 원하거나, 새로운 버전이 나올 때까지 기다릴 수 없다면 [라이브러리를 소스에서 바로 설치](https://huggingface.co/docs/transformers/installation#installing-from-source)하셔야 합니다. +예시들을 체험해보고 싶거나, 최최최첨단 코드를 원하거나, 새로운 버전이 나올 때까지 기다릴 수 없다면 [라이브러리를 소스에서 바로 설치](https://huggingface.co/docs/transformers/ko/installation#install-from-source)하셔야 합니다. ### conda로 설치하기 @@ -203,29 +271,30 @@ conda install conda-forge::transformers Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 방법을 확인하세요. +> **_노트:_** 윈도우 환경에서 캐싱의 이점을 위해 개발자 모드를 활성화할 수 있습니다. 만약 여러분에게 있어서 선택이 아닌 필수라면 [이 이슈](https://github.com/huggingface/huggingface_hub/issues/1062)를 통해 알려주세요. + ## 모델 구조 -**🤗 Transformers가 제공하는 [모든 모델 체크포인트](https://huggingface.co/models)** 는 huggingface.co [모델 허브](https://huggingface.co)에 완벽히 연동되어 있습니다. 
[개인](https://huggingface.co/users)과 [기관](https://huggingface.co/organizations)이 모델 허브에 직접 업로드할 수 있습니다. +**🤗 Transformers가 제공하는 [모든 모델 체크포인트](https://huggingface.co/models)** 는 huggingface.co [모델 허브](https://huggingface.co/models)에 완벽히 연동되어 있습니다. [개인](https://huggingface.co/users)과 [기관](https://huggingface.co/organizations)이 모델 허브에 직접 업로드할 수 있습니다. 현재 사용 가능한 모델 체크포인트의 개수: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen) -🤗 Transformers는 다음 모델들을 제공합니다: 각 모델의 요약은 [여기](https://huggingface.co/docs/transformers/model_summary)서 확인하세요. +🤗 Transformers는 다음 모델들을 제공합니다: 각 모델의 요약은 [여기](https://huggingface.co/docs/transformers/ko/model_summary)서 확인하세요. -각 모델이 Flax, PyTorch, TensorFlow으로 구현되었는지 또는 🤗 Tokenizers 라이브러리가 지원하는 토크나이저를 사용하는지 확인하려면, [이 표](https://huggingface.co/docs/transformers/index#supported-frameworks)를 확인하세요. +각 모델이 Flax, PyTorch, TensorFlow으로 구현되었는지 또는 🤗 Tokenizers 라이브러리가 지원하는 토크나이저를 사용하는지 확인하려면, [이 표](https://huggingface.co/docs/transformers/ko/index#supported-framework)를 확인하세요. -이 구현은 여러 데이터로 검증되었고 (예시 스크립트를 참고하세요) 오리지널 구현의 성능과 같아야 합니다. [도큐먼트](https://huggingface.co/docs/transformers/examples)의 Examples 섹션에서 성능에 대한 자세한 설명을 확인할 수 있습니다. +이 구현은 여러 데이터로 검증되었고 (예시 스크립트를 참고하세요) 오리지널 구현의 성능과 같아야 합니다. [도큐먼트](https://github.com/huggingface/transformers/tree/main/examples)의 Examples 섹션에서 성능에 대한 자세한 설명을 확인할 수 있습니다. 
## 더 알아보기 | 섹션 | 설명 | |-|-| -| [도큐먼트](https://huggingface.co/transformers/) | 전체 API 도큐먼트와 튜토리얼 | -| [과제 요약](https://huggingface.co/docs/transformers/task_summary) | 🤗 Transformers가 지원하는 과제들 | -| [전처리 튜토리얼](https://huggingface.co/docs/transformers/preprocessing) | `Tokenizer` 클래스를 이용해 모델을 위한 데이터 준비하기 | -| [학습과 fine-tuning](https://huggingface.co/docs/transformers/training) | 🤗 Transformers가 제공하는 모델 PyTorch/TensorFlow 학습 과정과 `Trainer` API에서 사용하기 | -| [퀵 투어: Fine-tuning/사용 스크립트](https://github.com/huggingface/transformers/tree/main/examples) | 다양한 과제에서 모델 fine-tuning하는 예시 스크립트 | -| [모델 공유 및 업로드](https://huggingface.co/docs/transformers/model_sharing) | 커뮤니티에 fine-tune된 모델을 업로드 및 공유하기 | -| [마이그레이션](https://huggingface.co/docs/transformers/migration) | `pytorch-transformers`나 `pytorch-pretrained-bert`에서 🤗 Transformers로 이동하기| +| [도큐먼트](https://huggingface.co/transformers/ko/) | 전체 API 도큐먼트와 튜토리얼 | +| [과제 요약](https://huggingface.co/docs/transformers/ko/task_summary) | 🤗 Transformers가 지원하는 과제들 | +| [전처리 튜토리얼](https://huggingface.co/docs/transformers/ko/preprocessing) | `Tokenizer` 클래스를 이용해 모델을 위한 데이터 준비하기 | +| [학습과 파인 튜닝](https://huggingface.co/docs/transformers/ko/training) | 🤗 Transformers가 제공하는 모델 PyTorch/TensorFlow 학습 과정과 `Trainer` API에서 사용하기 | +| [퀵 투어: 파인 튜닝/사용 스크립트](https://github.com/huggingface/transformers/tree/main/examples) | 다양한 과제에서 모델을 파인 튜닝하는 예시 스크립트 | +| [모델 공유 및 업로드](https://huggingface.co/docs/transformers/ko/model_sharing) | 커뮤니티에 파인 튜닝된 모델을 업로드 및 공유하기 | ## 인용 diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index cc8b07395024..47b43e0b9089 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -868,6 +868,7 @@ "ImageClassificationPipeline", "ImageFeatureExtractionPipeline", "ImageSegmentationPipeline", + "ImageTextToTextPipeline", "ImageToImagePipeline", "ImageToTextPipeline", "JsonPipelineDataFormat", @@ -1228,7 +1229,7 @@ _import_structure["models.poolformer"].extend(["PoolFormerFeatureExtractor", 
"PoolFormerImageProcessor"]) _import_structure["models.pvt"].extend(["PvtImageProcessor"]) _import_structure["models.qwen2_vl"].extend(["Qwen2VLImageProcessor"]) - _import_structure["models.rt_detr"].extend(["RTDetrImageProcessor"]) + _import_structure["models.rt_detr"].extend(["RTDetrImageProcessor", "RTDetrImageProcessorFast"]) _import_structure["models.sam"].extend(["SamImageProcessor"]) _import_structure["models.segformer"].extend(["SegformerFeatureExtractor", "SegformerImageProcessor"]) _import_structure["models.seggpt"].extend(["SegGptImageProcessor"]) @@ -5794,6 +5795,7 @@ ImageClassificationPipeline, ImageFeatureExtractionPipeline, ImageSegmentationPipeline, + ImageTextToTextPipeline, ImageToImagePipeline, ImageToTextPipeline, JsonPipelineDataFormat, @@ -6152,7 +6154,7 @@ ) from .models.pvt import PvtImageProcessor from .models.qwen2_vl import Qwen2VLImageProcessor - from .models.rt_detr import RTDetrImageProcessor + from .models.rt_detr import RTDetrImageProcessor, RTDetrImageProcessorFast from .models.sam import SamImageProcessor from .models.segformer import SegformerFeatureExtractor, SegformerImageProcessor from .models.seggpt import SegGptImageProcessor diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py index cc80f6a19bfb..9e75e6fd3c38 100644 --- a/src/transformers/data/data_collator.py +++ b/src/transformers/data/data_collator.py @@ -443,7 +443,7 @@ def _torch_collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] return torch.stack(examples, dim=0) # If yes, check if we have a `pad_token`. - if tokenizer._pad_token is None: + if tokenizer.pad_token is None: raise ValueError( "You are attempting to pad samples but the tokenizer you are using" f" ({tokenizer.__class__.__name__}) does not have a pad token." @@ -477,7 +477,7 @@ def _tf_collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = N return tf.stack(examples, axis=0) # If yes, check if we have a `pad_token`. 
- if tokenizer._pad_token is None: + if tokenizer.pad_token is None: raise ValueError( "You are attempting to pad samples but the tokenizer you are using" f" ({tokenizer.__class__.__name__}) does not have a pad token." @@ -513,7 +513,7 @@ def _numpy_collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] return np.stack(examples, axis=0) # If yes, check if we have a `pad_token`. - if tokenizer._pad_token is None: + if tokenizer.pad_token is None: raise ValueError( "You are attempting to pad samples but the tokenizer you are using" f" ({tokenizer.__class__.__name__}) does not have a pad token." @@ -1090,7 +1090,7 @@ def torch_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]: self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() ] probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) - if self.tokenizer._pad_token is not None: + if self.tokenizer.pad_token is not None: padding_mask = labels.eq(self.tokenizer.pad_token_id) probability_matrix.masked_fill_(padding_mask, value=0.0) @@ -1131,7 +1131,7 @@ def tf_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]: self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels ] masked_indices = masked_indices & ~tf.cast(special_tokens_mask, dtype=tf.bool) - if self.tokenizer._pad_token is not None: + if self.tokenizer.pad_token is not None: padding_mask = inputs == self.tokenizer.pad_token_id masked_indices = masked_indices & ~padding_mask @@ -1170,7 +1170,7 @@ def numpy_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]: self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() ] masked_indices[np.array(special_tokens_mask, dtype=bool)] = 0 - if self.tokenizer._pad_token is not None: + if self.tokenizer.pad_token is not None: padding_mask = labels == self.tokenizer.pad_token_id 
masked_indices[padding_mask] = 0 @@ -1251,13 +1251,13 @@ def mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any]: self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() ] probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) - if self.tokenizer._pad_token is not None: + if self.tokenizer.pad_token is not None: padding_mask = labels.eq(self.tokenizer.pad_token_id) probability_matrix.masked_fill_(padding_mask, value=0.0) masked_indices = torch.bernoulli(probability_matrix).bool() # probability be `1` (masked), however in albert model attention mask `0` means masked, revert the value attention_mask = (~masked_indices).float() - if self.tokenizer._pad_token is not None: + if self.tokenizer.pad_token is not None: attention_padding_mask = labels.eq(self.tokenizer.pad_token_id) attention_mask.masked_fill_(attention_padding_mask, value=1.0) labels[~masked_indices] = -100 # We only compute loss on masked tokens, -100 is default for CE compute @@ -1367,7 +1367,7 @@ def torch_mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any, Any]: dtype=torch.bool, ) masked_indices.masked_fill_(special_tokens_mask, value=0.0) - if self.tokenizer._pad_token is not None: + if self.tokenizer.pad_token is not None: padding_mask = labels.eq(self.tokenizer.pad_token_id) masked_indices.masked_fill_(padding_mask, value=0.0) @@ -1471,7 +1471,7 @@ def tf_mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any, Any]: ) special_tokens_mask = tf.cast(special_tokens_mask, dtype=tf.bool) masked_indices = masked_indices & ~special_tokens_mask - if self.tokenizer._pad_token is not None: + if self.tokenizer.pad_token is not None: padding_mask = labels == self.tokenizer.pad_token_id masked_indices = masked_indices & ~padding_mask @@ -1571,7 +1571,7 @@ def numpy_mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any, Any]: dtype=bool, ) masked_indices[special_tokens_mask] = 0 - if self.tokenizer._pad_token is not 
None: + if self.tokenizer.pad_token is not None: padding_mask = labels == self.tokenizer.pad_token_id masked_indices[padding_mask] = 0.0 diff --git a/src/transformers/dynamic_module_utils.py b/src/transformers/dynamic_module_utils.py index 4e0e1dd34302..bf44d4b427cf 100644 --- a/src/transformers/dynamic_module_utils.py +++ b/src/transformers/dynamic_module_utils.py @@ -152,7 +152,8 @@ def get_imports(filename: Union[str, os.PathLike]) -> List[str]: content = f.read() # filter out try/except block so in custom code we can have try/except imports - content = re.sub(r"\s*try\s*:\s*.*?\s*except\s*.*?:", "", content, flags=re.MULTILINE | re.DOTALL) + content = re.sub(r"\s*try\s*:.*?except.*?:", "", content, flags=re.DOTALL) + # filter out imports under is_flash_attn_2_available block for avoid import issues in cpu only environment content = re.sub( r"if is_flash_attn[a-zA-Z0-9_]+available\(\):\s*(from flash_attn\s*.*\s*)+", "", content, flags=re.MULTILINE diff --git a/src/transformers/generation/flax_utils.py b/src/transformers/generation/flax_utils.py index 08480ac983e8..88535b44e9c4 100644 --- a/src/transformers/generation/flax_utils.py +++ b/src/transformers/generation/flax_utils.py @@ -397,6 +397,8 @@ def generate( "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)" ) generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length + else: # by default let's always generate 10 new tokens + generation_config.max_length = generation_config.max_length + input_ids_seq_length if generation_config.min_length is not None and generation_config.min_length > generation_config.max_length: raise ValueError( diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index efe953db051c..53cd2df3a49c 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -378,10 +378,14 @@ def prepare_inputs_for_generation( # If we have cache: let's slice `input_ids` through 
`cache_position`, to keep only the unprocessed tokens # Exception 1: when passing input_embeds, input_ids may be missing entries # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here - # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case + # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case. + # (we can't check exception 3 while compiling) if past_key_values is not None: model_inputs["past_key_values"] = past_key_values - if inputs_embeds is not None or cache_position[-1] >= input_ids.shape[1]: # Exception 1 or Exception 3 + if ( + inputs_embeds is not None # Exception 1 + or (is_torchdynamo_compiling() or cache_position[-1] >= input_ids.shape[1]) # Exception 3 + ): input_ids = input_ids[:, -cache_position.shape[0] :] elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) input_ids = input_ids[:, cache_position] @@ -414,7 +418,7 @@ def prepare_inputs_for_generation( for model_input_name in ["position_ids", "token_type_ids"]: model_input = kwargs.get(model_input_name) if model_input is not None: - if past_key_values: + if past_key_values is not None: model_input = model_input[:, -input_ids.shape[1] :] model_input = model_input.clone(memory_format=torch.contiguous_format) model_inputs[model_input_name] = model_input @@ -568,27 +572,34 @@ def _maybe_initialize_input_ids_for_generation( def _prepare_attention_mask_for_generation( self, - inputs: torch.Tensor, - pad_token_id: Optional[torch.Tensor], - eos_token_id: Optional[torch.Tensor], + inputs_tensor: torch.Tensor, + generation_config: GenerationConfig, + model_kwargs: Dict[str, Any], ) -> torch.LongTensor: + pad_token_id = generation_config._pad_token_tensor + eos_token_id = generation_config._eos_token_tensor + + # `input_ids` may be present in the model kwargs, instead of being the main input (e.g. 
multimodal model) + if "input_ids" in model_kwargs and model_kwargs["input_ids"].shape[1] > 0: + inputs_tensor = model_kwargs["input_ids"] + # No information for attention mask inference -> return default attention mask - default_attention_mask = torch.ones(inputs.shape[:2], dtype=torch.long, device=inputs.device) + default_attention_mask = torch.ones(inputs_tensor.shape[:2], dtype=torch.long, device=inputs_tensor.device) if pad_token_id is None: return default_attention_mask - is_input_ids = len(inputs.shape) == 2 and inputs.dtype in [torch.int, torch.long] + is_input_ids = len(inputs_tensor.shape) == 2 and inputs_tensor.dtype in [torch.int, torch.long] if not is_input_ids: return default_attention_mask is_pad_token_in_inputs = (pad_token_id is not None) and ( - isin_mps_friendly(elements=inputs, test_elements=pad_token_id).any() + isin_mps_friendly(elements=inputs_tensor, test_elements=pad_token_id).any() ) is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ~( isin_mps_friendly(elements=eos_token_id, test_elements=pad_token_id).any() ) can_infer_attention_mask = is_pad_token_in_inputs * is_pad_token_not_equal_to_eos_token_id - attention_mask_from_padding = inputs.ne(pad_token_id).long() + attention_mask_from_padding = inputs_tensor.ne(pad_token_id).long() attention_mask = ( attention_mask_from_padding * can_infer_attention_mask + default_attention_mask * ~can_infer_attention_mask @@ -1441,10 +1452,11 @@ def _prepare_generated_length( ): generation_config.max_length -= inputs_tensor.shape[1] elif has_default_max_length: # by default let's always generate 20 new tokens - generation_config.max_length = generation_config.max_length + input_ids_length - max_position_embeddings = getattr(self.config, "max_position_embeddings", None) - if max_position_embeddings is not None: - generation_config.max_length = min(generation_config.max_length, max_position_embeddings) + if generation_config.max_length == GenerationConfig().max_length: + 
generation_config.max_length = generation_config.max_length + input_ids_length + max_position_embeddings = getattr(self.config, "max_position_embeddings", None) + if max_position_embeddings is not None: + generation_config.max_length = min(generation_config.max_length, max_position_embeddings) # same for min length if generation_config.min_new_tokens is not None: @@ -2020,7 +2032,7 @@ def generate( if not kwargs_has_attention_mask and requires_attention_mask and accepts_attention_mask: model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( - inputs_tensor, generation_config._pad_token_tensor, generation_config._eos_token_tensor + inputs_tensor, generation_config, model_kwargs ) elif kwargs_has_attention_mask: # TODO (joao): generalize this check with other types of inputs diff --git a/src/transformers/image_processing_utils_fast.py b/src/transformers/image_processing_utils_fast.py index d1a08132d73d..3c1be325b7eb 100644 --- a/src/transformers/image_processing_utils_fast.py +++ b/src/transformers/image_processing_utils_fast.py @@ -15,14 +15,18 @@ import functools from dataclasses import dataclass +from typing import Any, Iterable, List, Optional, Tuple from .image_processing_utils import BaseImageProcessor -from .utils.import_utils import is_torchvision_available +from .utils.import_utils import is_torch_available, is_torchvision_available if is_torchvision_available(): from torchvision.transforms import Compose +if is_torch_available(): + import torch + @dataclass(frozen=True) class SizeDict: @@ -66,3 +70,64 @@ def to_dict(self): encoder_dict = super().to_dict() encoder_dict.pop("_transform_params", None) return encoder_dict + + +def get_image_size_for_max_height_width( + image_size: Tuple[int, int], + max_height: int, + max_width: int, +) -> Tuple[int, int]: + """ + Computes the output image size given the input image and the maximum allowed height and width. Keep aspect ratio. 
+ Important, even if image_height < max_height and image_width < max_width, the image will be resized + to at least one of the edges be equal to max_height or max_width. + + For example: + - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50) + - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400) + + Args: + image_size (`Tuple[int, int]`): + The image to resize. + max_height (`int`): + The maximum allowed height. + max_width (`int`): + The maximum allowed width. + """ + height, width = image_size + height_scale = max_height / height + width_scale = max_width / width + min_scale = min(height_scale, width_scale) + new_height = int(height * min_scale) + new_width = int(width * min_scale) + return new_height, new_width + + +def safe_squeeze(tensor: "torch.Tensor", axis: Optional[int] = None) -> "torch.Tensor": + """ + Squeezes a tensor, but only if the axis specified has dim 1. + """ + if axis is None: + return tensor.squeeze() + + try: + return tensor.squeeze(axis=axis) + except ValueError: + return tensor + + +def max_across_indices(values: Iterable[Any]) -> List[Any]: + """ + Return the maximum value across all indices of an iterable of values. + """ + return [max(values_i) for values_i in zip(*values)] + + +def get_max_height_width(images: List["torch.Tensor"]) -> Tuple[int]: + """ + Get the maximum height and width across all images in a batch. 
+ """ + + _, max_height, max_width = max_across_indices([img.shape for img in images]) + + return (max_height, max_width) diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index 1a70ef056383..f59b99b490d3 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -385,6 +385,27 @@ def load_image(image: Union[str, "PIL.Image.Image"], timeout: Optional[float] = return image +def load_images( + images: Union[List, Tuple, str, "PIL.Image.Image"], timeout: Optional[float] = None +) -> Union["PIL.Image.Image", List["PIL.Image.Image"], List[List["PIL.Image.Image"]]]: + """Loads images, handling different levels of nesting. + + Args: + images: A single image, a list of images, or a list of lists of images to load. + timeout: Timeout for loading images. + + Returns: + A single image, a list of images, a list of lists of images. + """ + if isinstance(images, (list, tuple)): + if len(images) and isinstance(images[0], (list, tuple)): + return [[load_image(image, timeout=timeout) for image in image_group] for image_group in images] + else: + return [load_image(image, timeout=timeout) for image in images] + else: + return load_image(images, timeout=timeout) + + def validate_preprocess_arguments( do_rescale: Optional[bool] = None, rescale_factor: Optional[float] = None, diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py index 4a2740fcb30e..f4545f2698c0 100644 --- a/src/transformers/integrations/ggml.py +++ b/src/transformers/integrations/ggml.py @@ -235,6 +235,19 @@ "output.weight": "lm_head.weight", "output_norm": "model.norm", }, + "mamba": { + "token_embd": "backbone.embeddings", + "blk": "backbone.layers", + "ssm_a": "mixer.A_log", + "ssm_conv1d": "mixer.conv1d", + "ssm_in": "mixer.in_proj", + "ssm_out": "mixer.out_proj", + "ssm_x": "mixer.x_proj", + "ssm_dt": "mixer.dt_proj", + "attn_norm": "norm", + "output_norm": "backbone.norm_f", + "output.weight": "lm_head.weight", + }, } @@ 
-373,6 +386,17 @@ "attention.head_count_kv": "num_key_value_heads", "attention.layer_norm_epsilon": "norm_epsilon", }, + "mamba": { + "vocab_size": "vocab_size", + "context_length": "max_position_embeddings", + "embedding_length": "hidden_size", + "attention.layer_norm_rms_epsilon": "layer_norm_epsilon", + "block_count": "num_hidden_layers", + "ssm.conv_kernel": "conv_kernel", + "ssm.state_size": "state_size", + "ssm.time_step_rank": "time_step_rank", + "ssm.inner_size": "intermediate_size", + }, } GGUF_TOKENIZER_MAPPING = { @@ -768,6 +792,7 @@ def converted(self) -> Tokenizer: "gpt2": GGUFGPTConverter, "starcoder2": GGUFGPTConverter, "t5": GGUFT5Converter, + "mamba": GGUFGPTConverter, } diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py index a09116552c8e..be9a4aff3c7e 100755 --- a/src/transformers/integrations/integration_utils.py +++ b/src/transformers/integrations/integration_utils.py @@ -697,6 +697,8 @@ def on_log(self, args, state, control, logs=None, **kwargs): for k, v in logs.items(): if isinstance(v, (int, float)): self.tb_writer.add_scalar(k, v, state.global_step) + elif isinstance(v, str): + self.tb_writer.add_text(k, v, state.global_step) else: logger.warning( "Trainer is attempting to log a value of " diff --git a/src/transformers/modeling_gguf_pytorch_utils.py b/src/transformers/modeling_gguf_pytorch_utils.py index 171b2f4d15b1..c784ca0eb4ca 100644 --- a/src/transformers/modeling_gguf_pytorch_utils.py +++ b/src/transformers/modeling_gguf_pytorch_utils.py @@ -220,6 +220,19 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): name = "lm_head.weight" parsed_parameters["tensors"][name] = torch.from_numpy(np.copy(weights)) continue + if architecture == "mamba": + if "ssm_d" in name and "bias" not in name and "weight" not in name: + # ssm_d has conflicts with ssm_dt in name checking + # we have to explicitly check that name is exactly ssm_d + name = name.replace("ssm_d", 
"mixer.D") + if "ssm_conv1d.weight" in name: + # for compatibility tensor ssm_conv1d must be (5120, 1, 4]) dim, + # quantized one is (5120, 4) + weights = np.expand_dims(weights, axis=1) + if "ssm_a" in name: + # Original exponential implementation + # https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L2975-L2977 + weights = np.log(-weights) for tensor_name in tensor_key_mapping: if tensor_name.format(bid=bid) in name: diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index a6fbd7b1a914..2ef4c3615c9f 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -28,7 +28,7 @@ import warnings from contextlib import contextmanager from dataclasses import dataclass -from functools import lru_cache, partial, wraps +from functools import partial, wraps from threading import Thread from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union from zipfile import is_zipfile @@ -943,13 +943,14 @@ def _load_state_dict_into_meta_model( old_param = model splits = param_name.split(".") for split in splits: - old_param = getattr(old_param, split) - # Not all the attributes of a module are Parameters/Tensor - if not isinstance(old_param, (torch.nn.Parameter, torch.Tensor)): - old_param = None + # We shouldn't hit the default value unless for quant methods like hqq that modifies expected_keys. 
+ old_param = getattr(old_param, split, None) if old_param is None: break + if not isinstance(old_param, (torch.nn.Parameter, torch.Tensor)): + old_param = None + if old_param is not None: if dtype is None: param = param.to(old_param.dtype) @@ -5013,7 +5014,6 @@ def _is_quantized_training_enabled(self): return self.hf_quantizer.is_trainable @property - @lru_cache def loss_function(self): if getattr(self.config, "loss_type", None) is not None: loss_type = self.config.loss_type diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index d181afeb2d4d..a8960d80acc8 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -114,6 +114,7 @@ ("oneformer", ("OneFormerImageProcessor",)), ("owlv2", ("Owlv2ImageProcessor",)), ("owlvit", ("OwlViTImageProcessor",)), + ("paligemma", ("SiglipImageProcessor",)), ("perceiver", ("PerceiverImageProcessor",)), ("pix2struct", ("Pix2StructImageProcessor",)), ("pixtral", ("PixtralImageProcessor",)), @@ -123,7 +124,7 @@ ("qwen2_vl", ("Qwen2VLImageProcessor",)), ("regnet", ("ConvNextImageProcessor",)), ("resnet", ("ConvNextImageProcessor",)), - ("rt_detr", "RTDetrImageProcessor"), + ("rt_detr", ("RTDetrImageProcessor", "RTDetrImageProcessorFast")), ("sam", ("SamImageProcessor",)), ("segformer", ("SegformerImageProcessor",)), ("seggpt", ("SegGptImageProcessor",)), diff --git a/src/transformers/models/blip/modeling_blip.py b/src/transformers/models/blip/modeling_blip.py index e7df05785886..b623d2a8adb1 100644 --- a/src/transformers/models/blip/modeling_blip.py +++ b/src/transformers/models/blip/modeling_blip.py @@ -795,6 +795,12 @@ def __init__(self, config: BlipConfig): # Initialize weights and apply final processing self.post_init() + def get_input_embeddings(self): + return self.text_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.text_model.set_input_embeddings(value) 
+ @add_start_docstrings_to_model_forward(BLIP_TEXT_INPUTS_DOCSTRING) def get_text_features( self, @@ -1053,8 +1059,11 @@ def __init__(self, config: BlipConfig): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self) -> nn.Module: - return self.vision_model.embeddings.patch_embedding + def get_input_embeddings(self): + return self.text_decoder.get_input_embeddings() + + def set_input_embeddings(self, value): + self.text_decoder.set_input_embeddings(value) @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BlipForConditionalGenerationModelOutput, config_class=BlipVisionConfig) @@ -1117,7 +1126,8 @@ def forward( ) if not return_dict: - outputs = (outputs[0], outputs[1], image_embeds, vision_outputs[0]) + vision_outputs[2:] + outputs = (outputs[0], outputs[1]) if labels is not None else (outputs[0],) + outputs += (image_embeds, vision_outputs[0]) + vision_outputs[2:] return tuple(output for output in outputs if output is not None) return BlipForConditionalGenerationModelOutput( @@ -1232,8 +1242,12 @@ def __init__(self, config: BlipConfig): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self) -> nn.Module: - return self.vision_model.embeddings.patch_embedding + def set_input_embeddings(self, value): + self.text_encoder.set_input_embeddings(value) + + def get_input_embeddings(self): + # This will return shared embeddings if they are shared else specific to encoder. 
+ return self.text_encoder.get_input_embeddings() @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BlipTextVisionModelOutput, config_class=BlipVisionConfig) @@ -1474,8 +1488,11 @@ def __init__(self, config: BlipConfig): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self) -> nn.Module: - return self.vision_model.embeddings.patch_embedding + def get_input_embeddings(self): + return self.text_encoder.get_input_embeddings() + + def set_input_embeddings(self, value): + self.text_encoder.set_input_embeddings(value) @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BlipTextVisionModelOutput, config_class=BlipVisionConfig) diff --git a/src/transformers/models/blip/modeling_blip_text.py b/src/transformers/models/blip/modeling_blip_text.py index 5ee7ae21f9d5..97a4f523380b 100644 --- a/src/transformers/models/blip/modeling_blip_text.py +++ b/src/transformers/models/blip/modeling_blip_text.py @@ -817,6 +817,12 @@ def __init__(self, config): self.cls = BlipTextOnlyMLMHead(config) self.label_smoothing = config.label_smoothing + def get_input_embeddings(self): + return self.bert.get_input_embeddings() + + def set_input_embeddings(self, new_embeddings): + self.bert.set_input_embeddings(new_embeddings) + def get_output_embeddings(self): return self.cls.predictions.decoder diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py index eba82cd1b3c8..08e42d1c8f70 100644 --- a/src/transformers/models/blip_2/modeling_blip_2.py +++ b/src/transformers/models/blip_2/modeling_blip_2.py @@ -1768,11 +1768,12 @@ def forward( decoder_attention_mask=decoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, + return_dict=True, # toggle for easier access to loss/logits below labels=labels, ) - loss = 
outputs.loss if return_dict else outputs[0] - logits = outputs.logits if return_dict else outputs[1] + loss = outputs.loss + logits = outputs.logits + outputs = outputs.to_tuple() if not return_dict else outputs if not return_dict: output = (logits, vision_outputs, query_outputs, outputs) @@ -1810,6 +1811,12 @@ def __init__(self, config: Blip2Config): # Initialize weights and apply final processing self.post_init() + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + @add_start_docstrings_to_model_forward(BLIP_2_TEXT_WITH_PROJECTION_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=Blip2TextModelOutput, config_class=Blip2Config) def forward( @@ -2233,11 +2240,12 @@ def forward( decoder_attention_mask=decoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, + return_dict=True, # toggle for easier access to loss/logits below labels=labels, ) - loss = outputs.loss if return_dict else outputs[0] - logits = outputs.logits if return_dict else outputs[1] + loss = outputs.loss + logits = outputs.logits + outputs = outputs.to_tuple() if not return_dict else outputs if not return_dict: output = (logits, vision_outputs, query_outputs, outputs) @@ -2334,24 +2342,11 @@ def generate( ) generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1] - outputs = self.language_model.generate( - inputs_embeds=inputs_embeds, - attention_mask=attention_mask, - **generate_kwargs, - ) - - # this is a temporary workaround to be consistent with other generation models and - # have BOS as the first token, even though under the hood we are calling LM with embeds + inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask} if not self.language_model.config.is_encoder_decoder: - bos_tokens = ( - 
torch.LongTensor([[self.config.text_config.bos_token_id]]) - .repeat(batch_size, 1) - .to(image_embeds.device) - ) - if not isinstance(outputs, torch.Tensor): - outputs.sequences = torch.cat([bos_tokens, outputs.sequences], dim=-1) - else: - outputs = torch.cat([bos_tokens, outputs], dim=-1) + inputs["input_ids"] = input_ids + + outputs = self.language_model.generate(**inputs, **generate_kwargs) return outputs @@ -2389,6 +2384,12 @@ def __init__(self, config: Blip2Config): # Initialize weights and apply final processing self.post_init() + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + @add_start_docstrings_to_model_forward(BLIP2_IMAGE_TEXT_RETRIEVAL_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=Blip2ImageTextMatchingModelOutput, config_class=Blip2Config) def forward( diff --git a/src/transformers/models/blip_2/processing_blip_2.py b/src/transformers/models/blip_2/processing_blip_2.py index fa6a99f71a46..c68523784128 100644 --- a/src/transformers/models/blip_2/processing_blip_2.py +++ b/src/transformers/models/blip_2/processing_blip_2.py @@ -74,8 +74,11 @@ class Blip2Processor(ProcessorMixin): def __init__(self, image_processor, tokenizer, num_query_tokens=None, **kwargs): tokenizer.return_token_type_ids = False self.current_processor = image_processor - self.image_token = AddedToken("", normalized=False, special=True) - tokenizer.add_tokens([self.image_token], special_tokens=True) + if not hasattr(tokenizer, "image_token"): + self.image_token = AddedToken("", normalized=False, special=True) + tokenizer.add_tokens([self.image_token], special_tokens=True) + else: + self.image_token = tokenizer.image_token self.num_query_tokens = num_query_tokens super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/chameleon/modeling_chameleon.py b/src/transformers/models/chameleon/modeling_chameleon.py index 
797908277930..0661da872799 100644 --- a/src/transformers/models/chameleon/modeling_chameleon.py +++ b/src/transformers/models/chameleon/modeling_chameleon.py @@ -1288,7 +1288,7 @@ def forward( if pixel_values is not None: image_tokens = self.get_image_tokens(pixel_values) n_image_tokens_in_text = (input_ids == self.vocabulary_mapping.image_token_id).sum().item() - n_image_features = image_tokens.shape[0] + n_image_features = image_tokens.shape[0] * image_tokens.shape[1] if n_image_tokens_in_text != n_image_features: raise ValueError( f"Image features and image tokens do not match: tokens: {n_image_tokens_in_text}, features {n_image_features}" diff --git a/src/transformers/models/chameleon/processing_chameleon.py b/src/transformers/models/chameleon/processing_chameleon.py index 2d699c8f663a..e2a50d1af51b 100644 --- a/src/transformers/models/chameleon/processing_chameleon.py +++ b/src/transformers/models/chameleon/processing_chameleon.py @@ -66,9 +66,12 @@ class ChameleonProcessor(ProcessorMixin): def __init__(self, image_processor, tokenizer, image_seq_length: int = 1024, image_token: str = ""): self.image_seq_length = image_seq_length - self.image_token = image_token - self.image_start_token = "" # fixed tokens for start and end, so can hardcode - self.image_end_token = "" + self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token + self.image_start_token = ( + tokenizer.boi_token if hasattr(tokenizer, "boi_token") else "" + ) # fixed tokens for start and end, so can hardcode + self.image_end_token = tokenizer.eoi_token if hasattr(tokenizer, "eoi_token") else "" + super().__init__(image_processor, tokenizer) def __call__( diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index d0224e3caa5b..f422b17b204f 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -575,7 +575,7 @@ def forward(self, hidden_states: 
torch.Tensor) -> torch.Tensor: # Copied from transformers.models.swin.modeling_swin.SwinLayer with SwinDropPath->ClapDropPath, Swin->ClapAudio class ClapAudioLayer(nn.Module): - def __init__(self, config, dim, input_resolution, num_heads, shift_size=0): + def __init__(self, config, dim, input_resolution, num_heads, drop_path_rate=0.0, shift_size=0): super().__init__() self.chunk_size_feed_forward = config.chunk_size_feed_forward self.shift_size = shift_size @@ -583,7 +583,7 @@ def __init__(self, config, dim, input_resolution, num_heads, shift_size=0): self.input_resolution = input_resolution self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps) self.attention = ClapAudioAttention(config, dim, num_heads, window_size=self.window_size) - self.drop_path = ClapDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + self.drop_path = ClapDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity() self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps) self.intermediate = ClapAudioIntermediate(config, dim) self.output = ClapAudioOutput(config, dim) @@ -712,6 +712,7 @@ def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, d dim=dim, input_resolution=input_resolution, num_heads=num_heads, + drop_path_rate=drop_path[i], shift_size=0 if (i % 2 == 0) else config.window_size // 2, ) for i in range(depth) diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index 8ff7f1cd96a0..4ead68032b60 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -205,7 +205,7 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: return torch.cat((class_pos_embed, patch_pos_embed), dim=1) - def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor: + def forward(self, pixel_values: torch.FloatTensor, 
interpolate_pos_encoding=True) -> torch.Tensor: batch_size, _, height, width = pixel_values.shape if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size): raise ValueError( @@ -535,7 +535,7 @@ def _init_weights(self, module): output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. - interpolate_pos_encoding (`bool`, *optional*, defaults to `False`): + interpolate_pos_encoding (`bool`, *optional*, defaults to `True`): Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. @@ -574,7 +574,7 @@ def _init_weights(self, module): output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. - interpolate_pos_encoding (`bool`, *optional*, defaults to `False`): + interpolate_pos_encoding (`bool`, *optional*, defaults to `True`): Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
@@ -845,14 +845,13 @@ def __init__(self, config: CLIPSegVisionConfig): @add_start_docstrings_to_model_forward(CLIPSEG_VISION_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPSegVisionConfig) - # Copied from transformers.models.clip.modeling_clip.CLIPVisionTransformer.forward def forward( self, - pixel_values: Optional[torch.FloatTensor] = None, + pixel_values: Optional[torch.FloatTensor], output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - interpolate_pos_encoding: Optional[bool] = False, + interpolate_pos_encoding: Optional[bool] = True, ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" Returns: @@ -864,9 +863,6 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if pixel_values is None: - raise ValueError("You have to specify pixel_values") - hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) hidden_states = self.pre_layrnorm(hidden_states) @@ -912,7 +908,7 @@ def forward( pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - interpolate_pos_encoding: Optional[bool] = False, + interpolate_pos_encoding: Optional[bool] = True, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" @@ -1035,7 +1031,7 @@ def get_image_features( pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - interpolate_pos_encoding: bool = False, + interpolate_pos_encoding: bool = True, return_dict: Optional[bool] = None, ) -> torch.FloatTensor: r""" @@ -1091,7 +1087,7 @@ def forward( return_loss: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - interpolate_pos_encoding: bool = False, + 
interpolate_pos_encoding: bool = True, return_dict: Optional[bool] = None, ) -> Union[Tuple, CLIPSegOutput]: r""" @@ -1397,7 +1393,7 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - interpolate_pos_encoding: bool = False, + interpolate_pos_encoding: bool = True, return_dict: Optional[bool] = None, ) -> Union[Tuple, CLIPSegOutput]: r""" diff --git a/src/transformers/models/detr/image_processing_detr_fast.py b/src/transformers/models/detr/image_processing_detr_fast.py index 0fa1d0ffd9db..eadde59e55e4 100644 --- a/src/transformers/models/detr/image_processing_detr_fast.py +++ b/src/transformers/models/detr/image_processing_detr_fast.py @@ -21,7 +21,13 @@ from typing import Any, Dict, List, Optional, Set, Tuple, Union from ...image_processing_utils import BatchFeature, get_size_dict -from ...image_processing_utils_fast import BaseImageProcessorFast, SizeDict +from ...image_processing_utils_fast import ( + BaseImageProcessorFast, + SizeDict, + get_image_size_for_max_height_width, + get_max_height_width, + safe_squeeze, +) from ...image_transforms import ( center_to_corners_format, corners_to_center_format, @@ -55,7 +61,6 @@ compute_segments, convert_segmentation_to_rle, get_size_with_aspect_ratio, - max_across_indices, remove_low_and_no_objects, ) @@ -85,60 +90,6 @@ SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) -def get_image_size_for_max_height_width( - image_size: Tuple[int, int], - max_height: int, - max_width: int, -) -> Tuple[int, int]: - """ - Computes the output image size given the input image and the maximum allowed height and width. Keep aspect ratio. - Important, even if image_height < max_height and image_width < max_width, the image will be resized - to at least one of the edges be equal to max_height or max_width. 
- - For example: - - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50) - - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400) - - Args: - image_size (`Tuple[int, int]`): - The image to resize. - max_height (`int`): - The maximum allowed height. - max_width (`int`): - The maximum allowed width. - """ - height, width = image_size - height_scale = max_height / height - width_scale = max_width / width - min_scale = min(height_scale, width_scale) - new_height = int(height * min_scale) - new_width = int(width * min_scale) - return new_height, new_width - - -def safe_squeeze(tensor: torch.Tensor, axis: Optional[int] = None) -> torch.Tensor: - """ - Squeezes a tensor, but only if the axis specified has dim 1. - """ - if axis is None: - return tensor.squeeze() - - try: - return tensor.squeeze(axis=axis) - except ValueError: - return tensor - - -def get_max_height_width(images: List[torch.Tensor]) -> Tuple[int]: - """ - Get the maximum height and width across all images in a batch. - """ - - _, max_height, max_width = max_across_indices([img.shape for img in images]) - - return (max_height, max_width) - - # inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L33 def convert_coco_poly_to_mask(segmentations, height: int, width: int, device: torch.device) -> torch.Tensor: """ @@ -191,18 +142,21 @@ def prepare_coco_detection_annotation( # Get all COCO annotations for the given image. 
annotations = target["annotations"] - annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0] + classes = [] + area = [] + boxes = [] + keypoints = [] + for obj in annotations: + if "iscrowd" not in obj or obj["iscrowd"] == 0: + classes.append(obj["category_id"]) + area.append(obj["area"]) + boxes.append(obj["bbox"]) + if "keypoints" in obj: + keypoints.append(obj["keypoints"]) - classes = [obj["category_id"] for obj in annotations] classes = torch.as_tensor(classes, dtype=torch.int64, device=image.device) - - # for conversion to coco api - area = torch.as_tensor([obj["area"] for obj in annotations], dtype=torch.float32, device=image.device) - iscrowd = torch.as_tensor( - [obj["iscrowd"] if "iscrowd" in obj else 0 for obj in annotations], dtype=torch.int64, device=image.device - ) - - boxes = [obj["bbox"] for obj in annotations] + area = torch.as_tensor(area, dtype=torch.float32, device=image.device) + iscrowd = torch.zeros_like(classes, dtype=torch.int64, device=image.device) # guard against no boxes via resizing boxes = torch.as_tensor(boxes, dtype=torch.float32, device=image.device).reshape(-1, 4) boxes[:, 2:] += boxes[:, :2] @@ -211,19 +165,16 @@ def prepare_coco_detection_annotation( keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) - new_target = {} - new_target["image_id"] = image_id - new_target["class_labels"] = classes[keep] - new_target["boxes"] = boxes[keep] - new_target["area"] = area[keep] - new_target["iscrowd"] = iscrowd[keep] - new_target["orig_size"] = torch.as_tensor( - [int(image_height), int(image_width)], dtype=torch.int64, device=image.device - ) + new_target = { + "image_id": image_id, + "class_labels": classes[keep], + "boxes": boxes[keep], + "area": area[keep], + "iscrowd": iscrowd[keep], + "orig_size": torch.as_tensor([int(image_height), int(image_width)], dtype=torch.int64, device=image.device), + } - if annotations and "keypoints" in annotations[0]: - keypoints = [obj["keypoints"] for obj 
in annotations] - # Converting the filtered keypoints list to a numpy array + if keypoints: keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=image.device) # Apply the keep mask here to filter the relevant annotations keypoints = keypoints[keep] @@ -911,84 +862,81 @@ def preprocess( if input_data_format == ChannelDimension.LAST: images = [image.permute(2, 0, 1).contiguous() for image in images] - # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image) - if annotations is not None: - prepared_images = [] - prepared_annotations = [] - for image, target in zip(images, annotations): - target = self.prepare_annotation( + if do_rescale and do_normalize: + # fused rescale and normalize + new_mean = torch.tensor(image_mean, device=images[0].device) * (1.0 / rescale_factor) + new_std = torch.tensor(image_std, device=images[0].device) * (1.0 / rescale_factor) + + processed_images = [] + processed_annotations = [] + pixel_masks = [] # Initialize pixel_masks here + for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)): + # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image) + if annotations is not None: + annotation = self.prepare_annotation( image, - target, + annotation, format, return_segmentation_masks=return_segmentation_masks, masks_path=masks_path, input_data_format=input_data_format, ) - prepared_images.append(image) - prepared_annotations.append(target) - images = prepared_images - annotations = prepared_annotations - del prepared_images, prepared_annotations - - if do_resize: - if isinstance(resample, (PILImageResampling, int)): - interpolation = pil_torch_interpolation_mapping[resample] - else: - interpolation = resample - resized_images = [self.resize(image, size=size, interpolation=interpolation) for image in images] - if annotations is not None: - for i, (image, target) in enumerate(zip(resized_images, annotations)): - 
annotations[i] = self.resize_annotation( - target, - orig_size=images[i].size()[-2:], - target_size=image.size()[-2:], + + if do_resize: + interpolation = ( + pil_torch_interpolation_mapping[resample] + if isinstance(resample, (PILImageResampling, int)) + else resample + ) + resized_image = self.resize(image, size=size, interpolation=interpolation) + if annotations is not None: + annotation = self.resize_annotation( + annotation, + orig_size=image.size()[-2:], + target_size=resized_image.size()[-2:], ) - images = resized_images - del resized_images + image = resized_image - if do_rescale and do_normalize: - # fused rescale and normalize - new_mean = torch.tensor(image_mean, device=images[0].device) * (1.0 / rescale_factor) - new_std = torch.tensor(image_std, device=images[0].device) * (1.0 / rescale_factor) - images = [F.normalize(image.to(dtype=torch.float32), new_mean, new_std) for image in images] - elif do_rescale: - images = [image * rescale_factor for image in images] - elif do_normalize: - images = [F.normalize(image, image_mean, image_std) for image in images] - - if do_convert_annotations and annotations is not None: - annotations = [ - self.normalize_annotation(annotation, get_image_size(image, input_data_format)) - for annotation, image in zip(annotations, images) - ] + if do_rescale and do_normalize: + # fused rescale and normalize + image = F.normalize(image.to(dtype=torch.float32), new_mean, new_std) + elif do_rescale: + image = image * rescale_factor + elif do_normalize: + image = F.normalize(image, image_mean, image_std) + + if do_convert_annotations and annotations is not None: + annotation = self.normalize_annotation(annotation, get_image_size(image, input_data_format)) + + processed_images.append(image) + processed_annotations.append(annotation) + images = processed_images + annotations = processed_annotations if annotations is not None else None if do_pad: - # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...} + # 
depends on all resized image shapes so we need another loop if pad_size is not None: padded_size = (pad_size["height"], pad_size["width"]) else: padded_size = get_max_height_width(images) - annotation_list = annotations if annotations is not None else [None] * len(images) padded_images = [] - pixel_masks = [] padded_annotations = [] - for image, annotation in zip(images, annotation_list): + for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)): + # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...} if padded_size == image.size()[-2:]: padded_images.append(image) pixel_masks.append(torch.ones(padded_size, dtype=torch.int64, device=image.device)) padded_annotations.append(annotation) continue - padded_image, pixel_mask, padded_annotation = self.pad( + image, pixel_mask, annotation = self.pad( image, padded_size, annotation=annotation, update_bboxes=do_convert_annotations ) - padded_images.append(padded_image) + padded_images.append(image) + padded_annotations.append(annotation) pixel_masks.append(pixel_mask) - padded_annotations.append(padded_annotation) images = padded_images - if annotations is not None: - annotations = padded_annotations - del padded_images, padded_annotations + annotations = padded_annotations if annotations is not None else None data.update({"pixel_mask": torch.stack(pixel_masks, dim=0)}) data.update({"pixel_values": torch.stack(images, dim=0)}) diff --git a/src/transformers/models/donut/modeling_donut_swin.py b/src/transformers/models/donut/modeling_donut_swin.py index 8d639131b841..2d5272e8642e 100644 --- a/src/transformers/models/donut/modeling_donut_swin.py +++ b/src/transformers/models/donut/modeling_donut_swin.py @@ -558,7 +558,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: # Copied from transformers.models.swin.modeling_swin.SwinLayer with Swin->DonutSwin class DonutSwinLayer(nn.Module): - def __init__(self, config, dim, input_resolution, 
num_heads, shift_size=0): + def __init__(self, config, dim, input_resolution, num_heads, drop_path_rate=0.0, shift_size=0): super().__init__() self.chunk_size_feed_forward = config.chunk_size_feed_forward self.shift_size = shift_size @@ -566,7 +566,7 @@ def __init__(self, config, dim, input_resolution, num_heads, shift_size=0): self.input_resolution = input_resolution self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps) self.attention = DonutSwinAttention(config, dim, num_heads, window_size=self.window_size) - self.drop_path = DonutSwinDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + self.drop_path = DonutSwinDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity() self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps) self.intermediate = DonutSwinIntermediate(config, dim) self.output = DonutSwinOutput(config, dim) @@ -695,6 +695,7 @@ def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, d dim=dim, input_resolution=input_resolution, num_heads=num_heads, + drop_path_rate=drop_path[i], shift_size=0 if (i % 2 == 0) else config.window_size // 2, ) for i in range(depth) diff --git a/src/transformers/models/donut/processing_donut.py b/src/transformers/models/donut/processing_donut.py index 9552d323ac57..b46ff4bcfab9 100644 --- a/src/transformers/models/donut/processing_donut.py +++ b/src/transformers/models/donut/processing_donut.py @@ -24,12 +24,16 @@ from ...image_utils import ImageInput from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils import logging class DonutProcessorKwargs(ProcessingKwargs, total=False): _defaults = {} +logger = logging.get_logger(__name__) + + class DonutProcessor(ProcessorMixin): r""" Constructs a Donut processor which wraps a Donut image processor and an XLMRoBERTa tokenizer into a single @@ -85,6 +89,16 @@ def __call__( 
[`~DonutTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more information. """ # For backward compatibility + legacy = kwargs.pop("legacy", True) + if legacy: + # With `add_special_tokens=True`, the performance of donut are degraded when working with both images and text. + logger.warning_once( + "Legacy behavior is being used. The current behavior will be deprecated in version 5.0.0. " + "In the new behavior, if both images and text are provided, the default value of `add_special_tokens` " + "will be changed to `False` when calling the tokenizer if `add_special_tokens` is unset. " + "To test the new behavior, set `legacy=False`as a processor call argument." + ) + if self._in_target_context_manager: return self.current_processor(images, text, **kwargs) @@ -100,6 +114,8 @@ def __call__( if images is not None: inputs = self.image_processor(images, **output_kwargs["images_kwargs"]) if text is not None: + if not legacy and images is not None: + output_kwargs["text_kwargs"].setdefault("add_special_tokens", False) encodings = self.tokenizer(text, **output_kwargs["text_kwargs"]) if text is None: diff --git a/src/transformers/models/fuyu/image_processing_fuyu.py b/src/transformers/models/fuyu/image_processing_fuyu.py index 255922b83088..4bb9ea7964d4 100644 --- a/src/transformers/models/fuyu/image_processing_fuyu.py +++ b/src/transformers/models/fuyu/image_processing_fuyu.py @@ -19,7 +19,7 @@ import numpy as np -from ...image_processing_utils import BaseImageProcessor, BatchFeature +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict from ...image_transforms import ( pad, resize, @@ -475,6 +475,7 @@ def preprocess( input_data_format = infer_channel_dimension_format(batch_images[0][0]) original_image_sizes = [get_image_size(images[0], channel_dim=input_data_format) for images in batch_images] + size = get_size_dict(size) # for BC if do_resize: batch_images = [ diff --git 
a/src/transformers/models/fuyu/processing_fuyu.py b/src/transformers/models/fuyu/processing_fuyu.py index ff7d2c547dc4..e24f2fd4d1ab 100644 --- a/src/transformers/models/fuyu/processing_fuyu.py +++ b/src/transformers/models/fuyu/processing_fuyu.py @@ -264,10 +264,10 @@ def _tokenize_prompts_with_image_and_batch( bos_token = tokenizer.vocab["|ENDOFTEXT|"] prompts_tokens = [[[bos_token] + x for x in prompt_seq] for prompt_seq in prompts_tokens] if add_beginning_of_answer_token: - boa = tokenizer.vocab[BEGINNING_OF_ANSWER_STRING] + beginning_of_answer = tokenizer.vocab[BEGINNING_OF_ANSWER_STRING] # Only add bbox open token to the last subsequence since that is what will be completed for token_seq in prompts_tokens: - token_seq[-1].append(boa) + token_seq[-1].append(beginning_of_answer) # Now we have a list of list of tokens which each list has a different # size. We want to extend this list to: @@ -682,6 +682,32 @@ def tokens_to_points(tokens, original_size): return results + def post_process_image_text_to_text(self, generated_outputs): + """ + Post-processes the output of `FuyuForConditionalGeneration` to only return the text output. + + Args: + generated_outputs (`torch.Tensor` or `np.ndarray`): + The output of the model. The output is expected to be a tensor of shape `(batch_size, sequence_length)` + containing the token ids of the generated sequences. + + Returns: + `List[str]`: The decoded text output. 
+ """ + beginning_of_answer = self.tokenizer.convert_tokens_to_ids(BEGINNING_OF_ANSWER_STRING) + # get boa index for each outputted sequence tensor + # start all generated sequences from the beginning of the answer token, pad to have consistent length + unpadded_output_sequences = [ + seq[(seq == beginning_of_answer).nonzero(as_tuple=True)[0] + 1 :] for seq in generated_outputs + ] + max_len = max(len(seq) for seq in unpadded_output_sequences) + # convert to torch and pad sequences + padded_output_sequences = torch.full((len(unpadded_output_sequences), max_len), self.pad_token_id) + for i, seq in enumerate(unpadded_output_sequences): + padded_output_sequences[i, : len(seq)] = torch.tensor(seq) + + return self.batch_decode(padded_output_sequences, skip_special_tokens=True) + def batch_decode(self, *args, **kwargs): """ This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py index 9a4de1022c57..fa3fadc4349a 100644 --- a/src/transformers/models/gemma/modeling_gemma.py +++ b/src/transformers/models/gemma/modeling_gemma.py @@ -23,7 +23,6 @@ from typing import List, Optional, Tuple, Union import torch -import torch.utils.checkpoint from torch import nn from ...activations import ACT2FN @@ -49,7 +48,10 @@ from .configuration_gemma import GemmaConfig +logger = logging.get_logger(__name__) + _CHECKPOINT_FOR_DOC = "google/gemma-7b" +_CONFIG_FOR_DOC = "GemmaConfig" class GemmaRMSNorm(nn.Module): @@ -72,9 +74,6 @@ def extra_repr(self): return f"{tuple(self.weight.shape)}, eps={self.eps}" -logger = logging.get_logger(__name__) - - class GemmaRotaryEmbedding(nn.Module): def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): super().__init__() @@ -624,9 +623,6 @@ def _init_weights(self, module): module.weight.data[module.padding_idx].zero_() -_CONFIG_FOR_DOC = "GemmaConfig" - - 
GEMMA_INPUTS_DOCSTRING = r""" Args: input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): diff --git a/src/transformers/models/gemma/tokenization_gemma.py b/src/transformers/models/gemma/tokenization_gemma.py index ff0d1d034c22..7138cafbd625 100644 --- a/src/transformers/models/gemma/tokenization_gemma.py +++ b/src/transformers/models/gemma/tokenization_gemma.py @@ -138,7 +138,7 @@ def __getstate__(self): return state def __setstate__(self, d): - self.__dict__ = d + self.__dict__.update(d) self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.LoadFromSerializedProto(self.sp_model_proto) diff --git a/src/transformers/models/gemma2/configuration_gemma2.py b/src/transformers/models/gemma2/configuration_gemma2.py index 74976bdd340f..eb562b3a6893 100644 --- a/src/transformers/models/gemma2/configuration_gemma2.py +++ b/src/transformers/models/gemma2/configuration_gemma2.py @@ -19,8 +19,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - - from ...configuration_utils import PretrainedConfig @@ -36,15 +34,15 @@ class Gemma2Config(PretrainedConfig): vocab_size (`int`, *optional*, defaults to 256000): Vocabulary size of the Gemma2 model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [`Gemma2Model`] - hidden_size (`int`, *optional*, defaults to 3072): + hidden_size (`int`, *optional*, defaults to 2304): Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 24576): + intermediate_size (`int`, *optional*, defaults to 9216): Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 28): + num_hidden_layers (`int`, *optional*, defaults to 26): Number of hidden layers in the Transformer decoder. 
- num_attention_heads (`int`, *optional*, defaults to 16): + num_attention_heads (`int`, *optional*, defaults to 8): Number of attention heads for each attention layer in the Transformer decoder. - num_key_value_heads (`int`, *optional*, defaults to 16): + num_key_value_heads (`int`, *optional*, defaults to 4): This is the number of key_value heads that should be used to implement Grouped Query Attention. If `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When @@ -80,7 +78,7 @@ class Gemma2Config(PretrainedConfig): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. - query_pre_attn_scalar (`float`, *optional*, defaults to 224): scaling factor used on the attention scores + query_pre_attn_scalar (`float`, *optional*, defaults to 256): scaling factor used on the attention scores sliding_window (`int`, *optional*, defaults to 4096): in Gemma2, every other layer uses sliding window attention. This is the size of the sliding window. final_logit_softcapping (`float`, *optional*, defaults to 30.0): scaling factor when applying tanh softcapping on the logits. 
@@ -103,11 +101,11 @@ class Gemma2Config(PretrainedConfig): def __init__( self, vocab_size=256000, - hidden_size=3072, - intermediate_size=24576, - num_hidden_layers=28, - num_attention_heads=16, - num_key_value_heads=16, + hidden_size=2304, + intermediate_size=9216, + num_hidden_layers=26, + num_attention_heads=8, + num_key_value_heads=4, head_dim=256, hidden_activation="gelu_pytorch_tanh", max_position_embeddings=8192, @@ -121,7 +119,7 @@ def __init__( rope_theta=10000.0, attention_bias=False, attention_dropout=0.0, - query_pre_attn_scalar=224, + query_pre_attn_scalar=256, sliding_window=4096, final_logit_softcapping=30.0, attn_logit_softcapping=50.0, diff --git a/src/transformers/models/gemma2/modeling_gemma2.py b/src/transformers/models/gemma2/modeling_gemma2.py index 6d61c47619f3..626e5537fc06 100644 --- a/src/transformers/models/gemma2/modeling_gemma2.py +++ b/src/transformers/models/gemma2/modeling_gemma2.py @@ -23,7 +23,6 @@ import torch import torch.nn as nn -import torch.utils.checkpoint from ...activations import ACT2FN from ...cache_utils import Cache, HybridCache @@ -40,6 +39,7 @@ add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, + is_flash_attn_2_available, is_flash_attn_greater_or_equal, is_flash_attn_greater_or_equal_2_10, logging, @@ -48,7 +48,15 @@ from .configuration_gemma2 import Gemma2Config +if is_flash_attn_2_available(): + from ...modeling_flash_attention_utils import _flash_attention_forward + + +logger = logging.get_logger(__name__) + + _CHECKPOINT_FOR_DOC = "google/gemma2-7b" +_CONFIG_FOR_DOC = "Gemma2Config" class Gemma2RMSNorm(nn.Module): @@ -86,9 +94,6 @@ def forward(self, x): return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) -logger = logging.get_logger(__name__) - - class Gemma2RotaryEmbedding(nn.Module): def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): super().__init__() @@ -198,12 +203,12 @@ def __init__(self, config: Gemma2Config, 
layer_idx: Optional[int] = None): self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias) - self.sliding_window = config.sliding_window if not bool(layer_idx % 2) else None self.rotary_emb = Gemma2RotaryEmbedding( self.head_dim, max_position_embeddings=self.max_position_embeddings, base=self.rope_theta, ) + self.sliding_window = config.sliding_window if not bool(layer_idx % 2) else None def forward( self, @@ -495,12 +500,12 @@ def __init__(self, config: Gemma2Config, layer_idx: int): self.self_attn = GEMMA2_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx) self.mlp = Gemma2MLP(config) self.input_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.config = config self.is_sliding = not bool(layer_idx % 2) self.pre_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.sliding_window = config.sliding_window - self.post_attention_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) def forward( self, @@ -638,9 +643,6 @@ def _check_and_enable_sdpa(cls, config, hard_check_only: bool = False): return config -_CONFIG_FOR_DOC = "Gemma2Config" - - GEMMA2_INPUTS_DOCSTRING = r""" Args: input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): @@ -865,6 +867,7 @@ def forward( attentions=all_self_attns, ) + @torch.no_grad() def _update_causal_mask( self, attention_mask: torch.Tensor, diff --git a/src/transformers/models/gemma2/modular_gemma2.py b/src/transformers/models/gemma2/modular_gemma2.py index 
7ddb1c9f4c99..dacaca1c7ef4 100644 --- a/src/transformers/models/gemma2/modular_gemma2.py +++ b/src/transformers/models/gemma2/modular_gemma2.py @@ -67,15 +67,15 @@ class Gemma2Config(PretrainedConfig): vocab_size (`int`, *optional*, defaults to 256000): Vocabulary size of the Gemma2 model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [`Gemma2Model`] - hidden_size (`int`, *optional*, defaults to 3072): + hidden_size (`int`, *optional*, defaults to 2304): Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 24576): + intermediate_size (`int`, *optional*, defaults to 9216): Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 28): + num_hidden_layers (`int`, *optional*, defaults to 26): Number of hidden layers in the Transformer decoder. - num_attention_heads (`int`, *optional*, defaults to 16): + num_attention_heads (`int`, *optional*, defaults to 8): Number of attention heads for each attention layer in the Transformer decoder. - num_key_value_heads (`int`, *optional*, defaults to 16): + num_key_value_heads (`int`, *optional*, defaults to 4): This is the number of key_value heads that should be used to implement Grouped Query Attention. If `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When @@ -111,7 +111,7 @@ class Gemma2Config(PretrainedConfig): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. 
- query_pre_attn_scalar (`float`, *optional*, defaults to 224): scaling factor used on the attention scores + query_pre_attn_scalar (`float`, *optional*, defaults to 256): scaling factor used on the attention scores sliding_window (`int`, *optional*, defaults to 4096): in Gemma2, every other layer uses sliding window attention. This is the size of the sliding window. final_logit_softcapping (`float`, *optional*, defaults to 30.0): scaling factor when applying tanh softcapping on the logits. @@ -134,11 +134,11 @@ class Gemma2Config(PretrainedConfig): def __init__( self, vocab_size=256000, - hidden_size=3072, - intermediate_size=24576, - num_hidden_layers=28, - num_attention_heads=16, - num_key_value_heads=16, + hidden_size=2304, + intermediate_size=9216, + num_hidden_layers=26, + num_attention_heads=8, + num_key_value_heads=4, head_dim=256, hidden_activation="gelu_pytorch_tanh", max_position_embeddings=8192, @@ -152,7 +152,7 @@ def __init__( rope_theta=10000.0, attention_bias=False, attention_dropout=0.0, - query_pre_attn_scalar=224, + query_pre_attn_scalar=256, sliding_window=4096, final_logit_softcapping=30.0, attn_logit_softcapping=50.0, diff --git a/src/transformers/models/git/processing_git.py b/src/transformers/models/git/processing_git.py index 3744d81a0aca..e9e96fa765d8 100644 --- a/src/transformers/models/git/processing_git.py +++ b/src/transformers/models/git/processing_git.py @@ -22,12 +22,16 @@ from ...image_utils import ImageInput from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, _validate_images_text_input_order from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils import logging class GitProcessorKwargs(ProcessingKwargs, total=False): _defaults = {} +logger = logging.get_logger(__name__) + + class GitProcessor(ProcessorMixin): r""" Constructs a GIT processor which wraps a CLIP image processor and a BERT tokenizer into a single processor. @@ -91,6 +95,15 @@ def __call__( `None`). 
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. """ + legacy = kwargs.pop("legacy", True) + if legacy: + logger.warning_once( + "Legacy behavior is being used. The current behavior will be deprecated in version 5.0.0. " + "In the new behavior, if both images and text are provided, the last token (EOS token) " + "of the input_ids and attention_mask tensors will be removed. " + "To test the new behavior, set `legacy=False`as a processor call argument." + ) + if text is None and images is None: raise ValueError("You have to specify either text or images. Both cannot be none.") @@ -110,6 +123,10 @@ def __call__( if images is not None: image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) data.update(image_features) + if not legacy: + data["input_ids"] = data["input_ids"][:, :-1] + data["attention_mask"] = data["attention_mask"][:, :-1] + return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) def batch_decode(self, *args, **kwargs): diff --git a/src/transformers/models/glm/modeling_glm.py b/src/transformers/models/glm/modeling_glm.py index 5f8eaf89ed93..248ec4021791 100644 --- a/src/transformers/models/glm/modeling_glm.py +++ b/src/transformers/models/glm/modeling_glm.py @@ -24,7 +24,6 @@ import torch import torch.nn as nn -import torch.utils.checkpoint from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache, StaticCache @@ -50,7 +49,10 @@ from .configuration_glm import GlmConfig +logger = logging.get_logger(__name__) + _CHECKPOINT_FOR_DOC = "THUDM/glm-4-9b" +_CONFIG_FOR_DOC = "GlmConfig" class GlmRMSNorm(nn.Module): @@ -121,7 +123,16 @@ def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: return self.down_proj(up_states) -logger = logging.get_logger(__name__) +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, 
repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) def rotate_half(x): @@ -172,18 +183,6 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): return q_embed, k_embed -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - class GlmAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" @@ -608,9 +607,6 @@ def _init_weights(self, module): module.weight.data[module.padding_idx].zero_() -_CONFIG_FOR_DOC = "GlmConfig" - - GLM_INPUTS_DOCSTRING = r""" Args: input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): diff --git a/src/transformers/models/idefics/processing_idefics.py b/src/transformers/models/idefics/processing_idefics.py index 3406ab2226e0..ca6e4702d317 100644 --- a/src/transformers/models/idefics/processing_idefics.py +++ b/src/transformers/models/idefics/processing_idefics.py @@ -219,7 +219,11 @@ def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_u super().__init__(image_processor, tokenizer) self.current_processor = self.image_processor - 
self.image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN) + self.image_token_id = ( + tokenizer.image_token_id + if hasattr(tokenizer, "image_token") + else tokenizer.convert_tokens_to_ids(IMAGE_TOKEN) + ) self.default_image_dims = ( self.image_processor.image_num_channels, diff --git a/src/transformers/models/idefics2/processing_idefics2.py b/src/transformers/models/idefics2/processing_idefics2.py index 9a041257c36b..f99c1bda4745 100644 --- a/src/transformers/models/idefics2/processing_idefics2.py +++ b/src/transformers/models/idefics2/processing_idefics2.py @@ -95,16 +95,19 @@ def __init__(self, image_processor, tokenizer=None, image_seq_len: int = 64, cha if tokenizer is None: raise ValueError("You need to specify a `tokenizer`.") - self.fake_image_token = AddedToken("", normalized=False, special=True) - self.image_token = AddedToken("", normalized=False, special=True) + if not hasattr(tokenizer, "image_token"): + self.fake_image_token = AddedToken("", normalized=False, special=True) + self.image_token = AddedToken("", normalized=False, special=True) + tokens_to_add = {"additional_special_tokens": [self.fake_image_token, self.image_token]} + tokenizer.add_special_tokens(tokens_to_add) + else: + self.fake_image_token = tokenizer.image_boundary_token + self.image_token = tokenizer.image_token + self.end_of_utterance_token = AddedToken("", normalized=False, special=True) + tokenizer.add_special_tokens({"additional_special_tokens": [self.end_of_utterance_token]}) self.image_seq_len = image_seq_len - tokens_to_add = { - "additional_special_tokens": [self.fake_image_token, self.image_token, self.end_of_utterance_token] - } - tokenizer.add_special_tokens(tokens_to_add) - super().__init__(image_processor, tokenizer, chat_template=chat_template) def _extract_images_from_prompts(self, prompts): diff --git a/src/transformers/models/instructblip/modeling_instructblip.py b/src/transformers/models/instructblip/modeling_instructblip.py index 5cce774ce071..a78a3b668774 
100644 --- a/src/transformers/models/instructblip/modeling_instructblip.py +++ b/src/transformers/models/instructblip/modeling_instructblip.py @@ -1625,27 +1625,10 @@ def generate( ) generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1] - outputs = self.language_model.generate( - inputs_embeds=inputs_embeds, - attention_mask=attention_mask, - **generate_kwargs, - ) - - # this is a temporary workaround to be consistent with other generation models and - # have BOS as the first token, even though under the hood we are calling LM with embeds + inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask} if not self.language_model.config.is_encoder_decoder: - # the InstructBLIP authors used inconsistent tokenizer/model files during training, - # with the tokenizer's bos token being set to which has ID=2, - # whereas the model's text config has bos token id = 0 - bos_token_id = ( - 2 - if self.config.text_config.architectures[0] == "LLaMAForCausalLM" - else self.config.text_config.bos_token_id - ) - bos_tokens = torch.LongTensor([[bos_token_id]]).repeat(batch_size, 1).to(image_embeds.device) - if not isinstance(outputs, torch.Tensor): - outputs.sequences = torch.cat([bos_tokens, outputs.sequences], dim=-1) - else: - outputs = torch.cat([bos_tokens, outputs], dim=-1) + inputs["input_ids"] = input_ids + + outputs = self.language_model.generate(**inputs, **generate_kwargs) return outputs diff --git a/src/transformers/models/instructblip/processing_instructblip.py b/src/transformers/models/instructblip/processing_instructblip.py index 05ff9871f4d7..3d48839d376c 100644 --- a/src/transformers/models/instructblip/processing_instructblip.py +++ b/src/transformers/models/instructblip/processing_instructblip.py @@ -78,8 +78,11 @@ class InstructBlipProcessor(ProcessorMixin): qformer_tokenizer_class = "AutoTokenizer" def __init__(self, image_processor, tokenizer, qformer_tokenizer, num_query_tokens=None, **kwargs): - 
self.image_token = AddedToken("", normalized=False, special=True) - tokenizer.add_tokens([self.image_token], special_tokens=True) + if not hasattr(tokenizer, "image_token"): + self.image_token = AddedToken("", normalized=False, special=True) + tokenizer.add_tokens([self.image_token], special_tokens=True) + else: + self.image_token = tokenizer.image_token self.num_query_tokens = num_query_tokens super().__init__(image_processor, tokenizer, qformer_tokenizer) diff --git a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py index c9f12391666c..b0a494dcfe6c 100644 --- a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py @@ -24,7 +24,6 @@ from typing import Any, Optional, Tuple, Union import torch -import torch.utils.checkpoint from torch import nn from torch.nn import CrossEntropyLoss @@ -347,104 +346,6 @@ def _init_weights(self, module): module.bias.data.zero_() -INSTRUCTBLIPVIDEO_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`InstructBlipVideoConfig`]): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
-""" - -INSTRUCTBLIPVIDEO_VISION_INPUTS_DOCSTRING = r""" - Args: - pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Pixel values. Pixel values can be obtained using [`InstructBlipVideoProcessor`]. See - [`InstructBlipVideoProcessor.__call__`] for details. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - interpolate_pos_encoding (`bool`, *optional*, defaults to `False`): - Whether to interpolate the pre-trained position encodings. -""" - -INSTRUCTBLIPVIDEO_INPUTS_DOCSTRING = r""" - Args: - pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Pixel values. Pixel values can be obtained using [`InstructBlipVideoProcessor`]. See - [`InstructBlipVideoProcessor.__call__`] for details. - - qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided - to serve as text prompt, which the Q-Former model will encode. - - Indices can be obtained using [`InstructBlipVideoProcessor`]. See [`InstructBlipVideoProcessor.__call__`] for - details. - - [What are input IDs?](../glossary#input-ids) - - qformer_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. 
- - [What are attention masks?](../glossary#attention-mask) - - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of input sequence tokens in the vocabulary of the language model. Input tokens can optionally be - provided to serve as text prompt, which the language model can continue. - - Indices can be obtained using [`InstructBlipVideoProcessor`]. See [`InstructBlipVideoProcessor.__call__`] for - details. - - [What are input IDs?](../glossary#input-ids) - - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): - Indices of decoder input sequence tokens in the vocabulary of the language model. Only relevant in case an - encoder-decoder language model (like T5) is used. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. [What are decoder input IDs?](../glossary#decoder-input-ids) - - decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*): - Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also - be used by default. - - Only relevant in case an encoder-decoder language model (like T5) is used. - - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. 
- return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - interpolate_pos_encoding (`bool`, *optional*, defaults to `False`): - Whether to interpolate the pre-trained position encodings. -""" - - class InstructBlipVideoEncoder(nn.Module): """ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a @@ -531,6 +432,24 @@ def forward( ) +INSTRUCTBLIPVIDEO_VISION_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`InstructBlipVideoProcessor`]. See + [`InstructBlipVideoProcessor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + interpolate_pos_encoding (`bool`, *optional*, defaults to `False`): + Whether to interpolate the pre-trained position encodings. +""" + + class InstructBlipVideoVisionModel(InstructBlipVideoPreTrainedModel): main_input_name = "pixel_values" config_class = InstructBlipVideoVisionConfig @@ -1268,6 +1187,87 @@ def forward( ) +INSTRUCTBLIPVIDEO_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. 
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`InstructBlipVideoConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +INSTRUCTBLIPVIDEO_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`InstructBlipVideoProcessor`]. See + [`InstructBlipVideoProcessor.__call__`] for details. + + qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided + to serve as text prompt, which the Q-Former model will encode. + + Indices can be obtained using [`InstructBlipVideoProcessor`]. See [`InstructBlipVideoProcessor.__call__`] for + details. + + [What are input IDs?](../glossary#input-ids) + + qformer_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of input sequence tokens in the vocabulary of the language model. Input tokens can optionally be + provided to serve as text prompt, which the language model can continue. + + Indices can be obtained using [`InstructBlipVideoProcessor`]. See [`InstructBlipVideoProcessor.__call__`] for + details. 
+ + [What are input IDs?](../glossary#input-ids) + + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary of the language model. Only relevant in case an + encoder-decoder language model (like T5) is used. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. [What are decoder input IDs?](../glossary#decoder-input-ids) + + decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + + Only relevant in case an encoder-decoder language model (like T5) is used. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + interpolate_pos_encoding (`bool`, *optional*, defaults to `False`): + Whether to interpolate the pre-trained position encodings. +""" + + @add_start_docstrings( """ InstructBlipVideo Model for generating text given an image and an optional text prompt. 
The model consists of a vision @@ -1660,27 +1660,10 @@ def generate( ) generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1] - outputs = self.language_model.generate( - inputs_embeds=inputs_embeds, - attention_mask=attention_mask, - **generate_kwargs, - ) - - # this is a temporary workaround to be consistent with other generation models and - # have BOS as the first token, even though under the hood we are calling LM with embeds + inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask} if not self.language_model.config.is_encoder_decoder: - # the InstructBLIP authors used inconsistent tokenizer/model files during training, - # with the tokenizer's bos token being set to which has ID=2, - # whereas the model's text config has bos token id = 0 - bos_token_id = ( - 2 - if self.config.text_config.architectures[0] == "LLaMAForCausalLM" - else self.config.text_config.bos_token_id - ) - bos_tokens = torch.LongTensor([[bos_token_id]]).repeat(batch_size, 1).to(image_embeds.device) - if not isinstance(outputs, torch.Tensor): - outputs.sequences = torch.cat([bos_tokens, outputs.sequences], dim=-1) - else: - outputs = torch.cat([bos_tokens, outputs], dim=-1) + inputs["input_ids"] = input_ids + + outputs = self.language_model.generate(**inputs, **generate_kwargs) return outputs diff --git a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py index 2128f25df662..63c6c486854c 100644 --- a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py @@ -468,27 +468,10 @@ def generate( ) generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1] - outputs = self.language_model.generate( - inputs_embeds=inputs_embeds, - attention_mask=attention_mask, - **generate_kwargs, - ) - - # this is a 
temporary workaround to be consistent with other generation models and - # have BOS as the first token, even though under the hood we are calling LM with embeds + inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask} if not self.language_model.config.is_encoder_decoder: - # the InstructBLIP authors used inconsistent tokenizer/model files during training, - # with the tokenizer's bos token being set to which has ID=2, - # whereas the model's text config has bos token id = 0 - bos_token_id = ( - 2 - if self.config.text_config.architectures[0] == "LLaMAForCausalLM" - else self.config.text_config.bos_token_id - ) - bos_tokens = torch.LongTensor([[bos_token_id]]).repeat(batch_size, 1).to(image_embeds.device) - if not isinstance(outputs, torch.Tensor): - outputs.sequences = torch.cat([bos_tokens, outputs.sequences], dim=-1) - else: - outputs = torch.cat([bos_tokens, outputs], dim=-1) + inputs["input_ids"] = input_ids + + outputs = self.language_model.generate(**inputs, **generate_kwargs) return outputs diff --git a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py index 3e96d279a42f..1d4e59e26b46 100644 --- a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py @@ -63,8 +63,11 @@ class InstructBlipVideoProcessor(ProcessorMixin): qformer_tokenizer_class = "AutoTokenizer" def __init__(self, image_processor, tokenizer, qformer_tokenizer, num_query_tokens=None, **kwargs): - self.video_token = AddedToken("