diff --git a/examples/idefics/README.md b/examples/idefics/README.md
new file mode 100644
index 00000000..9cc36164
--- /dev/null
+++ b/examples/idefics/README.md
@@ -0,0 +1,12 @@
+# IDEFICS Demos/examples
+
+## Inference
+- [Normal inference](inference.py) (needs ~20GB GPU memory)
+- [4-bit quantized inference](inference_4bit.py) (needs ~7GB GPU memory)
+
+## Finetuning
+
+The following demos use the image captioning task:
+
+- [PEFT (LoRA) finetuning (notebook)](finetune_image_captioning_peft.ipynb) (fits on Google Colab)
+- [Normal finetuning](finetune_image_captioning.py) (needs ~40GB GPU memory)
diff --git a/examples/idefics/finetune_image_captioning.py b/examples/idefics/finetune_image_captioning.py
new file mode 100644
index 00000000..9786d8d5
--- /dev/null
+++ b/examples/idefics/finetune_image_captioning.py
@@ -0,0 +1,131 @@
+# adapted from https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/pytorch/image_captioning.ipynb
+
+# This example demonstrates normal finetuning (w/o PEFT). To keep the memory requirements small it
+# freezes the original pre-trained text and vision layers, which brings the requirement down to
+# about 40GB. If you have multiple GPUs, you can remove the freezing calls and finetune the whole
+# model. Alternatively, use the PEFT solution shown in the finetune_image_captioning_peft.ipynb
+# notebook, which requires only ~20GB to finetune the whole model.
+
+import torch
+import torchvision.transforms as transforms
+
+from datasets import load_dataset
+from PIL import Image
+from transformers import IdeficsForVisionText2Text, AutoProcessor, Trainer, TrainingArguments
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+checkpoint = "HuggingFaceM4/idefics-9b"
+# checkpoint = "HuggingFaceM4/tiny-random-idefics"
+
+processor = AutoProcessor.from_pretrained(checkpoint)
+model = IdeficsForVisionText2Text.from_pretrained(checkpoint, torch_dtype=torch.bfloat16).to(device)
+
+# freeze the original text and vision models and finetune only the layers added by IDEFICS
+# you can unfreeze the whole model, but it'll require multiple GPUs to finetune
+model.model.freeze_text_layers()
+model.model.freeze_vision_layers()
+
+# helper util
+def check_inference():
+    url = "https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/pokemon.png"
+    prompts = [
+        url,
+        "Question: What's on the picture? Answer:",
+    ]
+
+    inputs = processor(prompts, return_tensors="pt").to(device)
+    generated_ids = model.generate(**inputs, max_length=150)
+    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+    print(generated_text)
+
+# check generation before finetuning
+check_inference()
+# well, actually it looks like the model is already aware of pokemon - but this dataset will refine it further
+
+# finetune the model on the pokemon types dataset
+ds = load_dataset("GabeHD/pokemon-type-captions")
+ds = ds["train"].train_test_split(test_size=0.1)
+train_ds = ds["train"]
+eval_ds = ds["test"]
+
+def convert_to_rgb(image):
+    # `image.convert("RGB")` would only work for .jpg images, as it creates a wrong background
+    # for transparent images.
The call to `alpha_composite` handles this case + if image.mode == "RGB": + return image + + image_rgba = image.convert("RGBA") + background = Image.new("RGBA", image_rgba.size, (255, 255, 255)) + alpha_composite = Image.alpha_composite(background, image_rgba) + alpha_composite = alpha_composite.convert("RGB") + return alpha_composite + +def ds_transforms(example_batch): + image_size = processor.image_processor.image_size + image_mean = processor.image_processor.image_mean + image_std = processor.image_processor.image_std + + image_transform = transforms.Compose([ + convert_to_rgb, + transforms.RandomResizedCrop((image_size, image_size), scale=(0.9, 1.0), interpolation=transforms.InterpolationMode.BICUBIC), + transforms.ToTensor(), + transforms.Normalize(mean=image_mean, std=image_std), + ]) + + prompts = [] + for i in range(len(example_batch)): + prompts.append( + [ + example_batch["image"][i], + f"Question: What's on the picture? Answer: {example_batch['text'][i]}\n", + ], + ) + + inputs = processor(prompts, transform=image_transform, return_tensors="pt").to(device) + + inputs["labels"] = inputs["input_ids"] + + return inputs + +train_ds.set_transform(ds_transforms) +eval_ds.set_transform(ds_transforms) + +model_name = checkpoint.split("/")[1] + +# this setup requires about 40GB of gpu memory +training_args = TrainingArguments( + output_dir=f"{model_name}-pokemon", + learning_rate=5e-6, + num_train_epochs=10, + bf16=True, + per_device_train_batch_size=32, + per_device_eval_batch_size=32, + gradient_accumulation_steps=2, + dataloader_pin_memory=False, + save_total_limit=3, + evaluation_strategy="steps", + save_strategy="steps", + save_steps=1000, # don't save until ready... + eval_steps=40, + logging_steps=40, + remove_unused_columns=False, + push_to_hub=False, + label_names=["labels"], + load_best_model_at_end=True, + report_to=None, +) + +trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_ds, + eval_dataset=eval_ds, +) + +trainer.train() + +# check generation again after finetuning +check_inference() + +# after finetuning ideally we want generate to produce something like: a drawing of a pink and blue pokemon diff --git a/examples/idefics/finetune_image_captioning_peft.ipynb b/examples/idefics/finetune_image_captioning_peft.ipynb new file mode 100644 index 00000000..0a575cc0 --- /dev/null +++ b/examples/idefics/finetune_image_captioning_peft.ipynb @@ -0,0 +1,3347 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "UNCNPVi8iAgw" + }, + "source": [ + "# IDEFICS: A Flamingo-based model, trained at scale for the community\n", + "# Finetuning Demo Notebook:\n", + "\n", + "
\n", + "
\n", + "
\n", + " \"Idefics\n", + "
\n", + "\n", + "Credit: [Flamingo blog](https://www.deepmind.com/blog/tackling-multiple-tasks-with-a-single-visual-language-model)\n", + "\n", + "This google colab notebook shows how to run predictions with the 4-bit quantized πŸ€— [Idefics-9B model](https://huggingface.co/HuggingFaceM4/idefics-9b) and finetune it on a specific dataset.\n", + "\n", + "[IDEFICS](https://huggingface.co/HuggingFaceM4/idefics-80b) is a multi-modal model based on the [Flamingo](https://arxiv.org/abs/2204.14198) architecture. It can take images and texts as input and return text outputs but it does not support image generation. \\\\\n", + "IDEFICS is built on top of two unimodal open-access pre-trained models to connect the two modalities. Newly initialized parameters in the form of Transformer blocks bridge the gap between the vision encoder and the language model. The model is trained on a mixture of image/text pairs and unstrucutred multimodal web documents. \\\\\n", + "The [finetuned versions](https://huggingface.co/HuggingFaceM4/idefics-80b-instruct) of IDEFICS behave like LLM chatbots while also understanding visual input. \\\\\n", + "You can play with the [demo here](https://huggingface.co/spaces/HuggingFaceM4/idefics_playground)\n", + "\n", + "The code for this notebook was contributed to by *LΓ©o Tronchon, Younes Belkada, and Stas Bekman*, the IDEFICS model has been contributed to by: *Lucile Saulnier, LΓ©o Tronchon, Hugo LaurenΓ§on, Stas Bekman, Amanpreet Singh, Siddharth Karamcheti, and Victor Sanh*" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Install and import necessary libraries" + ], + "metadata": { + "id": "7m9zw1wcCC8e" + } + }, + { + "cell_type": "code", + "source": [ + "!pip install -q datasets\n", + "!pip install -q git+https://github.com/huggingface/transformers.git@add-model-idefics\n", + "!pip install -q bitsandbytes sentencepiece accelerate loralib\n", + "!pip install -q -U git+https://github.com/huggingface/peft.git" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "prXRsUiXCII9", + "outputId": "3b9da6dd-365b-484d-9d37-a723eee947de" + }, + "execution_count": 1, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m519.3/519.3 kB\u001b[0m \u001b[31m6.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m6.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m9.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m8.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m268.8/268.8 kB\u001b[0m \u001b[31m10.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m17.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m31.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for transformers (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m92.6/92.6 MB\u001b[0m \u001b[31m11.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m67.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m244.2/244.2 kB\u001b[0m \u001b[31m25.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + " Building wheel for peft (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n" + ] + } + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "MxoHmx-HfAgf" + }, + "outputs": [], + "source": [ + "import torch\n", + "from datasets import load_dataset\n", + "from peft import LoraConfig, get_peft_model\n", + "from PIL import Image\n", + "from transformers import IdeficsForVisionText2Text, AutoProcessor, Trainer, TrainingArguments, BitsAndBytesConfig\n", + "import torchvision.transforms as transforms" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DP_ilre6jI6l" + }, + "source": [ + "# Load quantized model\n", + "First get the quantized version of the model. 
This will allow us to use the 9B version of Idefics with a single 16GB gpu\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 84, + "referenced_widgets": [ + "cf454254fbc74724a6909e60d82f86a3", + "561b1b43dbc1484784ea2abed7278c08", + "996e2ae7de594ccc968ce83382786365", + "7e72c1fdf039470f8b14859034c7942f", + "f34958207dca46fd9aa044912ec9fddb", + "0fa55920c3a54b30aca74aa7247fe2ea", + "119ec52a3ce54b0d9565a0d44e731850", + "27e2b5c562174873bb966f1408727058", + "008e6d4c958149819fd7e64e30f79e39", + "9302d5fbae224b999a0c3fcb3f34beb3", + "8c82d2f9f97047478d8399b2aee3389f" + ] + }, + "id": "IRiT0q0Ck-3Y", + "outputId": "52bc69ec-32ec-45d7-b1a2-1a7af0539506" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/transformers/models/auto/processing_auto.py:203: FutureWarning: The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "Loading checkpoint shards: 0%| | 0/19 [00:00\", \"\"]\n", + " if len(bad_words) > 0:\n", + " bad_words_ids = tokenizer(bad_words, add_special_tokens=False).input_ids\n", + "\n", + " eos_token = \"\"\n", + " eos_token_id = tokenizer.convert_tokens_to_ids(eos_token)\n", + "\n", + " inputs = processor(prompts, return_tensors=\"pt\").to(device)\n", + " generated_ids = model.generate(**inputs, eos_token_id=[eos_token_id], bad_words_ids=bad_words_ids, max_new_tokens=max_new_tokens, early_stopping=True)\n", + " generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]\n", + " print(generated_text)" + ], + "metadata": { + "id": "J5MSZ3xdPF4f" + }, + "execution_count": 16, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RYA2HKGC0n9d" + }, + "source": [ + "\n", + "Let's run prediction with the quantized model for the image below which pictures two kittens. \\\\\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "id": "6I_iDtQN03jE", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "a4a77c65-186a-45e0-f819-3ea3d9d319c0" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Question: What's on the picture? Answer: Two kittens.\n" + ] + } + ], + "source": [ + "url = \"https://hips.hearstapps.com/hmg-prod/images/cute-photos-of-cats-in-grass-1593184777.jpg\"\n", + "prompts = [\n", + " # \"Instruction: provide an answer to the question. Use the image to answer.\\n\",\n", + " url,\n", + " \"Question: What's on the picture? Answer:\",\n", + "]\n", + "check_inference(model, processor, prompts, max_new_tokens=5)\n" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Now let's see how the model fares on pokemon knowledge before we try to finetune it further. \\\\\n", + "\n" + ], + "metadata": { + "id": "DLiwPnGBxiJf" + } + }, + { + "cell_type": "code", + "source": [ + "# check generation before finetuning\n", + "\n", + "url = \"https://images.pokemontcg.io/pop6/2_hires.png\"\n", + "prompts = [\n", + " url,\n", + " \"Question: What's on the picture? 
Answer:\",\n", + "]\n", + "check_inference(model, processor, prompts, max_new_tokens=100)\n", + "# It looks like the model is already aware of pokemon - but it could be more specific, and less repetitive" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "lDVDUE1ew7tZ", + "outputId": "37ba5c61-c607-4282-e57b-25cada593391" + }, + "execution_count": 18, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Question: What's on the picture? Answer: Lucario\n", + "\n", + "Lucario is a PokΓ©mon that is a combination of a bear and a lion. It is a PokΓ©mon that is a combination of a bear and a lion. It is a PokΓ©mon that is a combination of a bear and a lion. It is a PokΓ©mon that is a combination of a bear and a lion. It is a PokΓ©mon that is a combination of a bear and a lion. It is a Pok\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Finetuning dataset\n", + "Prepare the dataset that will be used for finetuning\n" + ], + "metadata": { + "id": "ydBhQT6SQiWy" + } + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 177, + "referenced_widgets": [ + "eac0761e22a84275aaee5d7ec7929da6", + "ba24eb82f1194ecab3514466eca8a2b8", + "52997c23e16a4f8aa220909e99b5452e", + "6b7767dc6c5b45a89f7becfe5fcf81d7", + "050b365a82b0412b83918f9f9603bf2f", + "39c0d7023e574db9a55eb7e82913d4ed", + "9bdbd4871dcd49a5bbfaa86b813e9a36", + "7bfee1d1c4134316af5b82cd354457ba", + "594fd06a2b07443a9ce27200468d5fe3", + "55de5af50af247cd93da17057661fd6c", + "450f2b15f9df4f72b23c4f916bc18f3b", + "22df8e4fce3b470b94fdce6e7b77a9ea", + "cd89e195d2bb4537889ec8cc9e7a815e", + "196951cc2fdd43d4a153de2666067cd0", + "aaf9e7678c174fe8820c5c0bebb6bb1e", + "699f568cedd846f590efa2500dd8b3a9", + "a0f3836eb674483295fbb147065b74fc", + "da922717666a496da59a4cf8840e6554", + "cc9ddc6c56324dd59cfb8bc9649fea28", + "830ec2345d9a4be88b486ad24bfc3b10", + "05b63fe3c99c417fb6bcdb450081bff8", + "e2872821e4e84271b32b8c8c8c093bfc", + "782d656769144ef9b48a3a37de81abb5", + "eb2f4bcb78534f4d9f9e2ccb52e738b7", + "a2bcf8164d904dcbada2196189b332be", + "99b5b2cd3f104c72b5ee880fe1d0e9b9", + "3197f87aadd5422cbb9804b0843ffc48", + "5cdf7a7b08cc46f5a4b2da143ba39bb6", + "8f335a7d85574c11b183fb700aeac5c3", + "6b96186a1ccb4e24b491b5849ac90c50", + "3a845c0efe954da1a47e77740f8623ff", + "4c8f47c325a54f52abab545362f36c43", + "7c1dc629e6dc4048b1b88a224c9a352d", + "da84172eaff34e61ac902681dbd364ca", + "2796bada5f6748b6af59f6b14b0957af", + "400b852ef365473cad76663421954c86", + "fd58bb90108a4486967a217eb3bc4389", + "b96a2d9afc324a4eb52f7a04caab630a", + "7c20b8d8e3b14504bba903e68d043e79", + "c8bc395e18e14492ae40ec6ff21a18d1", + "6c85b036e1be434faa2d515bed62e228", + "da15ec7761a847678dc696b214c67ada", + "03d2d213eb2a4c819bbcf8457e11904b", + "f651fffdc274473a85ed701097afaa1f", + "3fbc282a30cc49b99f335216df028cd6", + "651249802d0249479eb1700e600f9a5a", + "31e2d7d5057a4dfa96a65888697e9923", + "cbaf9ba59da24341a933c3c7473a3b7d", + "ee26c8314e6742a88cd59429f3d5b745", + "dd9e81eb4e3d45cca5c6e2b1e6cf335d", + "a7c9efe8c49a43d0ba6929bada9f78c2", + "15d3af1073fe4447847d0e6f3543f953", + "e4daf9a3e9e14e93ab55b91da59ecc9b", + "3557bb8fc4064fdf99ca2a1ec5469cff", + "b2ccec96efa1415fa4623ec8fa0f2c21" + ] + }, + "id": "5iZAz655m8Q9", + "outputId": "6524cedf-f0f1-43fa-d5dc-2b4f2d8f6eb1" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "Downloading readme: 0%| | 0.00/2.77k [00:00\",\n", + " ],\n", 
+ " )\n", + "\n", + " inputs = processor(prompts, transform=image_transform, return_tensors=\"pt\").to(device)\n", + "\n", + " inputs[\"labels\"] = inputs[\"input_ids\"]\n", + "\n", + " return inputs\n", + "\n", + "\n", + "# load and prepare dataset\n", + "ds = load_dataset(\"TheFusion21/PokemonCards\")\n", + "ds = ds[\"train\"].train_test_split(test_size=0.002)\n", + "train_ds = ds[\"train\"]\n", + "eval_ds = ds[\"test\"]\n", + "train_ds.set_transform(ds_transforms)\n", + "eval_ds.set_transform(ds_transforms)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# LoRA\n", + "After specifying the low-rank adapters (LoRA) config, we load the PeftModel using the get_peft_model utility function" + ], + "metadata": { + "id": "Kui4EkCmOQzd" + } + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "id": "jKa5oTorp_A-" + }, + "outputs": [], + "source": [ + "model_name = checkpoint.split(\"/\")[1]\n", + "config = LoraConfig(\n", + " r=16,\n", + " lora_alpha=32,\n", + " target_modules=[\"q_proj\", \"k_proj\", \"v_proj\"],\n", + " lora_dropout=0.05,\n", + " bias=\"none\",\n", + ")\n", + "model = get_peft_model(model, config)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ShuZJ5K2pYoL", + "outputId": "6c22299b-5584-4994-c906-e9d031b40ad1" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "trainable params: 19,750,912 || all params: 8,949,430,544 || trainable%: 0.2206946230030432\n" + ] + } + ], + "source": [ + "model.print_trainable_parameters()" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Training\n", + "Finally, using the Hugging Face Trainer, we can finetune the model! \\\\\n", + "For the sake of the demo, we have set the max_steps at 40. That's about 0.05 epoch on this dataset, so feel free to tune further!" + ], + "metadata": { + "id": "0Ok1sOZKQ29s" + } + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 155 + }, + "id": "9cD3OuygpR5l", + "outputId": "a8238139-59c3-49cb-c654-4aacb010dd7a" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [40/40 06:32, Epoch 0/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining LossValidation Loss
201.4500000.880157
400.7020000.675355

" + ] + }, + "metadata": {} + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "TrainOutput(global_step=40, training_loss=1.0759869813919067, metrics={'train_runtime': 403.1999, 'train_samples_per_second': 1.587, 'train_steps_per_second': 0.099, 'total_flos': 1445219210656320.0, 'train_loss': 1.0759869813919067, 'epoch': 0.05})" + ] + }, + "metadata": {}, + "execution_count": 23 + } + ], + "source": [ + "training_args = TrainingArguments(\n", + " output_dir=f\"{model_name}-pokemon\",\n", + " learning_rate=2e-4,\n", + " fp16=True,\n", + " per_device_train_batch_size=2,\n", + " per_device_eval_batch_size=2,\n", + " gradient_accumulation_steps=8,\n", + " dataloader_pin_memory=False,\n", + " save_total_limit=3,\n", + " evaluation_strategy=\"steps\",\n", + " save_strategy=\"steps\",\n", + " save_steps=40,\n", + " eval_steps=20,\n", + " logging_steps=20,\n", + " max_steps=40,\n", + " remove_unused_columns=False,\n", + " push_to_hub=False,\n", + " label_names=[\"labels\"],\n", + " load_best_model_at_end=True,\n", + " report_to=None,\n", + " optim=\"paged_adamw_8bit\",\n", + ")\n", + "\n", + "trainer = Trainer(\n", + " model=model,\n", + " args=training_args,\n", + " train_dataset=train_ds,\n", + " eval_dataset=eval_ds,\n", + ")\n", + "\n", + "trainer.train()" + ] + }, + { + "cell_type": "code", + "source": [ + "# check generation again after finetuning\n", + "check_inference(model, processor, prompts, max_new_tokens=100)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "v6NZ47vYTr-z", + "outputId": "8807a1dc-e37e-4c36-da02-507029a546ab" + }, + "execution_count": 24, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Question: What's on the picture? Answer: This is Lucario. A Stage 2 Pokemon Card of type Fighting with the title Lucario and 90 HP of rarity Rare evolved from Pikachu from the set Neo Destiny and the flavor text: It can use its tail as a whip\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Push your new model to the hub!\n" + ], + "metadata": { + "id": "zgqonle8AdPs" + } + }, + { + "cell_type": "code", + "source": [ + "# Insert your \"write\" token. You should find it in the settings of your HF profile\n", + "!huggingface-cli login" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "KrnB4kFxAjIA", + "outputId": "8370ee48-9b3d-446b-b69a-c3cec93f61fd" + }, + "execution_count": 28, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + " _| _| _| _| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _|_|_|_| _|_| _|_|_| _|_|_|_|\n", + " _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|\n", + " _|_|_|_| _| _| _| _|_| _| _|_| _| _| _| _| _| _|_| _|_|_| _|_|_|_| _| _|_|_|\n", + " _| _| _| _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|\n", + " _| _| _|_| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _| _| _| _|_|_| _|_|_|_|\n", + " \n", + " A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.\n", + " Setting a new token will erase the existing one.\n", + " To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .\n", + "Token: \n", + "Add token as git credential? 
(Y/n) Y\n", + "Token is valid (permission: write).\n", + "\u001b[1m\u001b[31mCannot authenticate through git-credential as no helper is defined on your machine.\n", + "You might have to re-authenticate when pushing to the Hugging Face Hub.\n", + "Run the following command in your terminal in case you want to set the 'store' credential helper as default.\n", + "\n", + "git config --global credential.helper store\n", + "\n", + "Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.\u001b[0m\n", + "Token has not been saved to git credential helper.\n", + "Your token has been saved to /root/.cache/huggingface/token\n", + "Login successful\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "model.push_to_hub(f\"{model_name}-pokemon\", private=False)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 66, + "referenced_widgets": [ + "73bdfdf8980d4c358c90d574eb91bef5", + "3f49f9009fa14fe3b87bb123491a4b0f", + "548fc33764964fe9a0498194df85b768", + "e60ce018bf3a4b15941062300143e2a3", + "d0a78497d9694dc6b7e903392daf6a26", + "5b585e82891a40b0826679a79583ee7c", + "db9f5a1c1a0a49b3b58a30f0a74c3329", + "641cf05e799e4ae89ec84fdf8c225b93", + "cf937013fade482f90bd599eced8bfb4", + "ee49f8d2b11f43e2bb30d27407744ed3", + "ef2f2655d7b9432f983ae508f6dd4e0b" + ] + }, + "id": "_jFKg3iP172d", + "outputId": "2b58ecb2-fe97-4a6c-bd2c-7fdaaf03e99a" + }, + "execution_count": 29, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "adapter_model.bin: 0%| | 0.00/79.2M [00:00