diff --git a/.ci/spellcheck/.pyspelling.wordlist.txt b/.ci/spellcheck/.pyspelling.wordlist.txt index be4d45cc5aa..5152ea9b9c5 100644 --- a/.ci/spellcheck/.pyspelling.wordlist.txt +++ b/.ci/spellcheck/.pyspelling.wordlist.txt @@ -339,6 +339,7 @@ GenerationMixin Girshick Gitee GitHub +GGUF GLA GLM glm @@ -954,6 +955,7 @@ SML sml SmolDocling SmolVLM +SmolLM softmax softvc SoftVC diff --git a/notebooks/llm-question-answering/config.py b/notebooks/llm-question-answering/config.py index 4562de6e4b5..50d114bf9b5 100644 --- a/notebooks/llm-question-answering/config.py +++ b/notebooks/llm-question-answering/config.py @@ -22,4 +22,9 @@ "end_key": "<|eot_id|>", "prompt_template": "<|start_header_id|>system<|end_header_id|>\n\nBelow is an instruction that describes a task. Write a response that appropriately completes the request.<|eot_id|><|start_header_id|>user<|end_header_id|>Instruction: {instruction} Answer:<|eot_id|><|start_header_id|>assistant<|end_header_id|>", }, + "Qwen2.5-0.5B-Instruct-GGUF": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct-GGUF", + "gguf_file": ["qwen2.5-0.5b-instruct-fp16.gguf", "qwen2.5-0.5b-instruct-q4_0.gguf", "qwen2.5-0.5b-instruct-q8_0.gguf"], + }, + "SmolLM2-135M-GGUF": {"model_id": "prithivMLmods/SmolLM2-135M-GGUF", "gguf_file": ["SmolLM2-135M.F16.gguf", "SmolLM2-135M.Q8_0.gguf"]}, } diff --git a/notebooks/llm-question-answering/llm-question-answering.ipynb b/notebooks/llm-question-answering/llm-question-answering.ipynb index c6bd700e607..ce12a9f1f5f 100644 --- a/notebooks/llm-question-answering/llm-question-answering.ipynb +++ b/notebooks/llm-question-answering/llm-question-answering.ipynb @@ -20,16 +20,18 @@ "- Create an instruction-following inference pipeline with [Generate API](https://github.com/openvinotoolkit/openvino.genai)\n", "- Run instruction-following pipeline\n", "\n", - "\n", "#### Table of contents:\n", "\n", "- [Prerequisites](#Prerequisites)\n", "- [Select model for inference](#Select-model-for-inference)\n", - "- [Download and convert model to OpenVINO IR via Optimum Intel CLI](#Download-and-convert-model-to-OpenVINO-IR-via-Optimum-Intel-CLI)\n", - "- [Compress model weights](#Compress-model-weights)\n", + "- [login to huggingfacehub to get access to pretrained model](#login-to-huggingfacehub-to-get-access-to-pretrained-model)\n", + "- [Select device for inference and model variant](#Select-device-for-inference-and-model-variant)\n", + "- [Download and convert models](#Download-and-convert-models)\n", + " - [Convert model to OpenVINO IR via Optimum Intel CLI](#Convert-model-to-OpenVINO-IR-via-Optimum-Intel-CLI)\n", + " - [Compress model weights](#Compress-model-weights)\n", " - [Weights Compression using Optimum Intel CLI](#Weights-Compression-using-Optimum-Intel-CLI)\n", " - [Weights Compression using NNCF](#Weights-Compression-using-NNCF)\n", - "- [Select device for inference and model variant](#Select-device-for-inference-and-model-variant)\n", + " - [Download models](#Download-models)\n", "- [Create an instruction-following inference pipeline](#Create-an-instruction-following-inference-pipeline)\n", " - [Setup imports](#Setup-imports)\n", " - [Prepare text streamer to get results runtime](#Prepare-text-streamer-to-get-results-runtime)\n", @@ -66,7 +68,7 @@ "outputs": [], "source": [ "%pip uninstall -q -y optimum optimum-intel\n", - "%pip install -Uq \"openvino>=2024.3.0\" \"openvino-genai\"\n", + "%pip install -Uq \"openvino>=2025.3.0\" \"openvino-genai\"\n", "%pip install -q \"torch>=2.1\" \"nncf>=2.7\" \"transformers>=4.40.0\" 
\"huggingface-hub>=0.26.5\" \"onnx<1.16.2\" \"optimum>=1.16.1\" \"accelerate\" \"datasets>=2.14.6,<4.0.0\" \"gradio>=4.19\" \"git+https://github.com/huggingface/optimum-intel.git\" --extra-index-url https://download.pytorch.org/whl/cpu" ] }, @@ -89,12 +91,16 @@ "* **red-pajama-3b-instruct** - A 2.8B parameter pre-trained language model based on GPT-NEOX architecture. The model was fine-tuned for few-shot applications on the data of [GPT-JT](https://huggingface.co/togethercomputer/GPT-JT-6B-v1), with exclusion of tasks that overlap with the HELM core scenarios.More details about model can be found in [model card](https://huggingface.co/togethercomputer/RedPajama-INCITE-Instruct-3B-v1).\n", "* **mistral-7b** - The Mistral-7B-v0.2 Large Language Model (LLM) is a pretrained generative text model with 7 billion parameters. You can find more details about model in the [model card](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2), [paper](https://arxiv.org/abs/2310.06825) and [release blog post](https://mistral.ai/news/announcing-mistral-7b/).\n", "* **llama-3-8b-instruct** - Llama 3 is an auto-regressive language model that uses an optimized transformer architecture. The tuned versions use supervised fine-tuning (SFT) and reinforcement learning with human feedback (RLHF) to align with human preferences for helpfulness and safety. The Llama 3 instruction tuned models are optimized for dialogue use cases and outperform many of the available open source chat models on common industry benchmarks. More details about model can be found in [Meta blog post](https://ai.meta.com/blog/meta-llama-3/), [model website](https://llama.meta.com/llama3) and [model card](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct).\n", + "* **Qwen2.5-0.5B-Instruct-GGUF** - Qwen2.5 is the latest series of Qwen large language models. This model version has significant improvements in instruction following, generating long texts (over 8K tokens), understanding structured data (e.g., tables), and generating structured outputs, especially JSON. It is more resilient to the diversity of system prompts, enhancing role-play implementation and condition-setting for chatbots. The model supports long contexts of up to 128K tokens and can generate up to 8K tokens. It also provides multilingual support for over 29 languages, including Chinese, English, French, Spanish, Portuguese, German, Italian, Russian, Japanese, Korean, Vietnamese, Thai, Arabic, and more. Models in GGUF format and more details can be found in the [model card](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF).\n", + "* **SmolLM2-135M-GGUF** - SmolLM2 is a family of compact language models. They are capable of solving a wide range of tasks while being lightweight enough to run on-device. This version of the SmolLM models demonstrates significant advances in instruction following, knowledge, and reasoning. Models in GGUF format and more details can be found in the [model card](https://huggingface.co/prithivMLmods/SmolLM2-135M-GGUF).\n", ">**Note**: run model with demo, you will need to accept license agreement. \n", ">You must be a registered user in 🤗 Hugging Face Hub. Please visit [HuggingFace model card](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf), carefully read terms of usage and click accept button. You will need to use an access token for the code below to run. 
For more information on access tokens, refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens).\n", ">You can login on Hugging Face Hub in notebook environment, using following code:\n", " \n", "```python\n", " ## login to huggingfacehub to get access to pretrained model \n", + "[back to top ⬆️](#Table-of-contents:)\n", + "\n", + "\n", " from huggingface_hub import notebook_login, whoami\n", "\n", @@ -187,13 +193,22 @@ "device" ] }, + { + "cell_type": "markdown", + "id": "fe70aa5a", + "metadata": {}, + "source": [ + "## Download and convert models\n", + "[back to top ⬆️](#Table-of-contents:)" + ] + }, { "attachments": {}, "cell_type": "markdown", "id": "4e4fd394-b4fb-4eef-8bdc-d116572aa8f0", "metadata": {}, "source": [ - "## Download and convert model to OpenVINO IR via Optimum Intel CLI\n", + "### Convert model to OpenVINO IR via Optimum Intel CLI\n", "[back to top ⬆️](#Table-of-contents:)\n", "\n", "Listed model are available for downloading via the [HuggingFace hub](https://huggingface.co/models). We will use optimum-cli interface for exporting it into OpenVINO Intermediate Representation (IR) format.\n", @@ -207,7 +222,7 @@ "\n", "where `--model` argument is model id from HuggingFace Hub or local directory with model (saved using `.save_pretrained` method), `--task ` is one of [supported task](https://huggingface.co/docs/optimum/exporters/task_manager) that exported model should solve. For LLMs it will be `text-generation-with-past`. If model initialization requires to use remote code, `--trust-remote-code` flag additionally should be passed. Full list of supported arguments available via `--help` For more details and examples of usage, please check [optimum documentation](https://huggingface.co/docs/optimum/intel/inference#export).\n", "\n", - "## Compress model weights\n", + "### Compress model weights\n", "[back to top ⬆️](#Table-of-contents:)\n", "\n", "The Weights Compression algorithm is aimed at compressing the weights of the models and can be used to optimize the model footprint and performance of large models where the size of weights is relatively larger than the size of activations, for example, Large Language Models (LLM). Compared to INT8 compression, INT4 compression improves performance even more but introduces a minor drop in prediction quality.\n", @@ -228,40 +243,25 @@ ">**Note**: There may be no speedup for INT4/INT8 compressed models on dGPU" ] }, + { + "cell_type": "markdown", + "id": "7467f438", + "metadata": {}, + "source": [ + "### Download models\n", + "[back to top ⬆️](#Table-of-contents:)\n", + "\n", + "Some models have already been converted and published in [OpenVINO's models Collection](https://huggingface.co/collections/OpenVINO/llm-6687aaa2abca3bbcec71a9bd). These models can be downloaded with `huggingface_hub`: `hf_hub_download(repo_id=model_id, local_dir=local_path)`.\n", + "\n", + "The [GGUF model format is also supported in OpenVINO GenAI](https://openvinotoolkit.github.io/openvino.genai/docs/samples/c/text_generation?_highlight=gguf#using-gguf-models). [GGUF](https://github.com/ggml-org/ggml/blob/master/docs/gguf.md) is a binary format that is optimized for quick loading and saving of models, making it highly efficient for inference purposes. Models in GGUF format can also be downloaded with the `huggingface_hub` module; in this case, the filename must be specified: `hf_hub_download(repo_id=model_id, filename=gguf_filename, local_dir=gguf_file_path)`." 
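To make the new GGUF path concrete, the following is a minimal sketch of the flow the updated notebook implements for GGUF models: download a single GGUF file with `hf_hub_download` and pass it straight to the OpenVINO GenAI pipeline, with no IR conversion step. The repo id and filename are taken from the entries added to `config.py` above; the device and prompt are illustrative assumptions, not values fixed by the notebook.

```python
# Sketch of the GGUF flow in the updated notebook (repo id and filename come
# from config.py; "CPU" and the prompt are illustrative assumptions).
from pathlib import Path

import openvino_genai as ov_genai
from huggingface_hub import hf_hub_download

repo_id = "Qwen/Qwen2.5-0.5B-Instruct-GGUF"
gguf_filename = "qwen2.5-0.5b-instruct-q4_0.gguf"
local_dir = Path("Qwen2.5-0.5B-Instruct-GGUF") / "gguf_models"

# hf_hub_download returns the local path of the file and skips the download
# when the file is already present in local_dir.
gguf_path = hf_hub_download(repo_id=repo_id, filename=gguf_filename, local_dir=local_dir)

# The GGUF file is passed to LLMPipeline directly, without exporting to OpenVINO IR.
pipe = ov_genai.LLMPipeline(str(gguf_path), "CPU")
print(pipe.generate("What is OpenVINO?", max_new_tokens=64))
```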
+ ] + }, { "cell_type": "code", "execution_count": 5, "id": "f81602ca-4674-4b61-b2c8-ca11631428b1", "metadata": {}, "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "18f82da95c7048fe9377f0546e4e885a", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Checkbox(value=True, description='Prepare INT4 model')" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "204a990e71c84893aab873b0c3fc2f0a", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Checkbox(value=False, description='Prepare INT8 model')" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "application/vnd.jupyter.widget-view+json": { @@ -273,32 +273,44 @@ "Checkbox(value=False, description='Prepare FP16 model')" ] }, + "execution_count": 5, "metadata": {}, - "output_type": "display_data" + "output_type": "execute_result" } ], "source": [ "from IPython.display import display, Markdown\n", "\n", - "prepare_int4_model = widgets.Checkbox(\n", - " value=True,\n", - " description=\"Prepare INT4 model\",\n", - " disabled=False,\n", - ")\n", - "prepare_int8_model = widgets.Checkbox(\n", - " value=False,\n", - " description=\"Prepare INT8 model\",\n", - " disabled=device.value == \"NPU\",\n", - ")\n", - "prepare_fp16_model = widgets.Checkbox(\n", - " value=False,\n", - " description=\"Prepare FP16 model\",\n", - " disabled=False,\n", - ")\n", + "quatization_levels = {}\n", + "if \"gguf_file\" in model_configuration:\n", + " for gguf_file in model_configuration[\"gguf_file\"]:\n", + " if device.value == \"NPU\" or \"fp16\" in gguf_file.lower() or \"f16\" in gguf_file.lower():\n", + " quatization_levels[gguf_file] = widgets.Checkbox(\n", + " value=True, description=f\"Prepare {gguf_file} model\", disabled=False, layout=widgets.Layout(width=\"auto\")\n", + " )\n", + " else:\n", + " quatization_levels[gguf_file] = widgets.Checkbox(\n", + " value=False, description=f\"Prepare {gguf_file} model\", disabled=False, layout=widgets.Layout(width=\"auto\")\n", + " )\n", + "else:\n", + " quatization_levels[\"INT4\"] = widgets.Checkbox(\n", + " value=True,\n", + " description=\"Prepare INT4 model\",\n", + " disabled=False,\n", + " )\n", + " quatization_levels[\"INT8\"] = widgets.Checkbox(\n", + " value=False,\n", + " description=\"Prepare INT8 model\",\n", + " disabled=device.value == \"NPU\",\n", + " )\n", + " quatization_levels[\"FP16\"] = widgets.Checkbox(\n", + " value=False,\n", + " description=\"Prepare FP16 model\",\n", + " disabled=False,\n", + " )\n", "\n", - "display(prepare_int4_model)\n", - "display(prepare_int8_model)\n", - "display(prepare_fp16_model)" + "quatization_level_selections = widgets.VBox(list(quatization_levels.values()))\n", + "quatization_level_selections" ] }, { @@ -321,6 +333,8 @@ "import openvino as ov\n", "import nncf\n", "\n", + "from huggingface_hub import hf_hub_download\n", + "\n", "nncf.set_log_level(logging.ERROR)\n", "\n", "pt_model_id = model_configuration[\"model_id\"]\n", @@ -329,9 +343,16 @@ "int4_model_dir = Path(model_id.value) / \"INT4_compressed_weights\"\n", "int4_npu_friendly = Path(model_id.value) / \"INT4_NPU_compressed_weights\"\n", "\n", + "gguf_file_path = Path(model_id.value) / \"gguf_models\"\n", + "\n", "core = ov.Core()\n", "\n", "\n", + "def load_gguf_model(gguf_filename):\n", + " if not (gguf_file_path / gguf_filename).exists():\n", + " hf_hub_download(repo_id=pt_model_id, filename=gguf_filename, 
local_dir=gguf_file_path)\n", + "\n", + "\n", "def convert_to_fp16():\n", " if (fp16_model_dir / \"openvino_model.xml\").exists():\n", " return\n", @@ -373,7 +394,7 @@ " },\n", " }\n", "\n", - " int4_result_model_dir = int4_model_dir if device.value != \"NPU\" else int4_npu_friendly\n", + " int4_result_model_full_path = int4_model_dir if device.value != \"NPU\" else int4_npu_friendly\n", "\n", " model_compression_params = compression_configs.get(model_id.value, compression_configs[\"default\"])\n", " if device.value == \"NPU\":\n", @@ -381,25 +402,30 @@ " model_compression_params[\"sym\"] = True\n", " model_compression_params[\"ratio\"] = 1.0\n", "\n", - " if (int4_result_model_dir / \"openvino_model.xml\").exists():\n", + " if (int4_result_model_full_path / \"openvino_model.xml\").exists():\n", " return\n", " export_command_base = \"optimum-cli export openvino --model {} --task text-generation-with-past --weight-format int4\".format(pt_model_id)\n", " int4_compression_args = \" --group-size {} --ratio {}\".format(model_compression_params[\"group_size\"], model_compression_params[\"ratio\"])\n", " if model_compression_params[\"sym\"]:\n", " int4_compression_args += \" --sym\"\n", " export_command_base += int4_compression_args\n", - " export_command = export_command_base + \" \" + str(int4_result_model_dir)\n", + " export_command = export_command_base + \" \" + str(int4_result_model_full_path)\n", " display(Markdown(\"**Export command:**\"))\n", " display(Markdown(f\"`{export_command}`\"))\n", " ! $export_command\n", "\n", "\n", - "if prepare_fp16_model.value:\n", - " convert_to_fp16()\n", - "if prepare_int8_model.value:\n", - " convert_to_int8()\n", - "if prepare_int4_model.value:\n", - " convert_to_int4()" + "for i, (key, value) in enumerate(quatization_levels.items()):\n", + " if value.value:\n", + " if \"gguf\" in key:\n", + " load_gguf_model(key)\n", + " else:\n", + " if \"FP16\" in key:\n", + " convert_to_fp16()\n", + " if \"INT8\" in key:\n", + " convert_to_int8()\n", + " if \"INT4\" in key:\n", + " convert_to_int4()" ] }, { @@ -430,6 +456,10 @@ } ], "source": [ + "if gguf_file_path.exists():\n", + " for model_file in gguf_file_path.glob(\"*.gguf\"):\n", + " print(f\"Size of model with {model_file} weights is {model_file.stat().st_size / 1024 / 1024:.2f} MB\")\n", + "\n", "fp16_weights = fp16_model_dir / \"openvino_model.bin\"\n", "int8_weights = int8_model_dir / \"openvino_model.bin\"\n", "int4_weights = (int4_model_dir if not device.value == \"NPU\" else int4_npu_friendly) / \"openvino_model.bin\"\n", @@ -476,11 +506,11 @@ "if fp16_model_dir.exists():\n", " available_models.append(\"FP16\")\n", "\n", + "for model_file in gguf_file_path.glob(\"*.gguf\"):\n", + " available_models.append(model_file.name)\n", + "\n", "model_to_run = widgets.Dropdown(\n", - " options=available_models,\n", - " value=available_models[0],\n", - " description=\"Model to run:\",\n", - " disabled=False,\n", + " options=available_models, value=available_models[0], description=\"Model to run:\", disabled=False, layout=widgets.Layout(width=\"300px\")\n", ")\n", "\n", "model_to_run" @@ -497,19 +527,25 @@ "from openvino_tokenizers import convert_tokenizer\n", "\n", "if model_to_run.value == \"INT4\":\n", - " model_dir = int4_model_dir if device.value != \"NPU\" else int4_npu_friendly\n", + " model_full_path = int4_model_dir if device.value != \"NPU\" else int4_npu_friendly\n", "elif model_to_run.value == \"INT8\":\n", - " model_dir = int8_model_dir\n", + " model_full_path = int8_model_dir\n", + "elif 
model_to_run.value == \"FP16\":\n", + " model_full_path = fp16_model_dir\n", "else:\n", - " model_dir = fp16_model_dir\n", - "print(f\"Loading model from {model_dir}\")\n", + " model_full_path = gguf_file_path / model_to_run.value\n", + "print(f\"Loading model from {model_full_path}\")\n", "\n", "# optionally convert tokenizer if used cached model without it\n", - "if not (model_dir / \"openvino_tokenizer.xml\").exists() or not (model_dir / \"openvino_detokenizer.xml\").exists():\n", - " hf_tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)\n", + "if model_full_path.suffix != \".gguf\" and (\n", + " not (model_full_path / \"openvino_tokenizer.xml\").exists() or not (model_full_path / \"openvino_detokenizer.xml\").exists()\n", + "):\n", + " hf_tokenizer = AutoTokenizer.from_pretrained(model_full_path, trust_remote_code=True)\n", " ov_tokenizer, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True)\n", - " ov.save_model(ov_tokenizer, model_dir / \"openvino_tokenizer.xml\")\n", - " ov.save_model(ov_tokenizer, model_dir / \"openvino_detokenizer.xml\")" + " if not (model_full_path / \"openvino_tokenizer.xml\").exists():\n", + " ov.save_model(ov_tokenizer, model_full_path / \"openvino_tokenizer.xml\")\n", + " if not (model_full_path / \"openvino_detokenizer.xml\").exists():\n", + " ov.save_model(ov_detokenizer, model_full_path / \"openvino_detokenizer.xml\")" ] }, { @@ -539,7 +575,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "f1f295d4", "metadata": {}, "outputs": [ @@ -568,7 +604,7 @@ "source": [ "import openvino_genai as ov_genai\n", "\n", - "pipe = ov_genai.LLMPipeline(model_dir.as_posix(), device.value)\n", + "pipe = ov_genai.LLMPipeline(model_full_path.as_posix(), device.value)\n", "print(pipe.generate(\"The Sun is yellow bacause\", temperature=1.2, top_k=4, do_sample=True, max_new_tokens=150))" ] }, @@ -640,21 +676,18 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "id": "e2638c5b-47ad-4213-80da-8cfc2659b3aa", "metadata": {}, "outputs": [], "source": [ "core = ov.Core()\n", "\n", - "detokinizer_dir = Path(model_dir, \"openvino_detokenizer.xml\")\n", - "\n", "\n", "class TextIteratorStreamer(ov_genai.StreamerBase):\n", " def __init__(self, tokenizer):\n", " super().__init__()\n", " self.tokenizer = tokenizer\n", - " self.compiled_detokenizer = core.compile_model(detokinizer_dir.as_posix())\n", " self.text_queue = Queue()\n", " self.stop_signal = None\n", "\n", @@ -669,8 +702,8 @@ " return value\n", "\n", " def put(self, token_id):\n", - " openvino_output = self.compiled_detokenizer(np.array([[token_id]], dtype=int))\n", - " text = str(openvino_output[\"string_output\"][0])\n", + " openvino_output = self.tokenizer.decode([token_id])\n", + " text = str(openvino_output)\n", " # remove labels/special symbols\n", " text = text.lstrip(\"!\")\n", " text = re.sub(\"<.*>\", \"\", text)\n",
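The streamer change above drops the separately compiled `openvino_detokenizer.xml` and instead decodes tokens with the `openvino_genai` tokenizer attached to the pipeline, which also works for GGUF models that have no tokenizer IR files on disk. Below is a small sketch of that decode path; it assumes the `pipe` object created earlier in the notebook and only illustrates the per-token decoding the streamer performs.

```python
# Sketch: per-token decoding via the GenAI tokenizer bundled with the pipeline
# (assumes `pipe` is the ov_genai.LLMPipeline created earlier in the notebook).
genai_tokenizer = pipe.get_tokenizer()

# Round-trip a short string token by token, mimicking what put() does for each token id.
token_ids = genai_tokenizer.encode("OpenVINO GenAI streaming").input_ids.data[0].tolist()
pieces = [genai_tokenizer.decode([int(t)]) for t in token_ids]
print(pieces)            # per-token text fragments, as the streamer emits them
print("".join(pieces))   # roughly reconstructs the input (spacing may differ per tokenizer)
```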