diff --git a/gemini/evaluation/auto_alignment_evaluation_of_llm_output.ipynb b/gemini/evaluation/auto_alignment_evaluation_of_llm_output.ipynb new file mode 100644 index 00000000000..528676e9000 --- /dev/null +++ b/gemini/evaluation/auto_alignment_evaluation_of_llm_output.ipynb @@ -0,0 +1,1962 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2025 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "# Auto Alignment Evaluation of LLM Output\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Google
Open in Colab\n", + "
\n", + "
\n", + " \n", + " \"Google
Open in Colab Enterprise\n", + "
\n", + "
\n", + " \n", + " \"Vertex
Open in Vertex AI Workbench\n", + "
\n", + "
\n", + " \n", + " \"GitHub
View on GitHub\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "Share to:\n", + "\n", + "\n", + " \"LinkedIn\n", + "\n", + "\n", + "\n", + " \"Bluesky\n", + "\n", + "\n", + "\n", + " \"X\n", + "\n", + "\n", + "\n", + " \"Reddit\n", + "\n", + "\n", + "\n", + " \"Facebook\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "c0831e65d0de", + "tags": [] + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "\n", + "```html\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Google
Open in Colab\n", + "
\n", + "
\n", + " \n", + " \"Google
Open in Colab Enterprise\n", + "
\n", + "
\n", + " \n", + " \"Vertex
Open in Vertex AI Workbench\n", + "
\n", + "
\n", + " \n", + " \"GitHub
View on GitHub\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "Share to:\n", + "\n", + "\n", + " \"LinkedIn\n", + "\n", + "\n", + "\n", + " \"Bluesky\n", + "\n", + "\n", + "\n", + " \"X\n", + "\n", + "\n", + "\n", + " \"Reddit\n", + "\n", + "\n", + "\n", + " \"Facebook\n", + "\n", + "```" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# TODO: REMOVE THIS CELL FROM YOUR NOTEBOOK ###\n", + "\n", + "import re\n", + "from urllib.parse import quote\n", + "\n", + "from IPython.display import Markdown, display\n", + "\n", + "\n", + "def generate_html(file_path: str):\n", + " match = re.search(\n", + " r\"(?:https://)?(?:github\\.com/)?(?:GoogleCloudPlatform/)?(?:generative-ai/)?(?:blob/)?(?:main/)?([\\w/-]+.ipynb)\",\n", + " file_path,\n", + " )\n", + " if not match:\n", + " return \"Could not generate table.\"\n", + "\n", + " file_path = match.group(1)\n", + "\n", + " base_url = \"https://github.com/GoogleCloudPlatform/generative-ai/blob/main/\"\n", + " raw_github_url = (\n", + " \"https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/\"\n", + " )\n", + "\n", + " colab_url = \"https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/\"\n", + " colab_enterprise_url = f\"https://console.cloud.google.com/vertex-ai/colab/import/{raw_github_url.replace('/', '%2F')}\"\n", + " vertex_ai_url = f\"https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url={raw_github_url}\"\n", + " bigquery_studio_url = (\n", + " f\"https://console.cloud.google.com/bigquery/import?url={base_url}\"\n", + " )\n", + "\n", + " linkedin_url = \"https://www.linkedin.com/sharing/share-offsite/?url=\"\n", + " bluesky_url = \"https://bsky.app/intent/compose?text=\"\n", + " twitter_url = \"https://twitter.com/intent/tweet?url=\"\n", + " reddit_url = \"https://reddit.com/submit?url=\"\n", + " facebook_url = \"https://www.facebook.com/sharer/sharer.php?u=\"\n", + "\n", + " encoded_url = 
quote(f\"{base_url}{file_path}\")\n", + "\n", + " html = f\"\"\"\n", + "```html\n", + "\n", + " \n", + " \n", + " \"\"\"\n", + "\n", + " # Add BigQuery Studio link only if the flag is True\n", + " if INCLUDE_BIGQUERY_STUDIO:\n", + " html += f\"\"\"\n", + " \"\"\"\n", + "\n", + " html += f\"\"\"\n", + " \n", + "
\n", + " \n", + " \"Google
Open in Colab\n", + "
\n", + "
\n", + " \n", + " \"Google
Open in Colab Enterprise\n", + "
\n", + "
\n", + " \n", + " \"Vertex
Open in Vertex AI Workbench\n", + "
\n", + "
\n", + " \n", + " \"BigQuery
Open in BigQuery Studio\n", + "
\n", + "
\n", + " \n", + " \"GitHub
View on GitHub\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "Share to:\n", + "\n", + "\n", + " \"LinkedIn\n", + "\n", + "\n", + "\n", + " \"Bluesky\n", + "\n", + "\n", + "\n", + " \"X\n", + "\n", + "\n", + "\n", + " \"Reddit\n", + "\n", + "\n", + "\n", + " \"Facebook\n", + "\n", + "```\"\"\"\n", + " return html\n", + "\n", + "\n", + "# File path from the repository root\n", + "file_path = \"imported/PANW PSO/NEXT 2025/generative-ai/gemini/evaluation/auto_alignment_evaluation_of_llm_output.ipynb\" # @param {type:\"string\"}\n", + "\n", + "# Include link to Open in BigQuery Studio\n", + "INCLUDE_BIGQUERY_STUDIO = False # @param {type:\"boolean\"}\n", + "display(Markdown(generate_html(file_path)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "84f0f73a0f76" + }, + "source": [ + "| Author(s) |\n", + "| --- |\n", + "| [Jennifer Liang](https://github.com/jenniferliangc) |" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "This tutorial brings a new way to evaluate LLM performance against ground truth. We build a customizable, line-by-line automated evaluator for use cases where high precision is required. This method eliminates the need for repetitive prompt tuning, minimizes hallucinations, and ensures repeatable and accurate results.\n", + "\n", + "\n", + "In this tutorial, we use a recipe dataset that has ground truth and LLM outputs. We will perform two phases of evaluation:\n", + "\n", + "1. Rephraser Evaluator (0-1 point): use a semantic similarity model to check similarity on LLM output vs ground truth.\n", + "\n", + "2. Final Answer Evaluator (0-5 points): we have three criteria to evaluate in this phase:\n", + "> - a) Source Score (0-1 point): did the LLM choose the same source (source 1, 2, 3, etc.) as in the ground truth?\n", + "> - b) Ingredient Sentence Score (0-2 points): sentence-level comparison to check whether the LLM output the same ingredient list as in the ground truth. 
\n", + "> - c) Instruction Sentence Score (0-2 points): sentence-level comparison to check whether the LLM output the same instructions as in the ground truth.\n", + "> - For a, b, and c, penalties are added for any missed ground truth sources or sentences. A penalty is also added for any extra sources or sentences that the LLM produces but that are not present in the ground truth." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "61RBz8LLbxCR" + }, + "source": [ + "## Get started" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "No17Cw5hgx12" + }, + "source": [ + "### Install Google Gen AI SDK and other required packages\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "tFy3H3aPgx12", + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.0.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install --upgrade --quiet google-cloud-aiplatform" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dmWOrTJ3gx13" + }, + "source": [ + "### Authenticate your notebook environment (Colab only)\n", + "\n", + "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NyKGtVQjgx13" + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "if \"google.colab\" in sys.modules:\n", + " from google.colab import auth\n", + "\n", + " auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF4l8DTdWgPY" + }, + "source": [ + "### Set Google Cloud project information\n", + "\n", + "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n", + "\n", + "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nqwi-5ufWp_B" + }, + "outputs": [], + "source": [ + "# Use the environment variable if the user doesn't provide Project ID.\n", + "import os\n", + "\n", + "PROJECT_ID = \"[your-project-id]\" # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n", + "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n", + " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", + "\n", + "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n", + "\n", + "from google import genai\n", + "\n", + "client = genai.Client(vertexai=True, project=PROJECT_ID, location=LOCATION)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5303c05f7aa6" + }, + "source": [ + "### Import libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "6fc324893334", + "tags": [] + }, + "outputs": [], + "source": [ + "from IPython.display import Markdown, display\n", + "\n", + "import json\n", + "import logging\n", + "import time\n", + "import uuid\n", + "import pandas as pd\n", + "import numpy as np\n", + "import os, sys, vertexai\n", + "import 
regex as re\n", + "import json\n", + "import ast\n", + "import requests\n", + "from typing import NamedTuple\n", + "from google.cloud import aiplatform\n", + "\n", + "# Logging\n", + "logger = logging.getLogger(\"logger\")\n", + "logging.basicConfig(level=logging.WARNING)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "## Tutorial Start" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "jp-MarkdownHeadingCollapsed": true, + "tags": [] + }, + "source": [ + "### Load Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df = pd.read_csv(\"asian_chef_advisor_dataset.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
case_idqueryground_truth_rephrased_querygemini_rephrased_query_llmground_truth_final_answergemini_final_answer_llmgemini_alternative_answergemini_alternative_rephrased_query_llm
01I have a lot of tofu and I want to make a Chin...Sweet sour tofu bell pepper onion Chinese stir...Sweet and sour tofu stir-fry with bell peppers...Recipe 1: Mapo Tofu\\n\\nIngredients:\\n\\n½ cup o...Recipe 1: (From Document 1). Best Dish: Mapo T...Okay, I understand. I will analyze the search ...Tofu stir-fry recipe with sweet and sour sauce...
12I have some dried shiitake mushrooms and I'd l...Shiitake mushroom pork belly star anise braise...Shiitake mushroom, pork belly, star anise, soy...Best Dish: Braised Pork and Egg with Rice\\n\\nR...Recipe 1: Braised Pork and Egg with Rice\\n\\nIn...Recipe 1: SHANGHAI BRAISED PORK BELLY\\n\\nIngre...Chinese braised pork belly with shiitake mushr...
\n", + "
" + ], + "text/plain": [ + " case_id query \\\n", + "0 1 I have a lot of tofu and I want to make a Chin... \n", + "1 2 I have some dried shiitake mushrooms and I'd l... \n", + "\n", + " ground_truth_rephrased_query \\\n", + "0 Sweet sour tofu bell pepper onion Chinese stir... \n", + "1 Shiitake mushroom pork belly star anise braise... \n", + "\n", + " gemini_rephrased_query_llm \\\n", + "0 Sweet and sour tofu stir-fry with bell peppers... \n", + "1 Shiitake mushroom, pork belly, star anise, soy... \n", + "\n", + " ground_truth_final_answer \\\n", + "0 Recipe 1: Mapo Tofu\\n\\nIngredients:\\n\\n½ cup o... \n", + "1 Best Dish: Braised Pork and Egg with Rice\\n\\nR... \n", + "\n", + " gemini_final_answer_llm \\\n", + "0 Recipe 1: (From Document 1). Best Dish: Mapo T... \n", + "1 Recipe 1: Braised Pork and Egg with Rice\\n\\nIn... \n", + "\n", + " gemini_alternative_answer \\\n", + "0 Okay, I understand. I will analyze the search ... \n", + "1 Recipe 1: SHANGHAI BRAISED PORK BELLY\\n\\nIngre... \n", + "\n", + " gemini_alternative_rephrased_query_llm \n", + "0 Tofu stir-fry recipe with sweet and sour sauce... \n", + "1 Chinese braised pork belly with shiitake mushr... 
# Name of the embedding model loaded when no model is supplied. This is the
# same checkpoint the sentence-level evaluator in this notebook instantiates.
# NOTE(review): the model originally used for the rephraser score is not
# visible in the notebook -- confirm this default reproduces earlier numbers.
_DEFAULT_SIMILARITY_MODEL_NAME = "all-mpnet-base-v2"

# Loaded models keyed by name, so `df.apply` does not reload one per row.
_similarity_model_cache = {}


def _get_similarity_model(model=None):
    """Return the sentence-embedding model `calculate_similarity` should use.

    Resolution order:
      1. the explicitly supplied ``model``;
      2. a notebook-level global called ``model`` if an earlier cell created
         one (keeps existing notebooks working unchanged);
      3. a lazily loaded, cached default SentenceTransformer.
    """
    if model is not None:
        return model

    shared_model = globals().get("model")
    if shared_model is not None:
        return shared_model

    if _DEFAULT_SIMILARITY_MODEL_NAME not in _similarity_model_cache:
        # Imported here because the notebook's import cell never imports
        # sentence_transformers, so a fresh kernel would otherwise fail.
        from sentence_transformers import SentenceTransformer

        _similarity_model_cache[_DEFAULT_SIMILARITY_MODEL_NAME] = SentenceTransformer(
            _DEFAULT_SIMILARITY_MODEL_NAME
        )
    return _similarity_model_cache[_DEFAULT_SIMILARITY_MODEL_NAME]


def calculate_similarity(llm_rephraser, gt_rephraser, model=None):
    """Cosine similarity between the embeddings of two pieces of text.

    Note for callers: the rephraser evaluator is meant to compare the LLM's
    *rephrased query* (the ``gemini_rephrased_query_llm`` column) with
    ``ground_truth_rephrased_query``. Passing the final answer column instead
    scores the wrong text.

    Args:
        llm_rephraser (str): Text produced by the LLM.
        gt_rephraser (str): Ground-truth text to compare against.
        model: Optional object exposing ``encode(text, convert_to_tensor=True)``
            (for example a ``SentenceTransformer``). When omitted, a shared
            model is resolved by ``_get_similarity_model``.

    Returns:
        float: The cosine similarity of the two embeddings.
    """
    # Local import: `util` is not imported anywhere in the notebook's import
    # cell, so relying on a global would raise NameError on a fresh kernel.
    from sentence_transformers import util

    encoder = _get_similarity_model(model)
    llm_embedding = encoder.encode(llm_rephraser, convert_to_tensor=True)
    gt_embedding = encoder.encode(gt_rephraser, convert_to_tensor=True)
    return util.cos_sim(llm_embedding, gt_embedding).item()


def find_recipe_number(text):
    """Extract the unique recipe numbers cited in ``text``.

    Looks for the literal pattern ``"Recipe "`` followed by one or more digits
    (case-sensitive) and returns the distinct numbers found.

    Args:
        text: The string to scan. Anything that is not a string (``None``,
            ``NaN``, ``pandas.NA``, numbers, ...) is treated as "no sources".

    Returns:
        list[str]: Unique recipe numbers as strings, ordered by numeric value
        (so ``"2"`` comes before ``"10"``). Empty when nothing is found.
    """
    if not isinstance(text, str):
        return []

    unique_numbers = set(re.findall(r"Recipe (\d+)", text))
    # Sort numerically; the string itself is a tie-breaker so that spellings
    # such as "01" and "1" always come out in the same order.
    return sorted(unique_numbers, key=lambda number: (int(number), number))
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
case_idqueryground_truth_rephrased_querygemini_rephrased_query_llmground_truth_final_answergemini_final_answer_llmgemini_alternative_answergemini_alternative_rephrased_query_llmrephraser_semantic_similaritygt_sources_extractedllm_response_sources
01I have a lot of tofu and I want to make a Chin...Sweet sour tofu bell pepper onion Chinese stir...Sweet and sour tofu stir-fry with bell peppers...Recipe 1: Mapo Tofu\\n\\nIngredients:\\n\\n½ cup o...Recipe 1: (From Document 1). Best Dish: Mapo T...Okay, I understand. I will analyze the search ...Tofu stir-fry recipe with sweet and sour sauce...0.686584[1][1]
12I have some dried shiitake mushrooms and I'd l...Shiitake mushroom pork belly star anise braise...Shiitake mushroom, pork belly, star anise, soy...Best Dish: Braised Pork and Egg with Rice\\n\\nR...Recipe 1: Braised Pork and Egg with Rice\\n\\nIn...Recipe 1: SHANGHAI BRAISED PORK BELLY\\n\\nIngre...Chinese braised pork belly with shiitake mushr...0.731325[1][1]
\n", + "
" + ], + "text/plain": [ + " case_id query \\\n", + "0 1 I have a lot of tofu and I want to make a Chin... \n", + "1 2 I have some dried shiitake mushrooms and I'd l... \n", + "\n", + " ground_truth_rephrased_query \\\n", + "0 Sweet sour tofu bell pepper onion Chinese stir... \n", + "1 Shiitake mushroom pork belly star anise braise... \n", + "\n", + " gemini_rephrased_query_llm \\\n", + "0 Sweet and sour tofu stir-fry with bell peppers... \n", + "1 Shiitake mushroom, pork belly, star anise, soy... \n", + "\n", + " ground_truth_final_answer \\\n", + "0 Recipe 1: Mapo Tofu\\n\\nIngredients:\\n\\n½ cup o... \n", + "1 Best Dish: Braised Pork and Egg with Rice\\n\\nR... \n", + "\n", + " gemini_final_answer_llm \\\n", + "0 Recipe 1: (From Document 1). Best Dish: Mapo T... \n", + "1 Recipe 1: Braised Pork and Egg with Rice\\n\\nIn... \n", + "\n", + " gemini_alternative_answer \\\n", + "0 Okay, I understand. I will analyze the search ... \n", + "1 Recipe 1: SHANGHAI BRAISED PORK BELLY\\n\\nIngre... \n", + "\n", + " gemini_alternative_rephrased_query_llm \\\n", + "0 Tofu stir-fry recipe with sweet and sour sauce... \n", + "1 Chinese braised pork belly with shiitake mushr... 
# SOURCE SCORE CALCULATION

def source_score(llm_sources, gt_sources):
    """Score how well the LLM's cited sources match the ground-truth sources.

    Each ground-truth source is worth ``1 / len(gt_sources)`` points (rounded
    to two decimals). Correct sources earn those points; every extra LLM
    source costs 10% of a source's points and every missed ground-truth source
    costs 20%.

    Args:
        llm_sources (list): Sources cited by the LLM.
        gt_sources (list): Ground-truth sources.

    Returns:
        tuple: Five numbers, in this order:
            - source_score: final score, floored at 0 and rounded to 2 decimals.
            - points_per_source: points awarded per ground-truth source.
            - correct_sources_score: points earned before penalties.
            - incorrect_source_penalty: penalty for sources only the LLM cited.
            - missed_source_penalty: penalty for ground-truth sources the LLM
              did not cite.
        All five are 0 when either list is empty.
    """
    if not llm_sources or not gt_sources:
        return 0, 0, 0, 0, 0

    llm_set = set(llm_sources)
    gt_set = set(gt_sources)

    # gt_sources is non-empty here, so the division is always defined.
    points_per_source = round(1 / len(gt_sources), 2)

    correct_count = len(gt_set & llm_set)
    extra_llm_count = len(llm_set - gt_set)  # cited by the LLM but not in GT
    missed_gt_count = len(gt_set - llm_set)  # in GT but not cited by the LLM

    # Penalty weights (adjust the percentages as needed).
    incorrect_source_penalty = 0.10 * points_per_source * extra_llm_count
    missed_source_penalty = 0.20 * points_per_source * missed_gt_count
    correct_sources_score = correct_count * points_per_source

    # Never let penalties push the score below zero.
    final_score = round(
        max(0, correct_sources_score - incorrect_source_penalty - missed_source_penalty),
        2,
    )

    return (
        final_score,
        points_per_source,
        correct_sources_score,
        incorrect_source_penalty,
        missed_source_penalty,
    )


# Characters removed from extracted sections: braces, bullets and colons.
_SECTION_NOISE = str.maketrans("", "", "{}•:")


def _clean_section(section):
    """Trim surrounding whitespace, then drop braces, bullets and colons."""
    return section.strip().translate(_SECTION_NOISE)


def extract_instructions_regex(text: str) -> str:
    """Return everything after the first "instruction(s)" keyword in ``text``.

    The match is case-insensitive and spans newlines. Returns "" when the
    keyword is absent.
    """
    match = re.search(r"instructions?\s*(.*)", text, re.IGNORECASE | re.DOTALL)
    return _clean_section(match.group(1)) if match else ""


def extract_ingredients_regex(text):
    """Return the text between the "ingredient(s)" and "instructions" keywords.

    The match is case-insensitive and spans newlines. Returns "" when either
    keyword is missing (an ingredients list with no following instructions
    section does not match).
    """
    match = re.search(
        r"ingredients?\s*(.*?)(?:\s*[\"'\n]*\s*instructions:?)",
        text,
        re.IGNORECASE | re.DOTALL,
    )
    return _clean_section(match.group(1)) if match else ""


def extract_ingredients_and_instructions(llm_text, gt_text, llm_sources, gt_sources):
    """Extract ingredient and instruction blocks for the correctly cited sources.

    The ingredients and instructions sections are pulled from the LLM answer
    and from the ground truth with `extract_ingredients_regex` and
    `extract_instructions_regex`. Each extracted block is then repeated once
    per source that appears in both ``llm_sources`` and ``gt_sources``, giving
    four parallel lists.

    Args:
        llm_text (str): Text generated by the LLM.
        gt_text (str): Ground-truth text.
        llm_sources (List[str]): Sources cited by the LLM.
        gt_sources (List[str]): Ground-truth sources.

    Returns:
        Tuple[List[str], List[str], List[str], List[str]]:
            ``(llm_ingredients_text, llm_instructions_text,
            gt_ingredients_text, gt_instructions_text)``.
        Four empty lists are returned when either text or either source list
        is empty. A section that cannot be found is represented by "".
    """
    if not llm_text or not gt_text or not llm_sources or not gt_sources:
        return [], [], [], []

    shared_source_count = len(set(gt_sources) & set(llm_sources))

    # The regex helpers always return a string ("" when nothing matches), so
    # the blocks can be used directly. The previous code overwrote the
    # instructions *list* with "" when the LLM answer had no instructions
    # section, which made the later .append() fail.
    llm_ingredients_block = extract_ingredients_regex(llm_text)
    llm_instructions_block = extract_instructions_regex(llm_text)
    gt_ingredients_block = extract_ingredients_regex(gt_text)
    gt_instructions_block = extract_instructions_regex(gt_text)

    # One copy of each block per correctly identified source.
    llm_ingredients_text = [llm_ingredients_block] * shared_source_count
    llm_instructions_text = [llm_instructions_block] * shared_source_count
    gt_ingredients_text = [gt_ingredients_block] * shared_source_count
    gt_instructions_text = [gt_instructions_block] * shared_source_count

    return llm_ingredients_text, llm_instructions_text, gt_ingredients_text, gt_instructions_text
df['gt_ingredients_text'].apply(\n", + "lambda item: \"\\n\".join(item) if isinstance(item, list) else \"\")\n", + "\n", + "df['gt_instructions_text_split'] = df['gt_instructions_text'].apply(\n", + "lambda item: \"\\n\".join(item) if isinstance(item, list) else \"\")\n", + "\n", + "df['llm_ingredients_text_split'] = df['llm_ingredients_text'].apply(\n", + " lambda item: \"\\n\".join(item) if isinstance(item, list) else \"\")\n", + "\n", + "df['llm_instructions_text_split'] = df['llm_instructions_text'].apply(\n", + " lambda item: \"\\n\".join(item) if isinstance(item, list) else \"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# Function to produce Ingredients & Instructions Score\n", + "\n", + "def evaluate_content(llm_text, gt_text, casenum, threshold_in):\n", + " \"\"\"\n", + " Evaluates the semantic similarity between sentences in LLM-generated text and\n", + " ground truth text to produce a content score.\n", + "\n", + " The function splits both texts into sentences, encodes them using a SentenceTransformer model,\n", + " and calculates cosine similarity between all pairs of LLM and ground truth sentences.\n", + " It then matches sentences based on a provided similarity threshold and calculates a score\n", + " that rewards matched sentences and penalizes extra LLM-generated sentences and\n", + " missing ground truth sentences. Transition words (defined in `ignore_words`) are\n", + " excluded from penalty calculations.\n", + "\n", + " Args:\n", + " llm_text (str): The text generated by the Language Model.\n", + " gt_text (str): The ground truth text.\n", + " casenum (str): A unique identifier for the case being evaluated. 
This is\n", + " included in the output DataFrame.\n", + " threshold_in (float): The cosine similarity threshold (between 0 and 1)\n", + " above which an LLM sentence is considered a match for a ground truth sentence.\n", + "\n", + " Returns:\n", + " pandas.DataFrame: A DataFrame with a single row containing the evaluation metrics:\n", + " - 'case_id': The provided `casenum`.\n", + " - 'sentences_score': The overall content score (between 0 and 2).\n", + " - 'num_gt_sentences': The total number of sentences in the ground truth text.\n", + " - 'points_per_sentence': The base points awarded per matched ground truth sentence.\n", + " - 'num_llm_sentences': The total number of sentences in the LLM-generated text.\n", + " - 'correct_sentences_score': The total score from correctly matched sentences.\n", + " - 'extra_sentences_penalty': The penalty applied for extra sentences in the LLM text.\n", + " - 'gt_not_in_llm_penalty': The penalty applied for ground truth sentences not found in the LLM text.\n", + " - 'num_equal_sentences': The number of exactly matching sentences (case-sensitive).\n", + " - 'matched_sentences_count': The number of ground truth sentences with a similarity above the threshold in the LLM text.\n", + " - 'gt_not_in_llm_count': The number of ground truth sentences with no similar counterpart in the LLM text (above the threshold, excluding transition sentences).\n", + " - 'extra_sentences_count': The number of LLM sentences with no similar counterpart in the ground truth text (above the threshold, excluding transition sentences).\n", + " - 'gt_transition_sentence_count': The number of ground truth sentences identified as transition sentences (based on `ignore_words`).\n", + " - 'llm_transition_sentence_count': The number of LLM sentences identified as transition sentences.\n", + " - 'gt_sentences': The ground truth text with sentences separated by newlines.\n", + " - 'llm_sentences': The LLM-generated text with sentences separated by newlines.\n", + " 
- 'matched_sentences': Pairs of matched ground truth and LLM sentences, each pair separated by a newline.\n", + " - 'extra_sentences': LLM sentences that did not meet the similarity threshold with any ground truth sentence.\n", + " - 'gt_not_in_llm_sentences': Ground truth sentences that did not meet the similarity threshold with any LLM sentence.\n", + "\n", + " Raises:\n", + " Exception: If any error occurs during the evaluation process, the error is printed,\n", + " and a DataFrame with default zero/empty values is returned.\n", + " \"\"\"\n", + " \n", + " model = SentenceTransformer('all-mpnet-base-v2')\n", + " threshold = threshold_in\n", + "\n", + " try:\n", + " if len(llm_text) > 0 and len(gt_text) > 0 and llm_text != \"None\" and gt_text != \"None\":\n", + " # Split ground truth and LLM text into sentences\n", + " gt_sentences = gt_text.split(\"\\n\")\n", + " gt_sentences = [sent.strip() for sent in gt_sentences if sent.strip()]\n", + "\n", + " llm_sentences = llm_text.split(\"\\n\")\n", + " llm_sentences = [sent.strip() for sent in llm_sentences if sent.strip()]\n", + "\n", + " # Encode LLM and ground truth sentences\n", + " llm_embeddings = model.encode(llm_sentences)\n", + " gt_embeddings = model.encode(gt_sentences)\n", + "\n", + " # Calculate similarity for each LLM sentence to all ground truth sentences\n", + " similarities = util.cos_sim(llm_embeddings, gt_embeddings)\n", + "\n", + " # Initialize lists to store matched and extra sentences\n", + " matched_sentences = []\n", + " extra_sentences = []\n", + " gt_not_in_llm_sentences = []\n", + "\n", + " # Count sentences with high similarity and penalize for extra sentences\n", + " matched_sentences_count = 0\n", + " extra_sentences_count = 0\n", + " gt_not_in_llm_count = 0\n", + " gt_transition_sentence_count = 0\n", + " llm_transition_sentence_count = 0\n", + "\n", + " # To ignore transition sentences on penalty calculation\n", + " ignore_words = [\",\", \"\\\"\"]\n", + " ignore_pattern = 
re.compile(r\"|\".join(ignore_words), re.IGNORECASE) # Case-insensitive pattern\n", + "\n", + " for gt_index, gt_sentence in enumerate(gt_sentences):\n", + " max_sim_index = np.argmax(similarities[:, gt_index])\n", + " if similarities[max_sim_index, gt_index] > threshold:\n", + " matched_sentences_count += 1\n", + " matched_pair = (\n", + " f\"{gt_sentence}\\n\",\n", + " f\"{llm_sentences[max_sim_index]}\\n\"\n", + " )\n", + " matched_sentences.append(matched_pair)\n", + " else:\n", + " if ignore_pattern.search(gt_sentence):\n", + " gt_transition_sentence_count += 1\n", + " else:\n", + " gt_not_in_llm_count += 1\n", + " gt_not_in_llm_sentences.append(gt_sentence + \"\\n\")\n", + "\n", + " for i, llm_similarities in enumerate(similarities):\n", + " if max(llm_similarities) < threshold:\n", + " if ignore_pattern.search(llm_sentences[i]):\n", + " llm_transition_sentence_count += 1\n", + " else:\n", + " extra_sentences_count += 1\n", + " extra_sentences.append(llm_sentences[i] + \"\\n\")\n", + "\n", + "\n", + " if (len(gt_sentences) - gt_transition_sentence_count) == 0:\n", + " points_per_sentence = 0.00\n", + " else:\n", + " points_per_sentence = round(2 / (len(gt_sentences) - gt_transition_sentence_count), 4)\n", + "\n", + " # Calculate the score\n", + " correct_sentences_score = matched_sentences_count * points_per_sentence\n", + " extra_sentences_penalty = extra_sentences_count * 0.10 * points_per_sentence # Apply penalty for extra sentences\n", + " gt_not_in_llm_penalty = gt_not_in_llm_count * 0.20 * points_per_sentence # Apply penalty for sentences in GT not in LLM\n", + "\n", + " # Store number of sentences\n", + " num_gt_sentences = len(gt_sentences)\n", + " num_llm_sentences = len(llm_sentences)\n", + "\n", + " # Count the number of exactly matching sentences\n", + " num_equal_sentences = sum(1 for gt, llm in matched_sentences if gt.strip() == llm.strip())\n", + "\n", + " score = round(max(0, min(2, correct_sentences_score - extra_sentences_penalty - 
gt_not_in_llm_penalty)), 3)\n", + "\n", + " else:\n", + " points_per_sentence = 0\n", + " score = 0\n", + " gt_sentences = []\n", + " llm_sentences = []\n", + " correct_sentences_score = 0\n", + " extra_sentences_penalty = 0\n", + " num_gt_sentences = 0\n", + " num_llm_sentences = 0\n", + " matched_sentences = []\n", + " extra_sentences = []\n", + " num_equal_sentences = 0\n", + " gt_not_in_llm_sentences = []\n", + " gt_not_in_llm_count = 0\n", + " gt_not_in_llm_penalty = 0\n", + " matched_sentences_count = 0\n", + " extra_sentences_count = 0\n", + " gt_transition_sentence_count = 0\n", + " llm_transition_sentence_count = 0\n", + "\n", + " except Exception as e: # Broad exception handling to catch any errors\n", + " print(f\"Error in case {casenum}: {e}\") # Log the error for debugging\n", + "\n", + " # Return empty strings/zeros for all columns\n", + " data = {\n", + " 'Case Number': casenum,\n", + " 'sentences_score': 0,\n", + " 'num_gt_sentences': 0,\n", + " 'points_per_sentence': 0,\n", + " 'num_llm_sentences': 0,\n", + " 'correct_sentences_score': 0,\n", + " 'extra_sentences_penalty': 0,\n", + " 'gt_not_in_llm_penalty': 0,\n", + " 'num_equal_sentences': 0,\n", + " 'matched_sentences_count': 0,\n", + " 'gt_not_in_llm_count': 0,\n", + " 'extra_sentences_count': 0,\n", + " 'gt_transition_sentence_count': 0,\n", + " 'llm_transition_sentence_count': 0,\n", + " 'gt_sentences': \"\",\n", + " 'llm_sentences': \"\",\n", + " 'matched_sentences': \"\",\n", + " 'extra_sentences': \"\",\n", + " 'gt_not_in_llm_sentences': \"\"\n", + " }\n", + " return pd.DataFrame(data, index=[0])\n", + "\n", + " data = {\n", + " # MAKE SURE YOU'RE GIVING 'casenum' THE SAME NAME AS in YOUR 'df' AS YOU WILL BE MERGING LATER\n", + " 'case_id': casenum,\n", + " 'sentences_score': max(0, min(2, score)),\n", + " 'num_gt_sentences': num_gt_sentences,\n", + " 'points_per_sentence': points_per_sentence,\n", + " 'num_llm_sentences': num_llm_sentences,\n", + " 'correct_sentences_score': 
correct_sentences_score,\n", + " 'extra_sentences_penalty': extra_sentences_penalty,\n", + " 'gt_not_in_llm_penalty': gt_not_in_llm_penalty,\n", + " 'num_equal_sentences': num_equal_sentences,\n", + " 'matched_sentences_count': matched_sentences_count,\n", + " 'gt_not_in_llm_count': gt_not_in_llm_count,\n", + " 'extra_sentences_count': extra_sentences_count,\n", + " 'gt_transition_sentence_count' : gt_transition_sentence_count,\n", + " 'llm_transition_sentence_count' : llm_transition_sentence_count,\n", + "\n", + " # Join list elements into a single string\n", + " 'gt_sentences': \"\\n\".join(gt_sentences),\n", + " 'llm_sentences': \"\\n\".join(llm_sentences),\n", + " 'matched_sentences': \"\\n\".join([f\"{gt}\\n{llm}\" for gt, llm in matched_sentences]),\n", + " 'extra_sentences': \"\\n\".join(extra_sentences),\n", + " 'gt_not_in_llm_sentences': \"\\n\".join(gt_not_in_llm_sentences)\n", + " }\n", + "\n", + " return pd.DataFrame(data, index=[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Semantic Similarity Threshold default value is 85%\n", + "\n", + "results_ingredients = df.apply(\n", + "lambda row: evaluate_content(row['llm_ingredients_text_split'], row['gt_ingredients_text_split'], row['case_id'], 0.85), axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Semantic Similarity Threshold default value is 75%\n", + "\n", + "results_instructions = df.apply(\n", + "lambda row: evaluate_content(row['llm_instructions_text_split'], row['gt_instructions_text_split'], row['case_id'], 0.75), axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df_ingredients = pd.concat(results_ingredients.tolist(), ignore_index=True)\n", + "df_instructions = pd.concat(results_instructions.tolist(), ignore_index=True)" + ] + }, + { 
# Attach the ingredient and instruction metrics to the evaluation frame and
# derive the overall score.
#
# `df_ingredients` and `df_instructions` are both built from `evaluate_content`
# results, so they carry identical column names. Every metric column (all but
# the join key) is suffixed *before* merging, so the two left-merges can never
# collide and no unsuffixed duplicates have to be dropped afterwards.
#
# Nothing is rebound or mutated here (`df`, `df_ingredients` and
# `df_instructions` stay as they were), so this cell can be re-run safely.
JOIN_KEY = "case_id"


def _suffix_metric_columns(metrics, suffix):
    """Return a copy of `metrics` with `suffix` appended to every column except the join key."""
    return metrics.rename(
        columns=lambda col: col if col == JOIN_KEY else f"{col}{suffix}"
    )


final_df = (
    df.merge(
        _suffix_metric_columns(df_ingredients, "_ingredient"),
        on=JOIN_KEY,
        how="left",
    )
    .merge(
        _suffix_metric_columns(df_instructions, "_instruction"),
        on=JOIN_KEY,
        how="left",
    )
    # The final score is the plain sum of the three component scores.
    .assign(
        final_score=lambda scored: (
            scored["source_score"]
            + scored["sentences_score_ingredient"]
            + scored["sentences_score_instruction"]
        )
    )
)
def no_answer_case(df, gt_col, llm_col, similarity_threshold=0.2):
    """
    Award full marks to rows where both the ground truth and the LLM decline to answer.

    A row counts as an agreed "no answer" when, for BOTH the ground-truth and
    the LLM answer (stringified, stripped and lower-cased), either

    * the text contains one of the predefined "no answer" keywords, or
    * its similarity to the phrase "No answer to this question", as returned
      by `calculate_similarity`, is at least `similarity_threshold`.

    The keyword test is evaluated first; `calculate_similarity` is only called
    when the keywords alone do not settle the row.

    For every agreed row the columns 'source_score',
    'sentences_score_ingredient', 'sentences_score_instruction' and
    'final_score' are set to 1, 2, 2 and 5 respectively.

    Args:
        df (pd.DataFrame): Frame holding the answers and the score columns.
            It is modified in place.
        gt_col (str): Name of the column with the ground-truth answers.
        llm_col (str): Name of the column with the LLM-generated answers.
        similarity_threshold (float, optional): Minimum similarity to the
            target phrase for an answer to count as "no answer".
            Defaults to 0.2.

    Returns:
        pd.DataFrame: The same `df` object, with the score columns overwritten
        on the agreed "no answer" rows.

    Notes:
        No exception propagates to the caller. If `gt_col` or `llm_col` is
        missing, one message is printed and `df` is returned unchanged. If a
        single row fails for any other reason, a message is printed and that
        row is left untouched.

        `calculate_similarity` is defined elsewhere in this notebook;
        presumably it returns a cosine similarity in [0, 1] - confirm there.
    """
    target_phrase = "No answer to this question"
    no_answer_keywords = ["no answer", "cannot answer", "not able to answer",
                          "not able to", "not found", "not available",
                          "no relevant information", "does not contain",
                          "unable to find"]
    # Score column -> value written for an agreed "no answer" row.
    no_answer_scores = {
        'source_score': 1,
        'sentences_score_ingredient': 2,
        'sentences_score_instruction': 2,
        'final_score': 5,
    }

    # Validate the answer columns once instead of failing on every row.
    missing_cols = [col for col in dict.fromkeys((gt_col, llm_col)) if col not in df.columns]
    if missing_cols:
        print(f"Error: Column(s) {missing_cols} not found in the DataFrame. No rows were updated.")
        return df

    def _has_keyword(answer):
        return any(keyword in answer for keyword in no_answer_keywords)

    # One flag per row, in row order, so the update below is positional and
    # stays correct even when the index contains duplicate labels.
    is_no_answer = []
    for index, row in df.iterrows():
        agreed = False
        try:
            gt_answer = str(row[gt_col]).strip().lower()
            llm_answer = str(row[llm_col]).strip().lower()

            # Cheap substring test first; the similarity calls only run when
            # the keywords have not already decided the row.
            agreed = (_has_keyword(gt_answer) and _has_keyword(llm_answer)) or (
                calculate_similarity(gt_answer, target_phrase) >= similarity_threshold
                and calculate_similarity(llm_answer, target_phrase) >= similarity_threshold
            )
        except Exception as e:
            # Best effort: report the problem and leave this row's scores as they are.
            print(f"An unexpected error occurred while processing row {index}: {e}")
            agreed = False
        is_no_answer.append(bool(agreed))

    if any(is_no_answer):
        for score_col, score_value in no_answer_scores.items():
            df.loc[is_no_answer, score_col] = score_value
    return df
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
case_idqueryground_truth_rephrased_querygemini_rephrased_query_llmground_truth_final_answergemini_final_answer_llmgemini_alternative_answergemini_alternative_rephrased_query_llmrephraser_semantic_similaritygt_sources_extracted...gt_not_in_llm_count_instructionextra_sentences_count_instructiongt_transition_sentence_count_instructionllm_transition_sentence_count_instructiongt_sentences_instructionllm_sentences_instructionmatched_sentences_instructionextra_sentences_instructiongt_not_in_llm_sentences_instructionfinal_score
01I have a lot of tofu and I want to make a Chin...Sweet sour tofu bell pepper onion Chinese stir...Sweet and sour tofu stir-fry with bell peppers...Recipe 1: Mapo Tofu\\n\\nIngredients:\\n\\n½ cup o...Recipe 1: (From Document 1). Best Dish: Mapo T...Okay, I understand. I will analyze the search ...Tofu stir-fry recipe with sweet and sour sauce...0.686584[1]...0000On a medium heat, toast the chilies. Add ¼ cup...1. On a medium heat, toast the chilies. Add ¼ ...On a medium heat, toast the chilies. Add ¼ cup...5.000
12I have some dried shiitake mushrooms and I'd l...Shiitake mushroom pork belly star anise braise...Shiitake mushroom, pork belly, star anise, soy...Best Dish: Braised Pork and Egg with Rice\\n\\nR...Recipe 1: Braised Pork and Egg with Rice\\n\\nIn...Recipe 1: SHANGHAI BRAISED PORK BELLY\\n\\nIngre...Chinese braised pork belly with shiitake mushr...0.731325[1]...0000Rinse mushrooms and soak in hot water until so...Rinse mushrooms and soak in hot water until so...Rinse mushrooms and soak in hot water until so...4.999
23I'm looking for a Cantonese recipe that uses c...Chicken feet black bean sauce steamed Cantones...Cantonese steamed chicken feet with black bean...No answer to this questionI am not able to answer this question. None of...Based on your query for a Cantonese chicken fe...Chicken feet recipe, Cantonese, black bean sau...0.921332[]...00005.000
34I'm looking for a simple Chinese stir-fry reci...Beef and broccoli. quick to makeQuick beef and broccoli stir-fry with light sa...Recipe 1: Beef and Broccoli\\n\\nIngredients:\\n\\...Recipe 1: Beef and Broccoli\\n\\nIngredients:\\n\\...Recipe 1: Beef and Broccoli\\n\\nIngredients: 1¼...Beef and broccoli stir-fry recipe with light, ...0.645821[1]...0000Place the water and cornstarch/cornflour into ...Place the water and cornstarch/cornflour into ...Place the water and cornstarch/cornflour into ...5.000
45I'm looking for an Indonesian dish with beef. ...Spicy Indonesian dish with beef and ma la flavor.Spicy Indonesian beef with ma la and dried chi...Recipe 1: Beef Rendang\\n\\nIngredients:\\n\\nSpic...Recipe 1: Beef Rendang\\n\\nIngredients:\\n\\nSpic...The customer is looking for a spicy, numbing I...Spicy, numbing Indonesian beef dish with dried...0.667477[1]...0000Puree Spice paste in a food processor until fi...Puree Spice paste in a food processor until fi...Puree Spice paste in a food processor until fi...5.000
56I have some preserved duck eggs, and I'd like ...Preserved duck egg ginger soy sauce cold Chine...Preserved duck egg appetizer, ginger, soy sauce.Recipe 1: Silken Tofu with Century Egg\\n\\nIngr...Recipe 1: Silken Tofu with Century Egg\\n\\nIngr...Recipe 1: Silken Tofu with Century Egg\\n\\nIngr...Preserved duck egg cold appetizer with ginger ...0.545543[1]...0000Drain the tofu and arrange it on a plate with ...Drain the tofu and arrange it on a plate with ...Drain the tofu and arrange it on a plate with ...5.000
67I'm interested in a dish that uses stuffed egg...Stuffed eggplant with ground pork and dried sh...Savory stuffed eggplant with ground pork and d...Recipe 1: Stir Fried Eggplant with Minced Pork...Document 1 contains a recipe that closely matc...Recipe 1: Stir fried eggplant with minced pork...Stuffed eggplant recipes with ground pork, sav...0.690133[1]...0000Mix sesame oil, corn flour, egg whites, shaoxi...Mix sesame oil, corn flour, egg whites, shaoxi...Mix sesame oil, corn flour, egg whites, shaoxi...4.999
78I'm looking for a vegetarian dish that's inspi...Spicy Indian vegetarian soup with rice recipeSpicy Indian vegetable soup with rice.Recipe 1: Rasam\\n\\nIngredients:\\n\\nTamarind (s...Recipe 1: Rasam\\n\\nIngredients:\\n\\nTamarind (s...The customer is looking for a spicy, Indian-in...Spicy Indian-inspired vegetarian soup with veg...0.731718[1]...0001Soak the tamarind in hot water for 30 mins the...Soak the tamarind in hot water for 30 mins the...Soak the tamarind in hot water for 30 mins the...4.899
89I have some fresh salmon and I want to make a ...Japanese grilled salmon rice dish with cucumberSalmon and rice dish with cucumber, Japanese s...Recipe 1: JAPANESE SALMON FLAKE ONIGIRI (RICE ...Recipe 1: JAPANESE SALMON FLAKE ONIGIRI (RICE ...Neither of the provided documents offers a rec...Salmon rice bowl with cucumber, Japanese style.0.720132[1]...0500)\\nInstructions\\nSeason salmon with salt. Gril...)\\nInstructions\\nSeason salmon with salt. Gril...)\\n\\n)\\n\\nInstructions\\n\\nInstructions\\n\\nSeas...Recipe 2 JAPANESE TEMAKI (CONE STYLE SUSHI)\\n\\...4.775
910I'm trying to find a traditional Chinese recip...Braised chicken wood ear mushroom ginger scall...Braised chicken, dark sauce, wood ear mushroom...Recipe 1: Sweet potato, tea tree mushroom &\\nc...Recipe 1: Sweet potato, tea tree mushroom &\\nc...Recipe 1: Sweet potato, tea tree mushroom & ch...Chicken recipe, dark sauce, braised or slow-co...0.679397[1]...00001. Clean the dried tea tree mushroom and soak ...1. Clean the dried tea tree mushroom and soak ...1. Clean the dried tea tree mushroom and soak ...4.999
\n", + "

10 rows × 61 columns

\n", + "
" + ], + "text/plain": [ + " case_id query \\\n", + "0 1 I have a lot of tofu and I want to make a Chin... \n", + "1 2 I have some dried shiitake mushrooms and I'd l... \n", + "2 3 I'm looking for a Cantonese recipe that uses c... \n", + "3 4 I'm looking for a simple Chinese stir-fry reci... \n", + "4 5 I'm looking for an Indonesian dish with beef. ... \n", + "5 6 I have some preserved duck eggs, and I'd like ... \n", + "6 7 I'm interested in a dish that uses stuffed egg... \n", + "7 8 I'm looking for a vegetarian dish that's inspi... \n", + "8 9 I have some fresh salmon and I want to make a ... \n", + "9 10 I'm trying to find a traditional Chinese recip... \n", + "\n", + " ground_truth_rephrased_query \\\n", + "0 Sweet sour tofu bell pepper onion Chinese stir... \n", + "1 Shiitake mushroom pork belly star anise braise... \n", + "2 Chicken feet black bean sauce steamed Cantones... \n", + "3 Beef and broccoli. quick to make \n", + "4 Spicy Indonesian dish with beef and ma la flavor. \n", + "5 Preserved duck egg ginger soy sauce cold Chine... \n", + "6 Stuffed eggplant with ground pork and dried sh... \n", + "7 Spicy Indian vegetarian soup with rice recipe \n", + "8 Japanese grilled salmon rice dish with cucumber \n", + "9 Braised chicken wood ear mushroom ginger scall... \n", + "\n", + " gemini_rephrased_query_llm \\\n", + "0 Sweet and sour tofu stir-fry with bell peppers... \n", + "1 Shiitake mushroom, pork belly, star anise, soy... \n", + "2 Cantonese steamed chicken feet with black bean... \n", + "3 Quick beef and broccoli stir-fry with light sa... \n", + "4 Spicy Indonesian beef with ma la and dried chi... \n", + "5 Preserved duck egg appetizer, ginger, soy sauce. \n", + "6 Savory stuffed eggplant with ground pork and d... \n", + "7 Spicy Indian vegetable soup with rice. \n", + "8 Salmon and rice dish with cucumber, Japanese s... \n", + "9 Braised chicken, dark sauce, wood ear mushroom... 
\n", + "\n", + " ground_truth_final_answer \\\n", + "0 Recipe 1: Mapo Tofu\\n\\nIngredients:\\n\\n½ cup o... \n", + "1 Best Dish: Braised Pork and Egg with Rice\\n\\nR... \n", + "2 No answer to this question \n", + "3 Recipe 1: Beef and Broccoli\\n\\nIngredients:\\n\\... \n", + "4 Recipe 1: Beef Rendang\\n\\nIngredients:\\n\\nSpic... \n", + "5 Recipe 1: Silken Tofu with Century Egg\\n\\nIngr... \n", + "6 Recipe 1: Stir Fried Eggplant with Minced Pork... \n", + "7 Recipe 1: Rasam\\n\\nIngredients:\\n\\nTamarind (s... \n", + "8 Recipe 1: JAPANESE SALMON FLAKE ONIGIRI (RICE ... \n", + "9 Recipe 1: Sweet potato, tea tree mushroom &\\nc... \n", + "\n", + " gemini_final_answer_llm \\\n", + "0 Recipe 1: (From Document 1). Best Dish: Mapo T... \n", + "1 Recipe 1: Braised Pork and Egg with Rice\\n\\nIn... \n", + "2 I am not able to answer this question. None of... \n", + "3 Recipe 1: Beef and Broccoli\\n\\nIngredients:\\n\\... \n", + "4 Recipe 1: Beef Rendang\\n\\nIngredients:\\n\\nSpic... \n", + "5 Recipe 1: Silken Tofu with Century Egg\\n\\nIngr... \n", + "6 Document 1 contains a recipe that closely matc... \n", + "7 Recipe 1: Rasam\\n\\nIngredients:\\n\\nTamarind (s... \n", + "8 Recipe 1: JAPANESE SALMON FLAKE ONIGIRI (RICE ... \n", + "9 Recipe 1: Sweet potato, tea tree mushroom &\\nc... \n", + "\n", + " gemini_alternative_answer \\\n", + "0 Okay, I understand. I will analyze the search ... \n", + "1 Recipe 1: SHANGHAI BRAISED PORK BELLY\\n\\nIngre... \n", + "2 Based on your query for a Cantonese chicken fe... \n", + "3 Recipe 1: Beef and Broccoli\\n\\nIngredients: 1¼... \n", + "4 The customer is looking for a spicy, numbing I... \n", + "5 Recipe 1: Silken Tofu with Century Egg\\n\\nIngr... \n", + "6 Recipe 1: Stir fried eggplant with minced pork... \n", + "7 The customer is looking for a spicy, Indian-in... \n", + "8 Neither of the provided documents offers a rec... \n", + "9 Recipe 1: Sweet potato, tea tree mushroom & ch... 
\n", + "\n", + " gemini_alternative_rephrased_query_llm \\\n", + "0 Tofu stir-fry recipe with sweet and sour sauce... \n", + "1 Chinese braised pork belly with shiitake mushr... \n", + "2 Chicken feet recipe, Cantonese, black bean sau... \n", + "3 Beef and broccoli stir-fry recipe with light, ... \n", + "4 Spicy, numbing Indonesian beef dish with dried... \n", + "5 Preserved duck egg cold appetizer with ginger ... \n", + "6 Stuffed eggplant recipes with ground pork, sav... \n", + "7 Spicy Indian-inspired vegetarian soup with veg... \n", + "8 Salmon rice bowl with cucumber, Japanese style. \n", + "9 Chicken recipe, dark sauce, braised or slow-co... \n", + "\n", + " rephraser_semantic_similarity gt_sources_extracted ... \\\n", + "0 0.686584 [1] ... \n", + "1 0.731325 [1] ... \n", + "2 0.921332 [] ... \n", + "3 0.645821 [1] ... \n", + "4 0.667477 [1] ... \n", + "5 0.545543 [1] ... \n", + "6 0.690133 [1] ... \n", + "7 0.731718 [1] ... \n", + "8 0.720132 [1] ... \n", + "9 0.679397 [1] ... \n", + "\n", + " gt_not_in_llm_count_instruction extra_sentences_count_instruction \\\n", + "0 0 0 \n", + "1 0 0 \n", + "2 0 0 \n", + "3 0 0 \n", + "4 0 0 \n", + "5 0 0 \n", + "6 0 0 \n", + "7 0 0 \n", + "8 0 5 \n", + "9 0 0 \n", + "\n", + " gt_transition_sentence_count_instruction \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "5 0 \n", + "6 0 \n", + "7 0 \n", + "8 0 \n", + "9 0 \n", + "\n", + " llm_transition_sentence_count_instruction \\\n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "5 0 \n", + "6 0 \n", + "7 1 \n", + "8 0 \n", + "9 0 \n", + "\n", + " gt_sentences_instruction \\\n", + "0 On a medium heat, toast the chilies. Add ¼ cup... \n", + "1 Rinse mushrooms and soak in hot water until so... \n", + "2 \n", + "3 Place the water and cornstarch/cornflour into ... \n", + "4 Puree Spice paste in a food processor until fi... \n", + "5 Drain the tofu and arrange it on a plate with ... \n", + "6 Mix sesame oil, corn flour, egg whites, shaoxi... 
\n", + "7 Soak the tamarind in hot water for 30 mins the... \n", + "8 )\\nInstructions\\nSeason salmon with salt. Gril... \n", + "9 1. Clean the dried tea tree mushroom and soak ... \n", + "\n", + " llm_sentences_instruction \\\n", + "0 1. On a medium heat, toast the chilies. Add ¼ ... \n", + "1 Rinse mushrooms and soak in hot water until so... \n", + "2 \n", + "3 Place the water and cornstarch/cornflour into ... \n", + "4 Puree Spice paste in a food processor until fi... \n", + "5 Drain the tofu and arrange it on a plate with ... \n", + "6 Mix sesame oil, corn flour, egg whites, shaoxi... \n", + "7 Soak the tamarind in hot water for 30 mins the... \n", + "8 )\\nInstructions\\nSeason salmon with salt. Gril... \n", + "9 1. Clean the dried tea tree mushroom and soak ... \n", + "\n", + " matched_sentences_instruction \\\n", + "0 On a medium heat, toast the chilies. Add ¼ cup... \n", + "1 Rinse mushrooms and soak in hot water until so... \n", + "2 \n", + "3 Place the water and cornstarch/cornflour into ... \n", + "4 Puree Spice paste in a food processor until fi... \n", + "5 Drain the tofu and arrange it on a plate with ... \n", + "6 Mix sesame oil, corn flour, egg whites, shaoxi... \n", + "7 Soak the tamarind in hot water for 30 mins the... \n", + "8 )\\n\\n)\\n\\nInstructions\\n\\nInstructions\\n\\nSeas... \n", + "9 1. Clean the dried tea tree mushroom and soak ... \n", + "\n", + " extra_sentences_instruction \\\n", + "0 \n", + "1 \n", + "2 \n", + "3 \n", + "4 \n", + "5 \n", + "6 \n", + "7 \n", + "8 Recipe 2 JAPANESE TEMAKI (CONE STYLE SUSHI)\\n\\... 
# Overwrite the scores of rows where ground truth and LLM both decline to
# answer. `no_answer_case` updates `final_df` in place and returns it.
no_answer_case(
    final_df,
    "ground_truth_final_answer",
    "gemini_final_answer_llm",
    similarity_threshold=0.5,  # You can adjust the threshold if needed
)

# Write the scored results to CSV, without the index column.
final_df.to_csv("chef-advisor-llm-results.csv", index=False)