Merge pull request #24 from cassiebreviu/main
add intent evaluation
cassiebreviu authored Jan 4, 2024
2 parents 3e28196 + 1fda2bd commit 8e2c5a1
Showing 12 changed files with 466 additions and 6 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/run-chat-eval-pf-pipeline.yml
@@ -67,7 +67,7 @@ jobs:
id: jobMetricAssert
run: |
# NOTE The number after the file is the threshold score to pass the assertion.
- export ASSERT=$(python deployment/llmops-helper/assert.py eval_result.json 3) # NOTE <file>.json is the file name and decimal is the threshold for the assertion
+ export ASSERT=$(python deployment/llmops-helper/assert.py eval_result.json 2) # NOTE <file>.json is the file name and decimal is the threshold for the assertion
echo "::debug::Assert has returned the following value: $ASSERT"
# assert.py will return True or False, but bash expects lowercase.
if ${ASSERT,,} ; then
2 changes: 1 addition & 1 deletion .github/workflows/run-support-eval-pf-pipeline.yml
@@ -67,7 +67,7 @@ jobs:
id: jobMetricAssert
run: |
# NOTE The number after the file is the threshold score to pass the assertion.
- export ASSERT=$(python deployment/llmops-helper/assert.py eval_result.json 3) # NOTE <file>.json is the file name and decimal is the threshold for the assertion
+ export ASSERT=$(python deployment/llmops-helper/assert.py eval_result.json 2) # NOTE <file>.json is the file name and decimal is the threshold for the assertion
echo "::debug::Assert has returned the following value: $ASSERT"
# assert.py will return True or False, but bash expects lowercase.
if ${ASSERT,,} ; then
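Both workflow edits lower the passing threshold from 3 to 2. The `deployment/llmops-helper/assert.py` script itself is not part of this diff; the sketch below only illustrates the contract the workflow comments describe (read a score from the result JSON, compare it against the threshold, print `True` or `False` for the `${ASSERT,,}` check). The layout of `eval_result.json` and the averaging step are assumptions.

# Hypothetical sketch of an assert.py matching the workflow's usage; not the repo's actual script.
import json
import sys

def main() -> None:
    result_file, threshold = sys.argv[1], float(sys.argv[2])
    with open(result_file) as f:
        metrics = json.load(f)
    # Assumption: the result file maps metric names to numbers; average them into one score.
    values = [v for v in metrics.values() if isinstance(v, (int, float))]
    score = sum(values) / len(values)
    # Print True/False so the workflow can lowercase the value and branch on it.
    print(score >= threshold)

if __name__ == "__main__":
    main()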
7 changes: 5 additions & 2 deletions contoso-intent/flow.dag.yaml
@@ -1,5 +1,5 @@
-id: template_chat_flow
-name: Template Chat Flow
+id: intent_flow
+name: Intent Flow
environment:
python_requirements_txt: requirements.txt
inputs:
@@ -18,6 +18,9 @@ outputs:
    type: string
    reference: ${run_chat_or_support.output}
    is_chat_output: true
+  intent_context:
+    type: string
+    reference: ${classify_intent_llm.output}
nodes:
- name: classify_intent_prompt
type: prompt
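The second hunk exposes the classifier's decision as a new `intent_context` output alongside the chat answer. Below is a minimal sketch of reading both outputs from a local test run; the flow path and output names come from this diff (and mirror the notebook added later in this commit), while the sample inputs are illustrative.

# Read both flow outputs after a local test run; assumes this runs from the eval/ directory.
from promptflow import PFClient

pf_client = PFClient()
result = pf_client.test(
    flow="../contoso-intent",
    inputs={"chat_history": [], "question": "Can I return my leaky tent?", "customerId": "8"},
)
print("intent:", result["intent_context"])   # new output wired to classify_intent_llm
print("answer:", "".join(result["answer"]))  # chat output, joined from streamed chunks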
12 changes: 12 additions & 0 deletions data/intenttestdata.jsonl
@@ -0,0 +1,12 @@
{"customerId": "7", "chat_history": [], "question": "what is the temperature rating of my sleeping bag?", "intent": "support"}
{"customerId": "7", "chat_history": [], "question": "what is the temperature rating of the cozynights sleeping bag?", "intent": "support"}
{"customerId": "8", "chat_history": [], "question": "what is the waterproof rating of the tent I bought?", "intent": "support"}
{"customerId": "8", "chat_history": [], "question": "what is the waterproof rating of the TrailMaster X4 Tent's rainfly?", "intent": "support"}
{"customerId": "2", "question": "What is your return or exchange policy?", "chat_history": [], "intent": "support" }
{"customerId": "6", "chat_history": [], "question": "is the jacket I bought machine washable?", "intent": "support"}
{"customerId": "8", "chat_history": [], "question": "I would like to return the tent I bought. It is used but I still want to return it since the roof leaks.", "intent": "support"}
{ "customerId": "4", "question": "tell me about your hiking jackets", "chat_history": [], "intent": "chat"}
{ "customerId": "1", "question": "Do you have any climbing gear?", "chat_history": [], "intent": "chat" }
{ "customerId": "3", "question": "Can you tell me about your selection of tents?", "chat_history": [], "intent": "chat" }
{ "customerId": "6", "question": "Do you have any hiking boots?", "chat_history": [], "intent": "chat" }
{ "customerId": "2", "question": "What gear do you recommend for hiking?", "chat_history": [], "intent": "chat" }
1 change: 0 additions & 1 deletion data/supporttestdata.jsonl
@@ -1,7 +1,6 @@
{"customerId": "7", "chat_history": [], "question": "what is the temperature rating of my sleeping bag?"}
{"customerId": "7", "chat_history": [], "question": "what is the temperature rating of the cozynights sleeping bag?"}
{"customerId": "8", "chat_history": [], "question": "what is the waterproof rating of the tent I bought?"}
{"customerId": "8", "chat_history": [], "question": "what is the waterproof rating of the TrailMaster X4 Tent's rainfly?"}
{"customerId": "2", "question": "What is your return or exchange policy?", "chat_history": [] }
{"customerId": "6", "chat_history": [], "question": "is the jacket I bought machine washable?"}
{"customerId": "8", "chat_history": [], "question": "I would like to return the tent I bought. It is used but I still want to return it since the roof leaks."}
330 changes: 330 additions & 0 deletions eval/evaluate-intent-prompt-flow.ipynb
@@ -0,0 +1,330 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Local Evaluation - Groundedness\n",
"\n",
"After you have setup and configured the prompt flow, its time to evaluation its performance. Here we can use the prompt flow SDK to test different questions and see how the prompt flow performs using the evaluation prompt flows provided."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from promptflow import PFClient\n",
"pf_client = PFClient()\n",
"\n",
"from dotenv import load_dotenv\n",
"\n",
"from pathlib import Path\n",
"load_dotenv(Path(\"../local.env\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Add a question to test the base prompt flow.\n",
"question = \"How do I wash the jacket I purchased?\"\n",
"customerId = \"4\"\n",
"output = pf_client.test(\n",
" flow=\"../contoso-intent\", # Path to the flow directory\n",
" inputs={ # Inputs to the flow\n",
" \"chat_history\": [],\n",
" \"question\": question,\n",
" \"customerId\": customerId,\n",
" },\n",
")\n",
"\n",
"output[\"answer\"] = \"\".join(list(output[\"answer\"]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"output"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Test the groundedness of the prompt flow with the answer from the above question."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"test = pf_client.test(\n",
" flow=\"intent_eval\",\n",
" inputs={\n",
" \"question\": question,\n",
" \"prediction\": str(output[\"intent_context\"]),\n",
" \"groundtruth\": \"support\",\n",
" },\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"test"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# AI Studio Azure batch run on an evaluation json dataset\n",
"\n",
"Now in order to test these more thoroughly, we can use the Azure AI Studio to run batches of test data with the evaluation prompt flow on a larger dataset."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"# Import required libraries\n",
"from promptflow.azure import PFClient\n",
"\n",
"# Import required libraries\n",
"from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
" credential = DefaultAzureCredential()\n",
" # Check if given credential can get token successfully.\n",
" credential.get_token(\"https://management.azure.com/.default\")\n",
"except Exception as ex:\n",
" # Fall back to InteractiveBrowserCredential in case DefaultAzureCredential not work\n",
" credential = InteractiveBrowserCredential()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Populate the `config.json` file with the subscription_id, resource_group, and workspace_name."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"config_path = \"../config.json\"\n",
"pf_azure_client = PFClient.from_config(credential=credential, path=config_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Add the runtime from the AI Studio that will be used for the cloud batch runs."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Update the runtime to the name of the runtime you created previously\n",
"runtime = \"automatic\"\n",
"# load flow\n",
"flow = \"../contoso-intent\"\n",
"# load data\n",
"data = \"../data/intenttestdata.jsonl\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# get current time stamp for run name\n",
"import datetime\n",
"now = datetime.datetime.now()\n",
"timestamp = now.strftime(\"%Y_%m_%d_%H%M%S\")\n",
"run_name = timestamp+\"_intent_base_run\"\n",
"print(run_name)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a base run to use as the variant for the evaluation runs. \n",
"\n",
"_NOTE: If you get \"'An existing connection was forcibly closed by the remote host'\" run the cell again._"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# create base run in Azure Ai Studio\n",
"base_run = pf_azure_client.run(\n",
" flow=flow,\n",
" data=data,\n",
" column_mapping={\n",
" # reference data\n",
" \"customerId\": \"${data.customerId}\",\n",
" \"question\": \"${data.question}\",\n",
" },\n",
" runtime=runtime,\n",
" display_name=run_name,\n",
" name=run_name\n",
")\n",
"print(base_run)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pf_azure_client.stream(base_run)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"details = pf_azure_client.get_details(base_run)\n",
"details.head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Cloud Eval run on Json Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"eval_flow = \"intent_eval/\"\n",
"run_name = timestamp+\"intent_eval_run\"\n",
"print(run_name)\n",
"\n",
"eval_run_variant = pf_azure_client.run(\n",
" flow=eval_flow,\n",
" data=data, # path to the data file\n",
" run=base_run, # use run as the variant\n",
" column_mapping={\n",
" # reference data\n",
" \"question\": \"${data.question}\",\n",
" \"groundtruth\": \"${data.intent}\",\n",
" \"prediction\": \"${run.intent_context}\",\n",
" },\n",
" runtime=runtime,\n",
" display_name=run_name,\n",
" name=run_name\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pf_azure_client.stream(eval_run_variant)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"details = pf_azure_client.get_details(eval_run_variant)\n",
"details.head(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"metrics = pf_azure_client.get_metrics(eval_run_variant)\n",
"print(json.dumps(metrics, indent=4))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pf_azure_client.visualize([base_run, eval_run_variant])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
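The notebook pulls aggregate metrics from the cloud run with `get_metrics`. For a local cross-check, the details DataFrame can be scored directly. The sketch below continues the notebook session above; the flattened `inputs.*` column names are an assumption about how promptflow names columns in run details, so inspect `details.columns` and adjust if they differ.

# Local cross-check of the cloud metric: per-row groundtruth vs. prediction match rate.
details = pf_azure_client.get_details(eval_run_variant)
print(details.columns.tolist())  # confirm the actual column names first

# Assumed column names; adjust to whatever details.columns actually shows.
matches = details["inputs.groundtruth"].str.strip() == details["inputs.prediction"].str.strip()
print(f"intent accuracy: {matches.mean():.2%} ({int(matches.sum())}/{len(matches)})")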
2 changes: 1 addition & 1 deletion eval/evaluate-support-prompt-flow.ipynb
@@ -37,7 +37,7 @@
"metadata": {},
"outputs": [],
"source": [
"rag_flow_baseline = \"../contoso-support/rag-flow-baseline\"\n",
"rag_flow_baseline = \"../contoso-support\"\n",
"eval_flow = \"\""
]
},