diff --git a/.github/workflows/run-chat-eval-pf-pipeline.yml b/.github/workflows/run-chat-eval-pf-pipeline.yml
index 32e46c61..680ea240 100644
--- a/.github/workflows/run-chat-eval-pf-pipeline.yml
+++ b/.github/workflows/run-chat-eval-pf-pipeline.yml
@@ -67,7 +67,7 @@ jobs:
         id: jobMetricAssert
         run: |
           # NOTE The number after the file is the threshold score to pass the assertion.
-          export ASSERT=$(python deployment/llmops-helper/assert.py eval_result.json 3) # NOTE .json is the file name and decimal is the threshold for the assertion
+          export ASSERT=$(python deployment/llmops-helper/assert.py eval_result.json 2) # NOTE .json is the file name and decimal is the threshold for the assertion
           echo "::debug::Assert has returned the following value: $ASSERT"
           # assert.py will return True or False, but bash expects lowercase.
           if ${ASSERT,,} ; then
diff --git a/.github/workflows/run-support-eval-pf-pipeline.yml b/.github/workflows/run-support-eval-pf-pipeline.yml
index 1f7fae88..bcff03f9 100644
--- a/.github/workflows/run-support-eval-pf-pipeline.yml
+++ b/.github/workflows/run-support-eval-pf-pipeline.yml
@@ -67,7 +67,7 @@ jobs:
         id: jobMetricAssert
         run: |
           # NOTE The number after the file is the threshold score to pass the assertion.
-          export ASSERT=$(python deployment/llmops-helper/assert.py eval_result.json 3) # NOTE .json is the file name and decimal is the threshold for the assertion
+          export ASSERT=$(python deployment/llmops-helper/assert.py eval_result.json 2) # NOTE .json is the file name and decimal is the threshold for the assertion
           echo "::debug::Assert has returned the following value: $ASSERT"
           # assert.py will return True or False, but bash expects lowercase.
           if ${ASSERT,,} ; then
diff --git a/contoso-intent/flow.dag.yaml b/contoso-intent/flow.dag.yaml
index c802d2d0..c3170e37 100644
--- a/contoso-intent/flow.dag.yaml
+++ b/contoso-intent/flow.dag.yaml
@@ -1,5 +1,5 @@
-id: template_chat_flow
-name: Template Chat Flow
+id: intent_flow
+name: Intent Flow
 environment:
   python_requirements_txt: requirements.txt
 inputs:
@@ -18,6 +18,9 @@ outputs:
     type: string
     reference: ${run_chat_or_support.output}
     is_chat_output: true
+  intent_context:
+    type: string
+    reference: ${classify_intent_llm.output}
 nodes:
 - name: classify_intent_prompt
   type: prompt
diff --git a/data/intenttestdata.jsonl b/data/intenttestdata.jsonl
new file mode 100644
index 00000000..9add3145
--- /dev/null
+++ b/data/intenttestdata.jsonl
@@ -0,0 +1,12 @@
+{"customerId": "7", "chat_history": [], "question": "what is the temperature rating of my sleeping bag?", "intent": "support"}
+{"customerId": "7", "chat_history": [], "question": "what is the temperature rating of the cozynights sleeping bag?", "intent": "support"}
+{"customerId": "8", "chat_history": [], "question": "what is the waterproof rating of the tent I bought?", "intent": "support"}
+{"customerId": "8", "chat_history": [], "question": "what is the waterproof rating of the TrailMaster X4 Tent's rainfly?", "intent": "support"}
+{"customerId": "2", "question": "What is your return or exchange policy?", "chat_history": [], "intent": "support" }
+{"customerId": "6", "chat_history": [], "question": "is the jacket I bought machine washable?", "intent": "support"}
+{"customerId": "8", "chat_history": [], "question": "I would like to return the tent I bought. It is used but I still want to return it since the roof leaks.", "intent": "support"}
+{ "customerId": "4", "question": "tell me about your hiking jackets", "chat_history": [], "intent": "chat"}
+{ "customerId": "1", "question": "Do you have any climbing gear?", "chat_history": [], "intent": "chat" }
+{ "customerId": "3", "question": "Can you tell me about your selection of tents?", "chat_history": [], "intent": "chat" }
+{ "customerId": "6", "question": "Do you have any hiking boots?", "chat_history": [], "intent": "chat" }
+{ "customerId": "2", "question": "What gear do you recommend for hiking?", "chat_history": [], "intent": "chat" }
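JSONL is line-delimited, so every record, including the long tent-return question, must stay on a single line or the batch run will fail on a parse error. A minimal pre-flight check for the new dataset, assuming it is run from the repo root:

```python
import json

# Hypothetical sanity check for the new intent test data; adjust the path as needed.
required_keys = {"customerId", "chat_history", "question", "intent"}

with open("data/intenttestdata.jsonl", encoding="utf-8") as f:
    for line_number, line in enumerate(f, start=1):
        record = json.loads(line)  # raises ValueError on a malformed line
        missing = required_keys - record.keys()
        assert not missing, f"line {line_number} is missing {missing}"
        assert record["intent"] in ("chat", "support"), f"line {line_number} has an unexpected intent"
```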
diff --git a/data/supporttestdata.jsonl b/data/supporttestdata.jsonl
index bbdc2cc3..503917ea 100644
--- a/data/supporttestdata.jsonl
+++ b/data/supporttestdata.jsonl
@@ -1,7 +1,6 @@
 {"customerId": "7", "chat_history": [], "question": "what is the temperature rating of my sleeping bag?"}
 {"customerId": "7", "chat_history": [], "question": "what is the temperature rating of the cozynights sleeping bag?"}
 {"customerId": "8", "chat_history": [], "question": "what is the waterproof rating of the tent I bought?"}
-{"customerId": "8", "chat_history": [], "question": "what is the waterproof rating of the TrailMaster X4 Tent's rainfly?"}
 {"customerId": "2", "question": "What is your return or exchange policy?", "chat_history": [] }
 {"customerId": "6", "chat_history": [], "question": "is the jacket I bought machine washable?"}
 {"customerId": "8", "chat_history": [], "question": "I would like to return the tent I bought. It is used but I still want to return it since the roof leaks."}
diff --git a/eval/evaluate-intent-prompt-flow.ipynb b/eval/evaluate-intent-prompt-flow.ipynb
new file mode 100644
index 00000000..0e65ed6a
--- /dev/null
+++ b/eval/evaluate-intent-prompt-flow.ipynb
@@ -0,0 +1,330 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Local Evaluation - Intent\n",
+    "\n",
+    "After you have set up and configured the prompt flow, it's time to evaluate its performance. Here we can use the prompt flow SDK to test different questions and see how the prompt flow performs, using the evaluation prompt flows provided."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from promptflow import PFClient\n",
+    "pf_client = PFClient()\n",
+    "\n",
+    "from dotenv import load_dotenv\n",
+    "\n",
+    "from pathlib import Path\n",
+    "load_dotenv(Path(\"../local.env\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Add a question to test the base prompt flow.\n",
+    "question = \"How do I wash the jacket I purchased?\"\n",
+    "customerId = \"4\"\n",
+    "output = pf_client.test(\n",
+    "    flow=\"../contoso-intent\", # Path to the flow directory\n",
+    "    inputs={ # Inputs to the flow\n",
+    "        \"chat_history\": [],\n",
+    "        \"question\": question,\n",
+    "        \"customerId\": customerId,\n",
+    "    },\n",
+    ")\n",
+    "\n",
+    "output[\"answer\"] = \"\".join(list(output[\"answer\"]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "output"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Test the intent classification of the prompt flow with the intent returned for the above question."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test = pf_client.test(\n",
+    "    flow=\"intent_eval\",\n",
+    "    inputs={\n",
+    "        \"question\": question,\n",
+    "        \"prediction\": str(output[\"intent_context\"]),\n",
+    "        \"groundtruth\": \"support\",\n",
+    "    },\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Azure AI Studio batch run on an evaluation JSON dataset\n",
+    "\n",
+    "Now, in order to test more thoroughly, we can use Azure AI Studio to run batches of test data through the flow and score them with the evaluation prompt flow on a larger dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "# Import required libraries\n",
+    "from promptflow.azure import PFClient\n",
+    "\n",
+    "# Import Azure credential classes\n",
+    "from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "try:\n",
+    "    credential = DefaultAzureCredential()\n",
+    "    # Check if the given credential can get a token successfully.\n",
+    "    credential.get_token(\"https://management.azure.com/.default\")\n",
+    "except Exception as ex:\n",
+    "    # Fall back to InteractiveBrowserCredential in case DefaultAzureCredential does not work.\n",
+    "    credential = InteractiveBrowserCredential()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Populate the `config.json` file with the subscription_id, resource_group, and workspace_name."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "config_path = \"../config.json\"\n",
+    "pf_azure_client = PFClient.from_config(credential=credential, path=config_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Set the runtime from AI Studio that will be used for the cloud batch runs."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Update the runtime to the name of the runtime you created previously\n",
+    "runtime = \"automatic\"\n",
+    "# load flow\n",
+    "flow = \"../contoso-intent\"\n",
+    "# load data\n",
+    "data = \"../data/intenttestdata.jsonl\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get current time stamp for run name\n",
+    "import datetime\n",
+    "now = datetime.datetime.now()\n",
+    "timestamp = now.strftime(\"%Y_%m_%d_%H%M%S\")\n",
+    "run_name = timestamp+\"_intent_base_run\"\n",
+    "print(run_name)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Create a base run to use as the variant for the evaluation runs.\n",
+    "\n",
+    "_NOTE: If you get \"An existing connection was forcibly closed by the remote host\", run the cell again._"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create the base run in Azure AI Studio\n",
+    "base_run = pf_azure_client.run(\n",
+    "    flow=flow,\n",
+    "    data=data,\n",
+    "    column_mapping={\n",
+    "        # reference data\n",
+    "        \"customerId\": \"${data.customerId}\",\n",
+    "        \"question\": \"${data.question}\",\n",
+    "    },\n",
+    "    runtime=runtime,\n",
+    "    display_name=run_name,\n",
+    "    name=run_name\n",
+    ")\n",
+    "print(base_run)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pf_azure_client.stream(base_run)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "details = pf_azure_client.get_details(base_run)\n",
+    "details.head(10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Cloud Eval Run on JSON Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "eval_flow = \"intent_eval/\"\n",
+    "run_name = timestamp+\"_intent_eval_run\"\n",
+    "print(run_name)\n",
+    "\n",
+    "eval_run_variant = pf_azure_client.run(\n",
+    "    flow=eval_flow,\n",
+    "    data=data, # path to the data file\n",
+    "    run=base_run, # use run as the variant\n",
+    "    column_mapping={\n",
+    "        # reference data\n",
+    "        \"question\": \"${data.question}\",\n",
+    "        \"groundtruth\": \"${data.intent}\",\n",
+    "        \"prediction\": \"${run.intent_context}\",\n",
+    "    },\n",
+    "    runtime=runtime,\n",
+    "    display_name=run_name,\n",
+    "    name=run_name\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pf_azure_client.stream(eval_run_variant)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "details = pf_azure_client.get_details(eval_run_variant)\n",
+    "details.head(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "metrics = pf_azure_client.get_metrics(eval_run_variant)\n",
+    "print(json.dumps(metrics, indent=4))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pf_azure_client.visualize([base_run, eval_run_variant])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.18"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
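Once the eval run finishes, `get_metrics` should surface the accuracy logged by the eval flow. As a rough local counterpart to the CI gate in the workflow files at the top of this diff, the metrics dict can be checked against a threshold; a sketch that assumes the "accuracy" key (0-100 scale) logged by get_accuracy.py below:

```python
# Hypothetical local gate; "accuracy" is the key logged by the intent_eval flow,
# and the 80.0 bar is illustrative, not the CI threshold.
threshold = 80.0

accuracy = metrics.get("accuracy")
assert accuracy is not None, "intent_eval did not log an accuracy metric"
print(f"accuracy={accuracy:.1f} passes={accuracy >= threshold}")
```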
\n", + "\n", + "_NOTE: If you get \"'An existing connection was forcibly closed by the remote host'\" run the cell again._" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create base run in Azure Ai Studio\n", + "base_run = pf_azure_client.run(\n", + " flow=flow,\n", + " data=data,\n", + " column_mapping={\n", + " # reference data\n", + " \"customerId\": \"${data.customerId}\",\n", + " \"question\": \"${data.question}\",\n", + " },\n", + " runtime=runtime,\n", + " display_name=run_name,\n", + " name=run_name\n", + ")\n", + "print(base_run)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pf_azure_client.stream(base_run)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "details = pf_azure_client.get_details(base_run)\n", + "details.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Cloud Eval run on Json Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "eval_flow = \"intent_eval/\"\n", + "run_name = timestamp+\"intent_eval_run\"\n", + "print(run_name)\n", + "\n", + "eval_run_variant = pf_azure_client.run(\n", + " flow=eval_flow,\n", + " data=data, # path to the data file\n", + " run=base_run, # use run as the variant\n", + " column_mapping={\n", + " # reference data\n", + " \"question\": \"${data.question}\",\n", + " \"groundtruth\": \"${data.intent}\",\n", + " \"prediction\": \"${run.intent_context}\",\n", + " },\n", + " runtime=runtime,\n", + " display_name=run_name,\n", + " name=run_name\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pf_azure_client.stream(eval_run_variant)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "details = pf_azure_client.get_details(eval_run_variant)\n", + "details.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "metrics = pf_azure_client.get_metrics(eval_run_variant)\n", + "print(json.dumps(metrics, indent=4))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pf_azure_client.visualize([base_run, eval_run_variant])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/eval/evaluate-support-prompt-flow.ipynb b/eval/evaluate-support-prompt-flow.ipynb index c45adb8d..5830805e 100644 --- a/eval/evaluate-support-prompt-flow.ipynb +++ b/eval/evaluate-support-prompt-flow.ipynb @@ -37,7 +37,7 @@ "metadata": {}, "outputs": [], "source": [ - "rag_flow_baseline = \"../contoso-support/rag-flow-baseline\"\n", + "rag_flow_baseline = \"../contoso-support\"\n", "eval_flow = \"\"" ] }, diff --git a/eval/intent_eval/assert_value.py b/eval/intent_eval/assert_value.py new file mode 100644 index 
diff --git a/eval/intent_eval/assert_value.py b/eval/intent_eval/assert_value.py
new file mode 100644
index 00000000..65caaa85
--- /dev/null
+++ b/eval/intent_eval/assert_value.py
@@ -0,0 +1,18 @@
+from promptflow import tool
+
+
+@tool
+def assert_value(groundtruth: str, prediction: str):
+    """
+    This tool grades the prediction for a single line and returns "True" or "False".
+
+    :param groundtruth: the expected "chat" or "support" value for a single line.
+    :param prediction: the prediction produced by the gpt-35-turbo model.
+    """
+    # Check whether the prediction contains the groundtruth label.
+    if groundtruth in prediction:
+        return "True"
+    else:
+        return "False"
+
+
diff --git a/eval/intent_eval/flow.dag.yaml b/eval/intent_eval/flow.dag.yaml
new file mode 100644
index 00000000..e1df5d59
--- /dev/null
+++ b/eval/intent_eval/flow.dag.yaml
@@ -0,0 +1,42 @@
+id: intent_eval_flow
+name: Intent Evaluation Flow
+environment:
+  python_requirements_txt: requirements.txt
+inputs:
+  groundtruth:
+    type: string
+    default: support
+  question:
+    type: string
+    default: What was in my last order
+outputs:
+  results:
+    type: string
+    reference: ${assert_value.output}
+nodes:
+- name: llm_call
+  type: llm
+  source:
+    type: code
+    path: intent.jinja2
+  inputs:
+    deployment_name: gpt-35-turbo
+    question: ${inputs.question}
+  connection: aoai-connection
+  api: chat
+- name: assert_value
+  type: python
+  source:
+    type: code
+    path: assert_value.py
+  inputs:
+    groundtruth: ${inputs.groundtruth}
+    prediction: ${llm_call.output}
+- name: get_accuracy
+  type: python
+  source:
+    type: code
+    path: get_accuracy.py
+  inputs:
+    processed_results: ${assert_value.output}
+  aggregation: true
diff --git a/eval/intent_eval/get_accuracy.py b/eval/intent_eval/get_accuracy.py
new file mode 100644
index 00000000..02cad10d
--- /dev/null
+++ b/eval/intent_eval/get_accuracy.py
@@ -0,0 +1,26 @@
+from typing import List
+from promptflow import log_metric, tool
+
+
+@tool
+def get_accuracy(processed_results: List[str]):
+    """
+    This tool aggregates the processed results of all lines and logs an accuracy metric.
+
+    :param processed_results: list of "True"/"False" outputs from the assert_value node, one per line.
+    """
+
+    # Count the true and false predictions across all lines.
+    true_count = 0
+    false_count = 0
+
+    for result in processed_results:
+        if result == "True":
+            true_count += 1
+        else:
+            false_count += 1
+
+    # Calculate accuracy as a percentage and log it as a run metric.
+    accuracy = (true_count / (true_count + false_count)) * 100
+    log_metric("accuracy", accuracy)
+    return {"accuracy": accuracy}
\ No newline at end of file
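Both tools above are plain functions under promptflow's `@tool` decorator, so they can be unit-tested without running the flow; a small sketch, assuming the repo root is importable (outside a flow run, `log_metric` only emits a warning):

```python
from eval.intent_eval.assert_value import assert_value
from eval.intent_eval.get_accuracy import get_accuracy

# Per-line grading: the prediction only has to contain the groundtruth label.
grades = [
    assert_value("support", "support"),
    assert_value("chat", "The intent is chat."),
    assert_value("support", "chat"),
]
print(grades)                # ['True', 'True', 'False']
print(get_accuracy(grades))  # {'accuracy': 66.66...}
```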
diff --git a/eval/intent_eval/intent.jinja2 b/eval/intent_eval/intent.jinja2
new file mode 100644
index 00000000..60c4d3c4
--- /dev/null
+++ b/eval/intent_eval/intent.jinja2
@@ -0,0 +1,26 @@
+system:
+You're an AI assistant reading the transcript of a conversation between a user and an
+assistant. Given the chat history, customer info, and the user's query, infer the user's intent expressed in the last query.
+
+This value should always be "support" or "chat", so the response should contain only the string "support" or "chat".
+
+Be specific about what the user is asking for, but disregard the parts of the chat history and customer info that are not relevant to the user's intent.
+
+Examples:
+
+question: What was in my last order?
+intent: support
+
+question: What is the status of my order?
+intent: support
+
+question: Can you recommend a 4-person tent?
+intent: chat
+
+question: Can you recommend a pair of shoes?
+intent: chat
+
+question: can you suggest a coat that would go with the shoes I purchased?
+intent: chat
+
+question: {{question}}
\ No newline at end of file
diff --git a/eval/intent_eval/requirements.txt b/eval/intent_eval/requirements.txt
new file mode 100644
index 00000000..e0f4c480
--- /dev/null
+++ b/eval/intent_eval/requirements.txt
@@ -0,0 +1,3 @@
+promptflow
+promptflow-tools
+azure-ai-ml
\ No newline at end of file
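With the template and requirements in place, the eval flow can be smoke-tested end to end using the same `PFClient` pattern the notebook uses; a sketch that assumes the aoai-connection referenced by flow.dag.yaml exists locally (a single-line test, so the aggregation node is skipped):

```python
from promptflow import PFClient

pf_client = PFClient()

# Single-line test of the eval flow: llm_call re-classifies the question and
# assert_value grades it against the groundtruth label.
result = pf_client.test(
    flow="eval/intent_eval",
    inputs={
        "question": "What was in my last order?",
        "groundtruth": "support",
    },
)
print(result)  # e.g. {'results': 'True'}
```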