Merge pull request #24 from cassiebreviu/main
add intent evaluation
cassiebreviu authored Jan 4, 2024
2 parents 3e28196 + 1fda2bd commit 8e2c5a1
Showing 12 changed files with 466 additions and 6 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/run-chat-eval-pf-pipeline.yml
@@ -67,7 +67,7 @@ jobs:
id: jobMetricAssert
run: |
# NOTE The number after the file is the threshold score to pass the assertion.
- export ASSERT=$(python deployment/llmops-helper/assert.py eval_result.json 3) # NOTE <file>.json is the file name and decimal is the threshold for the assertion
+ export ASSERT=$(python deployment/llmops-helper/assert.py eval_result.json 2) # NOTE <file>.json is the file name and decimal is the threshold for the assertion
echo "::debug::Assert has returned the following value: $ASSERT"
# assert.py will return True or False, but bash expects lowercase.
if ${ASSERT,,} ; then
2 changes: 1 addition & 1 deletion .github/workflows/run-support-eval-pf-pipeline.yml
@@ -67,7 +67,7 @@ jobs:
id: jobMetricAssert
run: |
# NOTE The number after the file is the threshold score to pass the assertion.
- export ASSERT=$(python deployment/llmops-helper/assert.py eval_result.json 3) # NOTE <file>.json is the file name and decimal is the threshold for the assertion
+ export ASSERT=$(python deployment/llmops-helper/assert.py eval_result.json 2) # NOTE <file>.json is the file name and decimal is the threshold for the assertion
echo "::debug::Assert has returned the following value: $ASSERT"
# assert.py will return True or False, but bash expects lowercase.
if ${ASSERT,,} ; then
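Both workflow edits lower the passing threshold from 3 to 2. The `deployment/llmops-helper/assert.py` script itself is not part of this diff; the sketch below only illustrates the contract the workflow comments describe (read a score from the result JSON, compare it against the threshold, print `True` or `False` for the `${ASSERT,,}` check). The layout of `eval_result.json` and the averaging step are assumptions.

# Hypothetical sketch of an assert.py matching the workflow's usage; not the repo's actual script.
import json
import sys

def main() -> None:
    result_file, threshold = sys.argv[1], float(sys.argv[2])
    with open(result_file) as f:
        metrics = json.load(f)
    # Assumption: the result file maps metric names to numbers; average them into one score.
    values = [v for v in metrics.values() if isinstance(v, (int, float))]
    score = sum(values) / len(values)
    # Print True/False so the workflow can lowercase the value and branch on it.
    print(score >= threshold)

if __name__ == "__main__":
    main()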
7 changes: 5 additions & 2 deletions contoso-intent/flow.dag.yaml
@@ -1,5 +1,5 @@
-id: template_chat_flow
-name: Template Chat Flow
+id: intent_flow
+name: Intent Flow
environment:
python_requirements_txt: requirements.txt
inputs:
@@ -18,6 +18,9 @@ outputs:
    type: string
    reference: ${run_chat_or_support.output}
    is_chat_output: true
+  intent_context:
+    type: string
+    reference: ${classify_intent_llm.output}
nodes:
- name: classify_intent_prompt
type: prompt
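The second hunk exposes the classifier's decision as a new `intent_context` output alongside the chat answer. Below is a minimal sketch of reading both outputs from a local test run; the flow path and output names come from this diff (and mirror the notebook added later in this commit), while the sample inputs are illustrative.

# Read both flow outputs after a local test run; assumes this runs from the eval/ directory.
from promptflow import PFClient

pf_client = PFClient()
result = pf_client.test(
    flow="../contoso-intent",
    inputs={"chat_history": [], "question": "Can I return my leaky tent?", "customerId": "8"},
)
print("intent:", result["intent_context"])   # new output wired to classify_intent_llm
print("answer:", "".join(result["answer"]))  # chat output, joined from streamed chunks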
12 changes: 12 additions & 0 deletions data/intenttestdata.jsonl
@@ -0,0 +1,12 @@
{"customerId": "7", "chat_history": [], "question": "what is the temperature rating of my sleeping bag?", "intent": "support"}
{"customerId": "7", "chat_history": [], "question": "what is the temperature rating of the cozynights sleeping bag?", "intent": "support"}
{"customerId": "8", "chat_history": [], "question": "what is the waterproof rating of the tent I bought?", "intent": "support"}
{"customerId": "8", "chat_history": [], "question": "what is the waterproof rating of the TrailMaster X4 Tent's rainfly?", "intent": "support"}
{"customerId": "2", "question": "What is your return or exchange policy?", "chat_history": [], "intent": "support" }
{"customerId": "6", "chat_history": [], "question": "is the jacket I bought machine washable?", "intent": "support"}
{"customerId": "8", "chat_history": [], "question": "I would like to return the tent I bought. It is used but I still want to return it since the roof leaks.", "intent": "support"}
{ "customerId": "4", "question": "tell me about your hiking jackets", "chat_history": [], "intent": "chat"}
{ "customerId": "1", "question": "Do you have any climbing gear?", "chat_history": [], "intent": "chat" }
{ "customerId": "3", "question": "Can you tell me about your selection of tents?", "chat_history": [], "intent": "chat" }
{ "customerId": "6", "question": "Do you have any hiking boots?", "chat_history": [], "intent": "chat" }
{ "customerId": "2", "question": "What gear do you recommend for hiking?", "chat_history": [], "intent": "chat" }
1 change: 0 additions & 1 deletion data/supporttestdata.jsonl
@@ -1,7 +1,6 @@
{"customerId": "7", "chat_history": [], "question": "what is the temperature rating of my sleeping bag?"}
{"customerId": "7", "chat_history": [], "question": "what is the temperature rating of the cozynights sleeping bag?"}
{"customerId": "8", "chat_history": [], "question": "what is the waterproof rating of the tent I bought?"}
{"customerId": "8", "chat_history": [], "question": "what is the waterproof rating of the TrailMaster X4 Tent's rainfly?"}
{"customerId": "2", "question": "What is your return or exchange policy?", "chat_history": [] }
{"customerId": "6", "chat_history": [], "question": "is the jacket I bought machine washable?"}
{"customerId": "8", "chat_history": [], "question": "I would like to return the tent I bought. It is used but I still want to return it since the roof leaks."}
330 changes: 330 additions & 0 deletions eval/evaluate-intent-prompt-flow.ipynb
@@ -0,0 +1,330 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Local Evaluation - Groundedness\n",
"\n",
"After you have setup and configured the prompt flow, its time to evaluation its performance. Here we can use the prompt flow SDK to test different questions and see how the prompt flow performs using the evaluation prompt flows provided."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from promptflow import PFClient\n",
"pf_client = PFClient()\n",
"\n",
"from dotenv import load_dotenv\n",
"\n",
"from pathlib import Path\n",
"load_dotenv(Path(\"../local.env\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Add a question to test the base prompt flow.\n",
"question = \"How do I wash the jacket I purchased?\"\n",
"customerId = \"4\"\n",
"output = pf_client.test(\n",
" flow=\"../contoso-intent\", # Path to the flow directory\n",
" inputs={ # Inputs to the flow\n",
" \"chat_history\": [],\n",
" \"question\": question,\n",
" \"customerId\": customerId,\n",
" },\n",
")\n",
"\n",
"output[\"answer\"] = \"\".join(list(output[\"answer\"]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"output"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Test the groundedness of the prompt flow with the answer from the above question."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"test = pf_client.test(\n",
" flow=\"intent_eval\",\n",
" inputs={\n",
" \"question\": question,\n",
" \"prediction\": str(output[\"intent_context\"]),\n",
" \"groundtruth\": \"support\",\n",
" },\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"test"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# AI Studio Azure batch run on an evaluation json dataset\n",
"\n",
"Now in order to test these more thoroughly, we can use the Azure AI Studio to run batches of test data with the evaluation prompt flow on a larger dataset."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"# Import required libraries\n",
"from promptflow.azure import PFClient\n",
"\n",
"# Import required libraries\n",
"from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
" credential = DefaultAzureCredential()\n",
" # Check if given credential can get token successfully.\n",
" credential.get_token(\"https://management.azure.com/.default\")\n",
"except Exception as ex:\n",
" # Fall back to InteractiveBrowserCredential in case DefaultAzureCredential not work\n",
" credential = InteractiveBrowserCredential()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Populate the `config.json` file with the subscription_id, resource_group, and workspace_name."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"config_path = \"../config.json\"\n",
"pf_azure_client = PFClient.from_config(credential=credential, path=config_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Add the runtime from the AI Studio that will be used for the cloud batch runs."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Update the runtime to the name of the runtime you created previously\n",
"runtime = \"automatic\"\n",
"# load flow\n",
"flow = \"../contoso-intent\"\n",
"# load data\n",
"data = \"../data/intenttestdata.jsonl\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# get current time stamp for run name\n",
"import datetime\n",
"now = datetime.datetime.now()\n",
"timestamp = now.strftime(\"%Y_%m_%d_%H%M%S\")\n",
"run_name = timestamp+\"_intent_base_run\"\n",
"print(run_name)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a base run to use as the variant for the evaluation runs. \n",
"\n",
"_NOTE: If you get \"'An existing connection was forcibly closed by the remote host'\" run the cell again._"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# create base run in Azure Ai Studio\n",
"base_run = pf_azure_client.run(\n",
" flow=flow,\n",
" data=data,\n",
" column_mapping={\n",
" # reference data\n",
" \"customerId\": \"${data.customerId}\",\n",
" \"question\": \"${data.question}\",\n",
" },\n",
" runtime=runtime,\n",
" display_name=run_name,\n",
" name=run_name\n",
")\n",
"print(base_run)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pf_azure_client.stream(base_run)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"details = pf_azure_client.get_details(base_run)\n",
"details.head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Cloud Eval run on Json Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"eval_flow = \"intent_eval/\"\n",
"run_name = timestamp+\"intent_eval_run\"\n",
"print(run_name)\n",
"\n",
"eval_run_variant = pf_azure_client.run(\n",
" flow=eval_flow,\n",
" data=data, # path to the data file\n",
" run=base_run, # use run as the variant\n",
" column_mapping={\n",
" # reference data\n",
" \"question\": \"${data.question}\",\n",
" \"groundtruth\": \"${data.intent}\",\n",
" \"prediction\": \"${run.intent_context}\",\n",
" },\n",
" runtime=runtime,\n",
" display_name=run_name,\n",
" name=run_name\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pf_azure_client.stream(eval_run_variant)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"details = pf_azure_client.get_details(eval_run_variant)\n",
"details.head(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"metrics = pf_azure_client.get_metrics(eval_run_variant)\n",
"print(json.dumps(metrics, indent=4))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pf_azure_client.visualize([base_run, eval_run_variant])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
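The notebook pulls aggregate metrics from the cloud run with `get_metrics`. For a local cross-check, the details DataFrame can be scored directly. The sketch below continues the notebook session above; the flattened `inputs.*` column names are an assumption about how promptflow names columns in run details, so inspect `details.columns` and adjust if they differ.

# Local cross-check of the cloud metric: per-row groundtruth vs. prediction match rate.
details = pf_azure_client.get_details(eval_run_variant)
print(details.columns.tolist())  # confirm the actual column names first

# Assumed column names; adjust to whatever details.columns actually shows.
matches = details["inputs.groundtruth"].str.strip() == details["inputs.prediction"].str.strip()
print(f"intent accuracy: {matches.mean():.2%} ({int(matches.sum())}/{len(matches)})")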
2 changes: 1 addition & 1 deletion eval/evaluate-support-prompt-flow.ipynb
@@ -37,7 +37,7 @@
"metadata": {},
"outputs": [],
"source": [
"rag_flow_baseline = \"../contoso-support/rag-flow-baseline\"\n",
"rag_flow_baseline = \"../contoso-support\"\n",
"eval_flow = \"\""
]
},