Commit
* fix evaluations
* Configure Azure Developer Pipeline
* remove env var from chat.prompty
* Configure Azure Developer Pipeline
1 parent b0863c5 · commit a95d01c
Showing 13 changed files with 397 additions and 344 deletions.
@@ -1,3 +1,3 @@
.git*
.venv/
**/*.pyc
**/*.pyc
@@ -0,0 +1,178 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "import prompty\n",
    "from evaluators.custom_evals.coherence import coherence_evaluation\n",
    "from evaluators.custom_evals.relevance import relevance_evaluation\n",
    "from evaluators.custom_evals.fluency import fluency_evaluation\n",
    "from evaluators.custom_evals.groundedness import groundedness_evaluation\n",
    "import jsonlines\n",
    "import pandas as pd\n",
    "from contoso_chat.chat_request import get_response"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Get output from data and save to results jsonl file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_data():\n",
    "    data_path = \"./evaluators/data.jsonl\"\n",
    "\n",
    "    df = pd.read_json(data_path, lines=True)\n",
    "    df.head()\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def create_response_data(df):\n",
    "    results = []\n",
    "\n",
    "    for index, row in df.iterrows():\n",
    "        customerId = row['customerId']\n",
    "        question = row['question']\n",
    "\n",
    "        # Run contoso-chat/chat_request flow to get response\n",
    "        response = get_response(customerId=customerId, question=question, chat_history=[])\n",
    "        print(response)\n",
    "\n",
    "        # Add results to list\n",
    "        result = {\n",
    "            'question': question,\n",
    "            'context': response[\"context\"],\n",
    "            'answer': response[\"answer\"]\n",
    "        }\n",
    "        results.append(result)\n",
    "\n",
    "    # Save results to a JSONL file\n",
    "    with open('result.jsonl', 'w') as file:\n",
    "        for result in results:\n",
    "            file.write(json.dumps(result) + '\\n')\n",
    "    return results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def evaluate():\n",
    "    # Evaluate results from results file\n",
    "    results_path = 'result.jsonl'\n",
    "    results = []\n",
    "    with open(results_path, 'r') as file:\n",
    "        for line in file:\n",
    "            print(line)\n",
    "            results.append(json.loads(line))\n",
    "\n",
    "    for result in results:\n",
    "        question = result['question']\n",
    "        context = result['context']\n",
    "        answer = result['answer']\n",
    "\n",
    "        groundedness_score = groundedness_evaluation(question=question, answer=answer, context=context)\n",
    "        fluency_score = fluency_evaluation(question=question, answer=answer, context=context)\n",
    "        coherence_score = coherence_evaluation(question=question, answer=answer, context=context)\n",
    "        relevance_score = relevance_evaluation(question=question, answer=answer, context=context)\n",
    "\n",
    "        result['groundedness'] = groundedness_score\n",
    "        result['fluency'] = fluency_score\n",
    "        result['coherence'] = coherence_score\n",
    "        result['relevance'] = relevance_score\n",
    "\n",
    "    # Save results to a JSONL file\n",
    "    with open('result_evaluated.jsonl', 'w') as file:\n",
    "        for result in results:\n",
    "            file.write(json.dumps(result) + '\\n')\n",
    "\n",
    "    with jsonlines.open('eval_results.jsonl', 'w') as writer:\n",
    "        writer.write(results)\n",
    "    # Print results\n",
    "\n",
    "    df = pd.read_json('result_evaluated.jsonl', lines=True)\n",
    "    df.head()\n",
    "\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_summary(df):\n",
    "    print(\"Evaluation summary:\\n\")\n",
    "    print(df)\n",
    "    # drop question, context and answer\n",
    "    mean_df = df.drop([\"question\", \"context\", \"answer\"], axis=1).mean()\n",
    "    print(\"\\nAverage scores:\")\n",
    "    print(mean_df)\n",
    "    df.to_markdown('eval_results.md')\n",
    "    with open('eval_results.md', 'a') as file:\n",
    "        file.write(\"\\n\\nAverage scores:\\n\\n\")\n",
    "    mean_df.to_markdown('eval_results.md', 'a')\n",
    "\n",
    "    print(\"Results saved to result_evaluated.jsonl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create main function for python script\n",
    "if __name__ == \"__main__\":\n",
    "\n",
    "    test_data_df = load_data()\n",
    "    response_results = create_response_data(test_data_df)\n",
    "    result_evaluated = evaluate()\n",
    "    create_summary(result_evaluated)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "pf-prompty",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
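For reference, the evaluation run above reads its test questions from ./evaluators/data.jsonl and expects one JSON object per line with at least a customerId and a question field, the only columns create_response_data consumes. Below is a minimal sketch of such a file; the sample values are purely hypothetical, and the customer IDs would need to correspond to records that get_response can look up, which is backing data not included in this diff.

# Hypothetical sketch of the input file expected at ./evaluators/data.jsonl.
# Each line is one JSON object with the two fields the notebook reads.
import json

sample_rows = [
    {"customerId": "1", "question": "What tent would you recommend for cold weather?"},
    {"customerId": "2", "question": "Do you sell hiking boots in wide sizes?"},
]

with open("./evaluators/data.jsonl", "w") as f:
    for row in sample_rows:
        f.write(json.dumps(row) + "\n")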
@@ -0,0 +1,115 @@
# %%
import os
import json
import prompty
from evaluators.custom_evals.coherence import coherence_evaluation
from evaluators.custom_evals.relevance import relevance_evaluation
from evaluators.custom_evals.fluency import fluency_evaluation
from evaluators.custom_evals.groundedness import groundedness_evaluation
import jsonlines
import pandas as pd
from contoso_chat.chat_request import get_response


# %% [markdown]
# ## Get output from data and save to results jsonl file

# %%
def load_data():
    data_path = "./evaluators/data.jsonl"

    df = pd.read_json(data_path, lines=True)
    df.head()
    return df


# %%

def create_response_data(df):
    results = []

    for index, row in df.iterrows():
        customerId = row['customerId']
        question = row['question']

        # Run contoso-chat/chat_request flow to get response
        response = get_response(customerId=customerId, question=question, chat_history=[])
        print(response)

        # Add results to list
        result = {
            'question': question,
            'context': response["context"],
            'answer': response["answer"]
        }
        results.append(result)

    # Save results to a JSONL file
    with open('result.jsonl', 'w') as file:
        for result in results:
            file.write(json.dumps(result) + '\n')
    return results


# %%
def evaluate():
    # Evaluate results from results file
    results_path = 'result.jsonl'
    results = []
    with open(results_path, 'r') as file:
        for line in file:
            print(line)
            results.append(json.loads(line))

    for result in results:
        question = result['question']
        context = result['context']
        answer = result['answer']

        groundedness_score = groundedness_evaluation(question=question, answer=answer, context=context)
        fluency_score = fluency_evaluation(question=question, answer=answer, context=context)
        coherence_score = coherence_evaluation(question=question, answer=answer, context=context)
        relevance_score = relevance_evaluation(question=question, answer=answer, context=context)

        result['groundedness'] = groundedness_score
        result['fluency'] = fluency_score
        result['coherence'] = coherence_score
        result['relevance'] = relevance_score

    # Save results to a JSONL file
    with open('result_evaluated.jsonl', 'w') as file:
        for result in results:
            file.write(json.dumps(result) + '\n')

    with jsonlines.open('eval_results.jsonl', 'w') as writer:
        writer.write(results)
    # Print results

    df = pd.read_json('result_evaluated.jsonl', lines=True)
    df.head()

    return df


# %%
def create_summary(df):
    print("Evaluation summary:\n")
    print(df)
    # drop question, context and answer
    mean_df = df.drop(["question", "context", "answer"], axis=1).mean()
    print("\nAverage scores:")
    print(mean_df)
    df.to_markdown('eval_results.md')
    with open('eval_results.md', 'a') as file:
        file.write("\n\nAverage scores:\n\n")
    mean_df.to_markdown('eval_results.md', 'a')

    print("Results saved to result_evaluated.jsonl")


# %%
# create main function for python script
if __name__ == "__main__":

    test_data_df = load_data()
    response_results = create_response_data(test_data_df)
    result_evaluated = evaluate()
    create_summary(result_evaluated)
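One detail worth noting in evaluate(): calling jsonlines.Writer.write(results) with a list serializes the whole list as a single JSON array on one line, whereas write_all emits one JSON object per line, which is what the .jsonl extension usually implies. The snippet below is not part of this commit; it is just a small illustration of the jsonlines library behavior, using hypothetical file names.

import jsonlines

records = [{"id": 1}, {"id": 2}]

# write() on a list produces a single line containing a JSON array:
# [{"id": 1}, {"id": 2}]
with jsonlines.open("as_one_array.jsonl", "w") as writer:
    writer.write(records)

# write_all() produces one JSON object per line, the conventional JSON Lines layout:
# {"id": 1}
# {"id": 2}
with jsonlines.open("one_per_line.jsonl", "w") as writer:
    writer.write_all(records)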