diff --git a/.github/workflows/evaluations.yaml b/.github/workflows/evaluations.yaml index 4591e0b7..ecdcf8bc 100644 --- a/.github/workflows/evaluations.yaml +++ b/.github/workflows/evaluations.yaml @@ -66,13 +66,13 @@ jobs: - name: evaluate working-directory: ./src/api run: | - python -m evaluators.evaluate + python -m evaluate - name: Upload eval results as build artifact uses: actions/upload-artifact@v4 with: name: eval_result - path: ./src/api/evaluators/eval_results.jsonl + path: ./src/api/eval_results.jsonl - name: GitHub Summary Step if: ${{ success() }} diff --git a/.gitignore b/.gitignore index 39420933..45c9575b 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,7 @@ src/api/evaluators/result.jsonl src/api/evaluators/eval_results.jsonl src/api/evaluators/eval_results.md src/api/evaluators/.runs/* +src/api/result_evaluated.jsonl +src/api/result.jsonl +src/api/eval_results.jsonl +src/api/eval_results.md diff --git a/azure.yaml b/azure.yaml index 8b3814b0..53aac722 100644 --- a/azure.yaml +++ b/azure.yaml @@ -14,4 +14,34 @@ hooks: interactive: true run: infra/hooks/postprovision.ps1 infra: - provider: "bicep" \ No newline at end of file + provider: "bicep" + +pipeline: + variables: + - APPINSIGHTS_CONNECTIONSTRING + - AZURE_CONTAINER_ENVIRONMENT_NAME + - AZURE_CONTAINER_REGISTRY_ENDPOINT + - AZURE_CONTAINER_REGISTRY_NAME + - AZURE_COSMOS_NAME + - AZURE_EMBEDDING_NAME + - AZURE_ENV_NAME + - AZURE_LOCATION + - AZURE_OPENAI_API_VERSION + - AZURE_OPENAI_CHAT_DEPLOYMENT + - AZURE_OPENAI_ENDPOINT + - AZURE_OPENAI_NAME + - AZURE_OPENAI_RESOURCE_GROUP_LOCATION + - AZURE_OPENAI_RESOURCE_GROUP_LOCATION + - AZURE_RESOURCE_GROUP + - AZURE_SEARCH_ENDPOINT + - AZURE_SEARCH_NAME + - AZURE_SUBSCRIPTION_ID + - COSMOS_CONTAINER + - COSMOS_ENDPOINT + - OPENAI_TYPE + - SERVICE_ACA_IMAGE_NAME + - SERVICE_ACA_NAME + - SERVICE_ACA_URI + + secrets: + - BING_SEARCH_KEY diff --git a/src/.dockerignore b/src/.dockerignore index 24a736dd..168d6981 100644 --- a/src/.dockerignore +++ b/src/.dockerignore @@ -1,3 +1,3 @@ .git* .venv/ -**/*.pyc +**/*.pyc \ No newline at end of file diff --git a/src/api/contoso_chat/chat.prompty b/src/api/contoso_chat/chat.prompty index 64a97957..dd284cb1 100644 --- a/src/api/contoso_chat/chat.prompty +++ b/src/api/contoso_chat/chat.prompty @@ -8,7 +8,6 @@ model: configuration: type: azure_openai azure_deployment: gpt-35-turbo - azure_endpoint: ${ENV:AZURE_OPENAI_ENDPOINT} api_version: 2023-07-01-preview parameters: max_tokens: 128 diff --git a/src/api/evaluate-chat-flow.ipynb b/src/api/evaluate-chat-flow.ipynb new file mode 100644 index 00000000..4e15513a --- /dev/null +++ b/src/api/evaluate-chat-flow.ipynb @@ -0,0 +1,178 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import json\n", + "import prompty\n", + "from evaluators.custom_evals.coherence import coherence_evaluation\n", + "from evaluators.custom_evals.relevance import relevance_evaluation\n", + "from evaluators.custom_evals.fluency import fluency_evaluation\n", + "from evaluators.custom_evals.groundedness import groundedness_evaluation\n", + "import jsonlines\n", + "import pandas as pd\n", + "from contoso_chat.chat_request import get_response" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get output from data and save to results jsonl file" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def load_data():\n", + " data_path = 
\"./evaluators/data.jsonl\"\n", + "\n", + " df = pd.read_json(data_path, lines=True)\n", + " df.head()\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "def create_response_data(df):\n", + " results = []\n", + "\n", + " for index, row in df.iterrows():\n", + " customerId = row['customerId']\n", + " question = row['question']\n", + " \n", + " # Run contoso-chat/chat_request flow to get response\n", + " response = get_response(customerId=customerId, question=question, chat_history=[])\n", + " print(response)\n", + " \n", + " # Add results to list\n", + " result = {\n", + " 'question': question,\n", + " 'context': response[\"context\"],\n", + " 'answer': response[\"answer\"]\n", + " }\n", + " results.append(result)\n", + "\n", + " # Save results to a JSONL file\n", + " with open('result.jsonl', 'w') as file:\n", + " for result in results:\n", + " file.write(json.dumps(result) + '\\n')\n", + " return results" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def evaluate():\n", + " # Evaluate results from results file\n", + " results_path = 'result.jsonl'\n", + " results = []\n", + " with open(results_path, 'r') as file:\n", + " for line in file:\n", + " print(line)\n", + " results.append(json.loads(line))\n", + "\n", + " for result in results:\n", + " question = result['question']\n", + " context = result['context']\n", + " answer = result['answer']\n", + " \n", + " groundedness_score = groundedness_evaluation(question=question, answer=answer, context=context)\n", + " fluency_score = fluency_evaluation(question=question, answer=answer, context=context)\n", + " coherence_score = coherence_evaluation(question=question, answer=answer, context=context)\n", + " relevance_score = relevance_evaluation(question=question, answer=answer, context=context)\n", + " \n", + " result['groundedness'] = groundedness_score\n", + " result['fluency'] = fluency_score\n", + " result['coherence'] = coherence_score\n", + " result['relevance'] = relevance_score\n", + "\n", + " # Save results to a JSONL file\n", + " with open('result_evaluated.jsonl', 'w') as file:\n", + " for result in results:\n", + " file.write(json.dumps(result) + '\\n')\n", + "\n", + " with jsonlines.open('eval_results.jsonl', 'w') as writer:\n", + " writer.write(results)\n", + " # Print results\n", + "\n", + " df = pd.read_json('result_evaluated.jsonl', lines=True)\n", + " df.head()\n", + " \n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def create_summary(df):\n", + " print(\"Evaluation summary:\\n\")\n", + " print(df)\n", + " # drop question, context and answer\n", + " mean_df = df.drop([\"question\", \"context\", \"answer\"], axis=1).mean()\n", + " print(\"\\nAverage scores:\")\n", + " print(mean_df)\n", + " df.to_markdown('eval_results.md')\n", + " with open('eval_results.md', 'a') as file:\n", + " file.write(\"\\n\\nAverages scores:\\n\\n\")\n", + " mean_df.to_markdown('eval_results.md', 'a')\n", + "\n", + " print(\"Results saved to result_evaluated.jsonl\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create main funciton for python script\n", + "if __name__ == \"__main__\":\n", + "\n", + " test_data_df = load_data()\n", + " response_results = create_response_data(test_data_df)\n", + " result_evaluated = evaluate()\n", + " 
create_summary(result_evaluated)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pf-prompty", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/api/evaluate.py b/src/api/evaluate.py new file mode 100644 index 00000000..8bdcf6b7 --- /dev/null +++ b/src/api/evaluate.py @@ -0,0 +1,115 @@ +# %% +import os +import json +import prompty +from evaluators.custom_evals.coherence import coherence_evaluation +from evaluators.custom_evals.relevance import relevance_evaluation +from evaluators.custom_evals.fluency import fluency_evaluation +from evaluators.custom_evals.groundedness import groundedness_evaluation +import jsonlines +import pandas as pd +from contoso_chat.chat_request import get_response + +# %% [markdown] +# ## Get output from data and save to results jsonl file + +# %% +def load_data(): + data_path = "./evaluators/data.jsonl" + + df = pd.read_json(data_path, lines=True) + df.head() + return df + +# %% + +def create_response_data(df): + results = [] + + for index, row in df.iterrows(): + customerId = row['customerId'] + question = row['question'] + + # Run contoso-chat/chat_request flow to get response + response = get_response(customerId=customerId, question=question, chat_history=[]) + print(response) + + # Add results to list + result = { + 'question': question, + 'context': response["context"], + 'answer': response["answer"] + } + results.append(result) + + # Save results to a JSONL file + with open('result.jsonl', 'w') as file: + for result in results: + file.write(json.dumps(result) + '\n') + return results + +# %% +def evaluate(): + # Evaluate results from results file + results_path = 'result.jsonl' + results = [] + with open(results_path, 'r') as file: + for line in file: + print(line) + results.append(json.loads(line)) + + for result in results: + question = result['question'] + context = result['context'] + answer = result['answer'] + + groundedness_score = groundedness_evaluation(question=question, answer=answer, context=context) + fluency_score = fluency_evaluation(question=question, answer=answer, context=context) + coherence_score = coherence_evaluation(question=question, answer=answer, context=context) + relevance_score = relevance_evaluation(question=question, answer=answer, context=context) + + result['groundedness'] = groundedness_score + result['fluency'] = fluency_score + result['coherence'] = coherence_score + result['relevance'] = relevance_score + + # Save results to a JSONL file + with open('result_evaluated.jsonl', 'w') as file: + for result in results: + file.write(json.dumps(result) + '\n') + + with jsonlines.open('eval_results.jsonl', 'w') as writer: + writer.write(results) + # Print results + + df = pd.read_json('result_evaluated.jsonl', lines=True) + df.head() + + return df + +# %% +def create_summary(df): + print("Evaluation summary:\n") + print(df) + # drop question, context and answer + mean_df = df.drop(["question", "context", "answer"], axis=1).mean() + print("\nAverage scores:") + print(mean_df) + df.to_markdown('eval_results.md') + with open('eval_results.md', 'a') as file: + file.write("\n\nAverages scores:\n\n") + mean_df.to_markdown('eval_results.md', 'a') + + print("Results saved to result_evaluated.jsonl") + +# %% 
+# create main funciton for python script +if __name__ == "__main__": + + test_data_df = load_data() + response_results = create_response_data(test_data_df) + result_evaluated = evaluate() + create_summary(result_evaluated) + + + diff --git a/src/api/evaluators/custom_evals/coherence.prompty b/src/api/evaluators/custom_evals/coherence.prompty index 696b599a..cca996c8 100644 --- a/src/api/evaluators/custom_evals/coherence.prompty +++ b/src/api/evaluators/custom_evals/coherence.prompty @@ -1,6 +1,6 @@ --- name: QnA Coherence Evaluation -description: Compute the coherence of the answer base on the question using llm. +description: Evaluates coherence score for QA scenario model: api: chat configuration: @@ -22,10 +22,10 @@ sample: context: Track lighting, invented by Lightolier, was popular at one period of time because it was much easier to install than recessed lighting, and individual fixtures are decorative and can be easily aimed at a wall. It has regained some popularity recently in low-voltage tracks, which often look nothing like their predecessors because they do not have the safety issues that line-voltage systems have, and are therefore less bulky and more ornamental in themselves. A master transformer feeds all of the fixtures on the track or rod with 12 or 24 volts, instead of each light fixture having its own line-to-low voltage transformer. There are traditional spots and floods, as well as other small hanging fixtures. A modified version of this is cable lighting, where lights are hung from or clipped to bare metal cables under tension answer: The main transformer is the object that feeds all the fixtures in low voltage tracks. --- -System: -You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. +system: +You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. You should return a single integer value between 1 to 5 representing the evaluation metric. You will include no other text or information. -User: +user: Coherence of an answer is measured by how well all the sentences fit together and sound naturally as a whole. Consider the overall quality of the answer when evaluating coherence. Given the question and answer, score the coherence of answer between one to five stars using the following rating scale: One star: the answer completely lacks coherence Two stars: the answer mostly lacks coherence diff --git a/src/api/evaluators/custom_evals/fluency.prompty b/src/api/evaluators/custom_evals/fluency.prompty index 696b599a..77fa0e13 100644 --- a/src/api/evaluators/custom_evals/fluency.prompty +++ b/src/api/evaluators/custom_evals/fluency.prompty @@ -1,6 +1,6 @@ --- -name: QnA Coherence Evaluation -description: Compute the coherence of the answer base on the question using llm. +name: QnA Fluency Evaluation +description: Evaluates fluency score for QA scenario model: api: chat configuration: @@ -22,37 +22,36 @@ sample: context: Track lighting, invented by Lightolier, was popular at one period of time because it was much easier to install than recessed lighting, and individual fixtures are decorative and can be easily aimed at a wall. 
It has regained some popularity recently in low-voltage tracks, which often look nothing like their predecessors because they do not have the safety issues that line-voltage systems have, and are therefore less bulky and more ornamental in themselves. A master transformer feeds all of the fixtures on the track or rod with 12 or 24 volts, instead of each light fixture having its own line-to-low voltage transformer. There are traditional spots and floods, as well as other small hanging fixtures. A modified version of this is cable lighting, where lights are hung from or clipped to bare metal cables under tension answer: The main transformer is the object that feeds all the fixtures in low voltage tracks. --- -System: -You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. - -User: -Coherence of an answer is measured by how well all the sentences fit together and sound naturally as a whole. Consider the overall quality of the answer when evaluating coherence. Given the question and answer, score the coherence of answer between one to five stars using the following rating scale: -One star: the answer completely lacks coherence -Two stars: the answer mostly lacks coherence -Three stars: the answer is partially coherent -Four stars: the answer is mostly coherent -Five stars: the answer has perfect coherency +system: +You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. You should return a single integer value between 1 to 5 representing the evaluation metric. You will include no other text or information. +user: +Fluency measures the quality of individual sentences in the answer, and whether they are well-written and grammatically correct. Consider the quality of individual sentences when evaluating fluency. Given the question and answer, score the fluency of the answer between one to five stars using the following rating scale: +One star: the answer completely lacks fluency +Two stars: the answer mostly lacks fluency +Three stars: the answer is partially fluent +Four stars: the answer is mostly fluent +Five stars: the answer has perfect fluency This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. -question: What is your favorite indoor activity and why do you enjoy it? -answer: I like pizza. The sun is shining. +question: What did you have for breakfast today? +answer: Breakfast today, me eating cereal and orange juice very good. stars: 1 -question: Can you describe your favorite movie without giving away any spoilers? -answer: It is a science fiction movie. There are dinosaurs. The actors eat cake. People must stop the villain. +question: How do you feel when you travel alone? +answer: Alone travel, nervous, but excited also. I feel adventure and like its time. stars: 2 -question: What are some benefits of regular exercise? -answer: Regular exercise improves your mood. A good workout also helps you sleep better. Trees are green. +question: When was the last time you went on a family vacation? +answer: Last family vacation, it took place in last summer. We traveled to a beach destination, very fun. 
stars: 3 -question: How do you cope with stress in your daily life? -answer: I usually go for a walk to clear my head. Listening to music helps me relax as well. Stress is a part of life, but we can manage it through some activities. +question: What is your favorite thing about your job? +answer: My favorite aspect of my job is the chance to interact with diverse people. I am constantly learning from their experiences and stories. stars: 4 -question: What can you tell me about climate change and its effects on the environment? -answer: Climate change has far-reaching effects on the environment. Rising temperatures result in the melting of polar ice caps, contributing to sea-level rise. Additionally, more frequent and severe weather events, such as hurricanes and heatwaves, can cause disruption to ecosystems and human societies alike. +question: Can you describe your morning routine? +answer: Every morning, I wake up at 6 am, drink a glass of water, and do some light stretching. After that, I take a shower and get dressed for work. Then, I have a healthy breakfast, usually consisting of oatmeal and fruits, before leaving the house around 7:30 am. stars: 5 question: {{question}} diff --git a/src/api/evaluators/custom_evals/groundedness.prompty b/src/api/evaluators/custom_evals/groundedness.prompty index 22abead8..e067b179 100644 --- a/src/api/evaluators/custom_evals/groundedness.prompty +++ b/src/api/evaluators/custom_evals/groundedness.prompty @@ -22,38 +22,31 @@ sample: context: Track lighting, invented by Lightolier, was popular at one period of time because it was much easier to install than recessed lighting, and individual fixtures are decorative and can be easily aimed at a wall. It has regained some popularity recently in low-voltage tracks, which often look nothing like their predecessors because they do not have the safety issues that line-voltage systems have, and are therefore less bulky and more ornamental in themselves. A master transformer feeds all of the fixtures on the track or rod with 12 or 24 volts, instead of each light fixture having its own line-to-low voltage transformer. There are traditional spots and floods, as well as other small hanging fixtures. A modified version of this is cable lighting, where lights are hung from or clipped to bare metal cables under tension answer: The main transformer is the object that feeds all the fixtures in low voltage tracks. --- -System: -You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. -User: -You will be presented with a context and an ANSWER about that context. You need to decide whether the ANSWER is entailed by the context by choosing one of the following rating: -1. 5: The ANSWER follows logically from the information contained in the context. -2. 1: The ANSWER is logically false from the information contained in the context. -3. an integer score between 1 and 5 and if such integer score does not exists, use 1: It is not possible to determine whether the ANSWER is true or false without further information. - -Read the passage of information thoroughly and select the correct answer from the three answer labels. Read the context thoroughly to ensure you know what the context entails. - -Note the ANSWER is generated by a computer system, it can contain certain symbols, which should not be a negative factor in the evaluation. 
+system: +You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. You should return a single integer value between 1 to 5 representing the evaluation metric. You will include no other text or information. +user: +You will be presented with a CONTEXT and an ANSWER about that CONTEXT. You need to decide whether the ANSWER is entailed by the CONTEXT by choosing one of the following rating: +1. 5: The ANSWER follows logically from the information contained in the CONTEXT. +2. 1: The ANSWER is logically false from the information contained in the CONTEXT. +3. an integer score between 1 and 5 and if such integer score does not exist, use 1: It is not possible to determine whether the ANSWER is true or false without further information. Read the passage of information thoroughly and select the correct answer from the three answer labels. Read the CONTEXT thoroughly to ensure you know what the CONTEXT entails. Note the ANSWER is generated by a computer system, it can contain certain symbols, which should not be a negative factor in the evaluation. Independent Examples: ## Example Task #1 Input: -{"context": "The Academy Awards, also known as the Oscars are awards for artistic and technical merit for the film industry. They are presented annually by the Academy of Motion Picture Arts and Sciences, in recognition of excellence in cinematic achievements as assessed by the Academy's voting membership. The Academy Awards are regarded by many as the most prestigious, significant awards in the entertainment industry in the United States and worldwide.", "ANSWER": "Oscar is presented every other two years"} +{"CONTEXT": "Some are reported as not having been wanted at all.", "QUESTION": "", "ANSWER": "All are reported as being completely and fully wanted."} ## Example Task #1 Output: 1 ## Example Task #2 Input: -{"context": "The Academy Awards, also known as the Oscars are awards for artistic and technical merit for the film industry. They are presented annually by the Academy of Motion Picture Arts and Sciences, in recognition of excellence in cinematic achievements as assessed by the Academy's voting membership. The Academy Awards are regarded by many as the most prestigious, significant awards in the entertainment industry in the United States and worldwide.", "ANSWER": "Oscar is very important awards in the entertainment industry in the United States. And it's also significant worldwide"} +{"CONTEXT": "Ten new television shows appeared during the month of September. Five of the shows were sitcoms, three were hourlong dramas, and two were news-magazine shows. By January, only seven of these new shows were still on the air. 
Five of the shows that remained were sitcoms.", "QUESTION": "", "ANSWER": "At least one of the shows that were cancelled was an hourlong drama."} ## Example Task #2 Output: 5 ## Example Task #3 Input: -{"context": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is neither French nor English.", "ANSWER": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is not French."} +{"CONTEXT": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is neither French nor English.", "QUESTION": "", "ANSWER": "In Quebec, an allophone is a resident, usually an immigrant, whose mother tongue or home language is not French."} ## Example Task #3 Output: 5 ## Example Task #4 Input: -{"context": "Some are reported as not having been wanted at all.", "ANSWER": "All are reported as being completely and fully wanted."} +{"CONTEXT": "Some are reported as not having been wanted at all.", "QUESTION": "", "ANSWER": "All are reported as being completely and fully wanted."} ## Example Task #4 Output: 1 - -Reminder: The return values for each task should be correctly formatted as an integer between 1 and 5. Do not repeat the context. - ## Actual Task Input: -question: {{question}} -answer: {{answer}} -stars: \ No newline at end of file +{"CONTEXT": {{context}}, "QUESTION": "", "ANSWER": {{answer}}} +Reminder: The return values for each task should be correctly formatted as an integer between 1 and 5. Do not repeat the context and question. +Actual Task Output: \ No newline at end of file diff --git a/src/api/evaluators/custom_evals/relevance.prompty b/src/api/evaluators/custom_evals/relevance.prompty index 696b599a..9518bbd7 100644 --- a/src/api/evaluators/custom_evals/relevance.prompty +++ b/src/api/evaluators/custom_evals/relevance.prompty @@ -1,6 +1,6 @@ --- -name: QnA Coherence Evaluation -description: Compute the coherence of the answer base on the question using llm. +name: QnA Relevance Evaluation +description: Evaluates relevance score for QA scenario model: api: chat configuration: @@ -22,39 +22,44 @@ sample: context: Track lighting, invented by Lightolier, was popular at one period of time because it was much easier to install than recessed lighting, and individual fixtures are decorative and can be easily aimed at a wall. It has regained some popularity recently in low-voltage tracks, which often look nothing like their predecessors because they do not have the safety issues that line-voltage systems have, and are therefore less bulky and more ornamental in themselves. A master transformer feeds all of the fixtures on the track or rod with 12 or 24 volts, instead of each light fixture having its own line-to-low voltage transformer. There are traditional spots and floods, as well as other small hanging fixtures. A modified version of this is cable lighting, where lights are hung from or clipped to bare metal cables under tension answer: The main transformer is the object that feeds all the fixtures in low voltage tracks. --- -System: -You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. - -User: -Coherence of an answer is measured by how well all the sentences fit together and sound naturally as a whole. Consider the overall quality of the answer when evaluating coherence. 
Given the question and answer, score the coherence of answer between one to five stars using the following rating scale: -One star: the answer completely lacks coherence -Two stars: the answer mostly lacks coherence -Three stars: the answer is partially coherent -Four stars: the answer is mostly coherent -Five stars: the answer has perfect coherency +system: +You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. You should return a single integer value between 1 to 5 representing the evaluation metric. You will include no other text or information. +user: +Relevance measures how well the answer addresses the main aspects of the question, based on the context. Consider whether all and only the important aspects are contained in the answer when evaluating relevance. Given the context and question, score the relevance of the answer between one to five stars using the following rating scale: +One star: the answer completely lacks relevance +Two stars: the answer mostly lacks relevance +Three stars: the answer is partially relevant +Four stars: the answer is mostly relevant +Five stars: the answer has perfect relevance This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. -question: What is your favorite indoor activity and why do you enjoy it? -answer: I like pizza. The sun is shining. +context: Marie Curie was a Polish-born physicist and chemist who pioneered research on radioactivity and was the first woman to win a Nobel Prize. +question: What field did Marie Curie excel in? +answer: Marie Curie was a renowned painter who focused mainly on impressionist styles and techniques. stars: 1 -question: Can you describe your favorite movie without giving away any spoilers? -answer: It is a science fiction movie. There are dinosaurs. The actors eat cake. People must stop the villain. +context: The Beatles were an English rock band formed in Liverpool in 1960, and they are widely regarded as the most influential music band in history. +question: Where were The Beatles formed? +answer: The band The Beatles began their journey in London, England, and they changed the history of music. stars: 2 -question: What are some benefits of regular exercise? -answer: Regular exercise improves your mood. A good workout also helps you sleep better. Trees are green. +context: The recent Mars rover, Perseverance, was launched in 2020 with the main goal of searching for signs of ancient life on Mars. The rover also carries an experiment called MOXIE, which aims to generate oxygen from the Martian atmosphere. +question: What are the main goals of Perseverance Mars rover mission? +answer: The Perseverance Mars rover mission focuses on searching for signs of ancient life on Mars. stars: 3 -question: How do you cope with stress in your daily life? -answer: I usually go for a walk to clear my head. Listening to music helps me relax as well. Stress is a part of life, but we can manage it through some activities. +context: The Mediterranean diet is a commonly recommended dietary plan that emphasizes fruits, vegetables, whole grains, legumes, lean proteins, and healthy fats. Studies have shown that it offers numerous health benefits, including a reduced risk of heart disease and improved cognitive health. +question: What are the main components of the Mediterranean diet? 
+answer: The Mediterranean diet primarily consists of fruits, vegetables, whole grains, and legumes. stars: 4 -question: What can you tell me about climate change and its effects on the environment? -answer: Climate change has far-reaching effects on the environment. Rising temperatures result in the melting of polar ice caps, contributing to sea-level rise. Additionally, more frequent and severe weather events, such as hurricanes and heatwaves, can cause disruption to ecosystems and human societies alike. +context: The Queen's Royal Castle is a well-known tourist attraction in the United Kingdom. It spans over 500 acres and contains extensive gardens and parks. The castle was built in the 15th century and has been home to generations of royalty. +question: What are the main attractions of the Queen's Royal Castle? +answer: The main attractions of the Queen's Royal Castle are its expansive 500-acre grounds, extensive gardens, parks, and the historical castle itself, which dates back to the 15th century and has housed generations of royalty. stars: 5 +context: {{context}} question: {{question}} answer: {{answer}} stars: \ No newline at end of file diff --git a/src/api/evaluators/evaluate-chat-flow.ipynb b/src/api/evaluators/evaluate-chat-flow.ipynb deleted file mode 100644 index c14b7301..00000000 --- a/src/api/evaluators/evaluate-chat-flow.ipynb +++ /dev/null @@ -1,166 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import json\n", - "import prompty\n", - "from custom_evals.coherence import coherence_evaluation\n", - "from custom_evals.relevance import relevance_evaluation\n", - "from custom_evals.fluency import fluency_evaluation\n", - "from custom_evals.groundedness import groundedness_evaluation\n", - "import jsonlines" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Get output from data and save to results jsonl file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "data_path = \"data.jsonl\"\n", - "\n", - "df = pd.read_json(data_path, lines=True)\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# import python file from /workspaces/contoso-chat/src/api/contoso_chat/chat_request.py\n", - "import sys\n", - "sys.path.append('/workspaces/contoso-chat/src/api/contoso_chat')\n", - "from chat_request import get_response\n", - "\n", - "results = []\n", - "\n", - "for index, row in df.iterrows():\n", - " customerId = row['customerId']\n", - " question = row['question']\n", - " \n", - " # Run contoso-chat/chat_request flow to get response\n", - " response = get_response(customerId=customerId, question=question, chat_history=[])\n", - " print(response)\n", - " \n", - " # Add results to list\n", - " result = {\n", - " 'question': question,\n", - " 'context': response[\"context\"],\n", - " 'answer': response[\"answer\"]\n", - " }\n", - " results.append(result)\n", - "\n", - "# Save results to a JSONL file\n", - "with open('result.jsonl', 'w') as file:\n", - " for result in results:\n", - " file.write(json.dumps(result) + '\\n')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Evaluate results from results file\n", - "results_path = 'result.jsonl'\n", - "results = []\n", - "with open(results_path, 'r') as file:\n", - " for 
line in file:\n", - " print(line)\n", - " results.append(json.loads(line))\n", - "\n", - "for result in results:\n", - " question = result['question']\n", - " context = result['context']\n", - " answer = result['answer']\n", - " \n", - " groundedness_score = groundedness_evaluation(question=question, answer=answer, context=context)\n", - " fluency_score = fluency_evaluation(question=question, answer=answer, context=context)\n", - " coherence_score = coherence_evaluation(question=question, answer=answer, context=context)\n", - " relevance_score = relevance_evaluation(question=question, answer=answer, context=context)\n", - " \n", - " result['groundedness'] = groundedness_score\n", - " result['fluency'] = fluency_score\n", - " result['coherence'] = coherence_score\n", - " result['relevance'] = relevance_score\n", - "\n", - "# Save results to a JSONL file\n", - "with open('result_evaluated.jsonl', 'w') as file:\n", - " for result in results:\n", - " file.write(json.dumps(result) + '\\n')\n", - "\n", - "# Print results\n", - "\n", - "df = pd.read_json('result_evaluated.jsonl', lines=True)\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(\"Evaluation summary:\\n\")\n", - "print(df)\n", - "# drop question, context and answer\n", - "mean_df = df.drop([\"question\", \"context\", \"answer\"], axis=1).mean()\n", - "print(\"\\nAverage scores:\")\n", - "print(mean_df)\n", - "df.to_markdown('eval_results.md')\n", - "with open('eval_results.md', 'a') as file:\n", - " file.write(\"\\n\\nAverages scores:\\n\\n\")\n", - "mean_df.to_markdown('eval_results.md', 'a')\n", - "with jsonlines.open('eval_results.jsonl', 'w') as writer:\n", - " writer.write(results)\n", - "\n", - "print(\"Results saved to result_evaluated.jsonl\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "pf-prompty", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/src/api/evaluators/evaluate.py b/src/api/evaluators/evaluate.py deleted file mode 100644 index 1eab9ba0..00000000 --- a/src/api/evaluators/evaluate.py +++ /dev/null @@ -1,104 +0,0 @@ -# %% -import os -import json -import prompty -from custom_evals.coherence import coherence_evaluation -from custom_evals.relevance import relevance_evaluation -from custom_evals.fluency import fluency_evaluation -from custom_evals.groundedness import groundedness_evaluation -import jsonlines - -# %% [markdown] -# ## Get output from data and save to results jsonl file - -# %% -import pandas as pd - -data_path = "data.jsonl" - -df = pd.read_json(data_path, lines=True) -df.head() - -# %% -# import python file from /workspaces/contoso-chat/src/api/contoso_chat/chat_request.py -import sys -sys.path.append('/workspaces/contoso-chat/src/api/contoso_chat') -from chat_request import get_response - -results = [] - -for index, row in df.iterrows(): - customerId = row['customerId'] - question = row['question'] - - # Run contoso-chat/chat_request flow to get response - response = get_response(customerId=customerId, question=question, chat_history=[]) - print(response) - - # Add results to 
list - result = { - 'question': question, - 'context': response["context"], - 'answer': response["answer"] - } - results.append(result) - -# Save results to a JSONL file -with open('result.jsonl', 'w') as file: - for result in results: - file.write(json.dumps(result) + '\n') - -# %% -# Evaluate results from results file -results_path = 'result.jsonl' -results = [] -with open(results_path, 'r') as file: - for line in file: - print(line) - results.append(json.loads(line)) - -for result in results: - question = result['question'] - context = result['context'] - answer = result['answer'] - - groundedness_score = groundedness_evaluation(question=question, answer=answer, context=context) - fluency_score = fluency_evaluation(question=question, answer=answer, context=context) - coherence_score = coherence_evaluation(question=question, answer=answer, context=context) - relevance_score = relevance_evaluation(question=question, answer=answer, context=context) - - result['groundedness'] = groundedness_score - result['fluency'] = fluency_score - result['coherence'] = coherence_score - result['relevance'] = relevance_score - -# Save results to a JSONL file -with open('result_evaluated.jsonl', 'w') as file: - for result in results: - file.write(json.dumps(result) + '\n') - -# Print results - -df = pd.read_json('result_evaluated.jsonl', lines=True) -df.head() - -# %% -print("Evaluation summary:\n") -print(df) -# drop question, context and answer -mean_df = df.drop(["question", "context", "answer"], axis=1).mean() -print("\nAverage scores:") -print(mean_df) -df.to_markdown('eval_results.md') -with open('eval_results.md', 'a') as file: - file.write("\n\nAverages scores:\n\n") -mean_df.to_markdown('eval_results.md', 'a') -with jsonlines.open('eval_results.jsonl', 'w') as writer: - writer.write(results) - -print("Results saved to result_evaluated.jsonl") - -# %% - - -
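
For readers following the relocated evaluation flow, below is a minimal, standalone sketch of the output-writing steps that the new src/api/evaluate.py performs (result_evaluated.jsonl, eval_results.jsonl, and eval_results.md now live under src/api/, matching the workflow and .gitignore changes above). It is an illustration under stated assumptions, not part of the patch: it assumes pandas, tabulate, and jsonlines are installed and that each result dict carries the same fields evaluate.py attaches. One library detail it highlights: jsonlines.Writer.write() serializes its single argument as one line, while write_all() writes one line per item of an iterable.

# Hypothetical standalone sketch mirroring the write/summary steps in evaluate.py.
import json
import jsonlines
import pandas as pd

# Example rows with the fields evaluate.py attaches to each result.
results = [
    {"question": "q", "context": "c", "answer": "a",
     "groundedness": 5, "fluency": 4, "coherence": 5, "relevance": 4},
]

# result_evaluated.jsonl: one JSON object per line.
with open("result_evaluated.jsonl", "w") as fh:
    for result in results:
        fh.write(json.dumps(result) + "\n")

# eval_results.jsonl via the jsonlines package; write_all() emits one line
# per item, whereas write(results) would emit the whole list as a single
# JSON array line.
with jsonlines.open("eval_results.jsonl", "w") as writer:
    writer.write_all(results)

# eval_results.md: full table first, then the averaged score columns
# appended to the same file (to_markdown's second argument is the mode;
# to_markdown requires the tabulate package).
df = pd.read_json("result_evaluated.jsonl", lines=True)
mean_scores = df.drop(["question", "context", "answer"], axis=1).mean()
df.to_markdown("eval_results.md")
with open("eval_results.md", "a") as fh:
    fh.write("\n\nAverage scores:\n\n")
mean_scores.to_markdown("eval_results.md", "a")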