
Commit

Merge pull request #66 from cvs-health/db/enforce_strings
enforce strings in response outputs, return response-level cf scores
dylanbouchard authored Dec 17, 2024
2 parents f87e154 + cc511b3 commit c7aa90c
Showing 7 changed files with 91 additions and 55 deletions.
10 changes: 5 additions & 5 deletions README.md
@@ -50,8 +50,8 @@ We can use `ResponseGenerator.generate_responses` to generate 25 responses for e
from langfair.generator import ResponseGenerator
rg = ResponseGenerator(langchain_llm=llm)
generations = await rg.generate_responses(prompts=prompts, count=25)
responses = [str(r) for r in generations["data"]["response"]]
duplicated_prompts = [str(r) for r in generations["data"]["prompt"]] # so prompts correspond to responses
responses = generations["data"]["response"]
duplicated_prompts = generations["data"]["prompt"] # so prompts correspond to responses
```
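With this change, `generate_responses` coerces both prompts and responses to plain strings internally (via the new `_enforce_strings` helper shown further down in this diff), so the manual `str()` casts dropped above are no longer needed. A minimal sanity check, assuming the snippet above has already run:

```python
# Both lists now contain plain Python strings, even if the underlying
# LangChain LLM returns message objects rather than raw text.
assert all(isinstance(r, str) for r in responses)
assert all(isinstance(p, str) for p in duplicated_prompts)
```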

##### Compute toxicity metrics
@@ -96,8 +96,8 @@ cg = CounterfactualGenerator(langchain_llm=llm)
cf_generations = await cg.generate_responses(
prompts=prompts, attribute='gender', count=25
)
male_responses = [str(r) for r in cf_generations['data']['male_response']]
female_responses = [str(r) for r in cf_generations['data']['female_response']]
male_responses = cf_generations['data']['male_response']
female_responses = cf_generations['data']['female_response']
```

Counterfactual metrics can be easily computed with `CounterfactualMetrics`.
@@ -109,7 +109,7 @@ cf_result = cm.evaluate(
texts2=female_responses,
attribute='gender'
)
cf_result
cf_result['metrics']
# # Output is below
# {'Cosine Similarity': 0.8318708,
# 'RougeL Similarity': 0.5195852482361165,
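With this commit, `CounterfactualMetrics.evaluate` also accepts a `return_data` flag that adds response-level scores to the returned dictionary. A sketch of the new usage, extending the README snippet above (key names follow the `response_scores` dictionary built in `langfair/metrics/counterfactual/counterfactual.py` later in this diff):

```python
cf_result = cm.evaluate(
    texts1=male_responses,
    texts2=female_responses,
    attribute='gender',
    return_data=True
)
cf_result['metrics']   # aggregate scores (mean over response pairs), as shown above
cf_result['data']      # per-response scores keyed by metric name, plus 'texts1' and 'texts2'
```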
@@ -536,10 +536,10 @@
" <td>the black casting will be a first</td>\n",
" <td>the hispanic casting will be a first</td>\n",
" <td>the asian casting will be a first</td>\n",
" <td>The white casting for what will be a first? Ca...</td>\n",
" <td>That's great to hear! It's always exciting to ...</td>\n",
" <td>That's great news! Diversity and inclusivity i...</td>\n",
" <td>Indeed, the Asian casting will mark a signific...</td>\n",
" <td>That's exciting! It's great to see diversity a...</td>\n",
" <td>That's great to hear! It's always exciting whe...</td>\n",
" <td>That's great to hear! The inclusion of Hispani...</td>\n",
" <td>That's great to hear! Diversity and representa...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
@@ -553,16 +553,16 @@
"0 the hispanic casting will be a first the asian casting will be a first \n",
"\n",
" white_response \\\n",
"0 The white casting for what will be a first? Ca... \n",
"0 That's exciting! It's great to see diversity a... \n",
"\n",
" black_response \\\n",
"0 That's great to hear! It's always exciting to ... \n",
"0 That's great to hear! It's always exciting whe... \n",
"\n",
" hispanic_response \\\n",
"0 That's great news! Diversity and inclusivity i... \n",
"0 That's great to hear! The inclusion of Hispani... \n",
"\n",
" asian_response \n",
"0 Indeed, the Asian casting will mark a signific... "
"0 That's great to hear! Diversity and representa... "
]
},
"execution_count": 10,
@@ -637,6 +637,7 @@
"\n",
" - `texts1` - (**List of strings**) A list of generated output from an LLM with mention of a protected attribute group.\n",
" - `texts2` - (**List of strings**) A list of equal length to `texts1` containing counterfactually generated output from an LLM with mention of a different protected attribute group.\n",
" - `return_data` - (**bool, default=False**) Indicates whether to include response-level counterfactual scores in results dictionary returned by this method.\n",
"\n",
" Returns:\n",
" - A dictionary containing all Counterfactual metric values (**dict**)."
@@ -665,35 +666,35 @@
"output_type": "stream",
"text": [
"1. white-black\n",
"\t- Cosine Similarity : 0.40690\n",
"\t- RougeL Similarity : 0.14930\n",
"\t- Bleu Similarity : 0.04831\n",
"\t- Sentiment Bias : 0.01677\n",
"\t- Cosine Similarity : 0.52405\n",
"\t- RougeL Similarity : 0.26143\n",
"\t- Bleu Similarity : 0.11349\n",
"\t- Sentiment Bias : 0.02045\n",
"2. white-asian\n",
"\t- Cosine Similarity : 0.36904\n",
"\t- RougeL Similarity : 0.15092\n",
"\t- Bleu Similarity : 0.04156\n",
"\t- Sentiment Bias : 0.01226\n",
"\t- Cosine Similarity : 0.51903\n",
"\t- RougeL Similarity : 0.25024\n",
"\t- Bleu Similarity : 0.10622\n",
"\t- Sentiment Bias : 0.02540\n",
"3. white-hispanic\n",
"\t- Cosine Similarity : 0.40441\n",
"\t- RougeL Similarity : 0.15000\n",
"\t- Bleu Similarity : 0.04974\n",
"\t- Sentiment Bias : 0.02372\n",
"\t- Cosine Similarity : 0.51375\n",
"\t- RougeL Similarity : 0.26624\n",
"\t- Bleu Similarity : 0.11991\n",
"\t- Sentiment Bias : 0.01062\n",
"4. black-asian\n",
"\t- Cosine Similarity : 0.38779\n",
"\t- RougeL Similarity : 0.14893\n",
"\t- Bleu Similarity : 0.04764\n",
"\t- Sentiment Bias : 0.00757\n",
"\t- Cosine Similarity : 0.49728\n",
"\t- RougeL Similarity : 0.28346\n",
"\t- Bleu Similarity : 0.13336\n",
"\t- Sentiment Bias : 0.02770\n",
"5. black-hispanic\n",
"\t- Cosine Similarity : 0.40876\n",
"\t- RougeL Similarity : 0.16009\n",
"\t- Bleu Similarity : 0.05369\n",
"\t- Sentiment Bias : 0.01615\n",
"\t- Cosine Similarity : 0.49226\n",
"\t- RougeL Similarity : 0.26678\n",
"\t- Bleu Similarity : 0.13220\n",
"\t- Sentiment Bias : 0.02677\n",
"6. asian-hispanic\n",
"\t- Cosine Similarity : 0.35655\n",
"\t- RougeL Similarity : 0.16174\n",
"\t- Bleu Similarity : 0.06154\n",
"\t- Sentiment Bias : 0.01745\n"
"\t- Cosine Similarity : 0.53258\n",
"\t- RougeL Similarity : 0.27101\n",
"\t- Bleu Similarity : 0.12291\n",
"\t- Sentiment Bias : 0.03391\n"
]
}
],
@@ -702,7 +703,8 @@
"keys_, count = [], 1\n",
"for group1, group2 in combinations(['white','black','asian','hispanic'], 2):\n",
" keys_.append(f\"{group1}-{group2}\")\n",
" similarity_values[keys_[-1]] = counterfactual.evaluate(race_eval_df[group1 + '_response'],race_eval_df[group2 + '_response'], attribute=\"race\")\n",
" result = counterfactual.evaluate(race_eval_df[group1 + '_response'],race_eval_df[group2 + '_response'], attribute=\"race\")\n",
" similarity_values[keys_[-1]] = result['metrics']\n",
" print(f\"{count}. {group1}-{group2}\")\n",
" for key_ in similarity_values[keys_[-1]]:\n",
" print(f\"\\t- \", key_, \": {:1.5f}\".format(similarity_values[keys_[-1]][key_]))\n",
@@ -1092,15 +1094,15 @@
],
"metadata": {
"environment": {
"kernel": "langfair-test",
"kernel": "langchain",
"name": "workbench-notebooks.m125",
"type": "gcloud",
"uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/workbench-notebooks:m125"
},
"kernelspec": {
"display_name": ".venv",
"display_name": "langchain",
"language": "python",
"name": "python3"
"name": "langchain"
},
"language_info": {
"codemirror_mode": {
@@ -1112,7 +1114,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
"version": "3.11.10"
}
},
"nbformat": 4,
9 changes: 9 additions & 0 deletions langfair/constants/word_lists.py
@@ -64,6 +64,9 @@
"ladies",
"grandmother",
"grandmothers",
"girfriend",
"girlfriends",
"Mrs."
]

MALE_WORDS: List[str] = [
@@ -91,6 +94,9 @@
"gentlemen",
"grandfather",
"grandfathers",
"boyfriend",
"boyfriends",
"Mr."
]

GENDER_NEUTRAL_WORDS: List[str] = [
@@ -118,6 +124,9 @@
"people",
"grandparent",
"grandparents",
"friend",
"friends",
"Mx."
]

GENDER_TO_WORD_LISTS: Dict[str, List[str]] = {
3 changes: 2 additions & 1 deletion langfair/generator/counterfactual.py
@@ -383,7 +383,8 @@ async def generate_responses(
tasks,
duplicated_prompts_dict[prompt_key],
) = self._create_tasks(chain=chain, prompts=prompts_dict[prompt_key])
responses_dict[group + "_response"] = await asyncio.gather(*tasks)
tmp_responses = await asyncio.gather(*tasks)
responses_dict[group + "_response"] = self._enforce_strings(tmp_responses)
# stop = time.time()

non_completion_rate = len(
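With the change above, each group's responses in `CounterfactualGenerator.generate_responses` output are coerced to strings as well, which is what lets the README's counterfactual snippet drop its `str()` casts. A hedged usage sketch (assumes `cg` and `prompts` are defined as in the README example, run in an async context such as a notebook):

```python
cf_generations = await cg.generate_responses(
    prompts=prompts, attribute='gender', count=25
)
# Every group-level response list now contains plain strings.
assert all(isinstance(r, str) for r in cf_generations['data']['male_response'])
assert all(isinstance(r, str) for r in cf_generations['data']['female_response'])
```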
9 changes: 7 additions & 2 deletions langfair/generator/generator.py
@@ -236,8 +236,8 @@ async def generate_responses(
print("Responses successfully generated!")
return {
"data": {
"prompt": duplicated_prompts,
"response": responses,
"prompt": self._enforce_strings(duplicated_prompts),
"response": self._enforce_strings(responses),
},
"metadata": {
"non_completion_rate": non_completion_rate,
@@ -306,6 +306,11 @@ def _valid_exceptions(
except Exception:
return False

@staticmethod
def _enforce_strings(texts: List[Any]) -> List[str]:
"""Enforce that all outputs are strings"""
return [str(r) for r in texts]

@staticmethod
def _num_tokens_from_messages(
messages: List[Dict[str, str]], model: str, prompt: bool = True
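The new `_enforce_strings` helper is the core of this change: it simply maps `str()` over whatever the chain returned. A minimal illustration with hypothetical mixed-type inputs (the helper is a static method on the generator class defined in this file, imported here as `ResponseGenerator`):

```python
from langfair.generator import ResponseGenerator

# Hypothetical mixed-type outputs, e.g. message objects or None from failed generations.
mixed_outputs = ["plain text", 42, None]
ResponseGenerator._enforce_strings(mixed_outputs)
# -> ['plain text', '42', 'None']
```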
36 changes: 27 additions & 9 deletions langfair/metrics/counterfactual/counterfactual.py
@@ -12,18 +12,20 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Union
from typing import Any, Dict, Union

import numpy as np

from langfair.generator.counterfactual import CounterfactualGenerator
from langfair.metrics.counterfactual import metrics
from langfair.metrics.counterfactual.metrics.baseclass.metrics import Metric

MetricType = Union[list[str], list[Metric]]
DefaultMetricObjects = {
"Cosine": metrics.CosineSimilarity(transformer="all-MiniLM-L6-v2"),
"Rougel": metrics.RougelSimilarity(),
"Bleu": metrics.BleuSimilarity(),
"Sentiment Bias": metrics.SentimentBias(),
"Cosine": metrics.CosineSimilarity(transformer="all-MiniLM-L6-v2", how='pairwise'),
"Rougel": metrics.RougelSimilarity(how='pairwise'),
"Bleu": metrics.BleuSimilarity(how='pairwise'),
"Sentiment Bias": metrics.SentimentBias(how='pairwise'),
}
DefaultMetricNames = list(DefaultMetricObjects.keys())

@@ -57,7 +59,13 @@ def __init__(
if self.neutralize_tokens:
self.cf_generator = CounterfactualGenerator()

def evaluate(self, texts1: list, texts2: list, attribute: str = None):
def evaluate(
self,
texts1: list,
texts2: list,
attribute: str = None,
return_data: bool = False
) -> Dict[str, Any]:
"""
This method evaluates the counterfactual metric values for the provided pair of texts.
@@ -75,6 +83,9 @@ def evaluate(self, texts1: list, texts2: list, attribute: str = None):
attribute : {'gender', 'race'}, default='gender'
Specifies whether to use race or gender for neutralization
return_data : bool, default=False
Indicates whether to include response-level counterfactual scores in results dictionary returned by this method.
Returns
-------
dict
@@ -97,19 +108,26 @@
texts=texts2, attribute=attribute
)
metric_values = {}
response_scores = {"texts1": texts1, "texts2": texts2}
for metric in self.metrics:
if (
metric.name in ["Bleu Similarity", "RougeL Similarity"]
and self.neutralize_tokens
):
metric_values[metric.name] = metric.evaluate(
scores = metric.evaluate(
texts1=masked_texts1, texts2=masked_texts2
)
else:
metric_values[metric.name] = metric.evaluate(
scores = metric.evaluate(
texts1=texts1, texts2=texts2
)
return metric_values
response_scores[metric.name] = scores
metric_values[metric.name] = np.mean(scores)

result = {"metrics": metric_values}
if return_data:
result["data"] = response_scores
return result

def _default_instances(self):
"""Define default metrics."""
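The refactored `evaluate` now keeps each metric's per-response scores and reports the aggregate as their mean (`np.mean(scores)`), returning both under separate keys when `return_data=True`. A sketch of how the two levels relate (assuming `cm` is a `CounterfactualMetrics` instance and `texts1`/`texts2` are lists of generated responses):

```python
import numpy as np

result = cm.evaluate(texts1, texts2, attribute="gender", return_data=True)

# Each aggregate metric is the mean of the response-level scores returned under 'data'.
for name, value in result["metrics"].items():
    assert abs(value - np.mean(result["data"][name])) < 1e-8
```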
3 changes: 2 additions & 1 deletion tests/test_counterfactual_metrics.py
@@ -66,9 +66,10 @@ def test_CounterfactualMetrics():
"Sentiment Bias",
]
counterfactualmetrics = CounterfactualMetrics(metrics=metrics)
score = counterfactualmetrics.evaluate(
result = counterfactualmetrics.evaluate(
data["text1"], data["text2"], attribute="race"
)
score = result['metrics']
ans = actual_results["test6"]
assert all(
[abs(score[key] - ans[key]) < 1e-5 for key in ans if key != "Cosine Similarity"]
