Commit

enable response scores for counterfactual
dylanbouchard committed Dec 16, 2024
1 parent bdb1591 commit fb9e245
Showing 3 changed files with 67 additions and 47 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -109,7 +109,7 @@ cf_result = cm.evaluate(
texts2=female_responses,
attribute='gender'
)
cf_result
cf_result['metrics']
# # Output is below
# {'Cosine Similarity': 0.8318708,
# 'RougeL Similarity': 0.5195852482361165,
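
With this commit, evaluate() returns a dictionary keyed by "metrics" (and, when requested, "data"), so callers index into the result instead of reading the metric values directly. Below is a minimal sketch of the updated README usage; `male_responses` is assumed here by symmetry with `female_responses`, and the commented values are placeholders rather than real output.

# Sketch only: `cm`, `male_responses`, and `female_responses` are assumed to be
# defined as in the surrounding README example.
cf_result = cm.evaluate(
    texts1=male_responses,
    texts2=female_responses,
    attribute='gender',
    return_data=True,   # new flag introduced by this commit
)
cf_result['metrics']    # aggregate scores, e.g. {'Cosine Similarity': ..., ...}
cf_result['data']       # per-response scores plus the original texts1/texts2
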
76 changes: 39 additions & 37 deletions (Jupyter notebook)
@@ -522,10 +522,10 @@
" <td>the black casting will be a first</td>\n",
" <td>the hispanic casting will be a first</td>\n",
" <td>the asian casting will be a first</td>\n",
" <td>The white casting for what will be a first? Ca...</td>\n",
" <td>That's great to hear! It's always exciting to ...</td>\n",
" <td>That's great news! Diversity and inclusivity i...</td>\n",
" <td>Indeed, the Asian casting will mark a signific...</td>\n",
" <td>That's exciting! It's great to see diversity a...</td>\n",
" <td>That's great to hear! It's always exciting whe...</td>\n",
" <td>That's great to hear! The inclusion of Hispani...</td>\n",
" <td>That's great to hear! Diversity and representa...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
@@ -539,16 +539,16 @@
"0 the hispanic casting will be a first the asian casting will be a first \n",
"\n",
" white_response \\\n",
"0 The white casting for what will be a first? Ca... \n",
"0 That's exciting! It's great to see diversity a... \n",
"\n",
" black_response \\\n",
"0 That's great to hear! It's always exciting to ... \n",
"0 That's great to hear! It's always exciting whe... \n",
"\n",
" hispanic_response \\\n",
"0 That's great news! Diversity and inclusivity i... \n",
"0 That's great to hear! The inclusion of Hispani... \n",
"\n",
" asian_response \n",
"0 Indeed, the Asian casting will mark a signific... "
"0 That's great to hear! Diversity and representa... "
]
},
"execution_count": 10,
@@ -623,6 +623,7 @@
"\n",
" - `texts1` - (**List of strings**) A list of generated output from an LLM with mention of a protected attribute group.\n",
" - `texts2` - (**List of strings**) A list of equal length to `texts1` containing counterfactually generated output from an LLM with mention of a different protected attribute group.\n",
" - `return_data` - (**bool, default=False**) Indicates whether to include response-level counterfactual scores in results dictionary returned by this method.\n",
"\n",
" Returns:\n",
" - A dictionary containing all Counterfactual metric values (**dict**)."
@@ -651,35 +652,35 @@
"output_type": "stream",
"text": [
"1. white-black\n",
"\t- Cosine Similarity : 0.40690\n",
"\t- RougeL Similarity : 0.14930\n",
"\t- Bleu Similarity : 0.04831\n",
"\t- Sentiment Bias : 0.01677\n",
"\t- Cosine Similarity : 0.52405\n",
"\t- RougeL Similarity : 0.26143\n",
"\t- Bleu Similarity : 0.11349\n",
"\t- Sentiment Bias : 0.02045\n",
"2. white-asian\n",
"\t- Cosine Similarity : 0.36904\n",
"\t- RougeL Similarity : 0.15092\n",
"\t- Bleu Similarity : 0.04156\n",
"\t- Sentiment Bias : 0.01226\n",
"\t- Cosine Similarity : 0.51903\n",
"\t- RougeL Similarity : 0.25024\n",
"\t- Bleu Similarity : 0.10622\n",
"\t- Sentiment Bias : 0.02540\n",
"3. white-hispanic\n",
"\t- Cosine Similarity : 0.40441\n",
"\t- RougeL Similarity : 0.15000\n",
"\t- Bleu Similarity : 0.04974\n",
"\t- Sentiment Bias : 0.02372\n",
"\t- Cosine Similarity : 0.51375\n",
"\t- RougeL Similarity : 0.26624\n",
"\t- Bleu Similarity : 0.11991\n",
"\t- Sentiment Bias : 0.01062\n",
"4. black-asian\n",
"\t- Cosine Similarity : 0.38779\n",
"\t- RougeL Similarity : 0.14893\n",
"\t- Bleu Similarity : 0.04764\n",
"\t- Sentiment Bias : 0.00757\n",
"\t- Cosine Similarity : 0.49728\n",
"\t- RougeL Similarity : 0.28346\n",
"\t- Bleu Similarity : 0.13336\n",
"\t- Sentiment Bias : 0.02770\n",
"5. black-hispanic\n",
"\t- Cosine Similarity : 0.40876\n",
"\t- RougeL Similarity : 0.16009\n",
"\t- Bleu Similarity : 0.05369\n",
"\t- Sentiment Bias : 0.01615\n",
"\t- Cosine Similarity : 0.49226\n",
"\t- RougeL Similarity : 0.26678\n",
"\t- Bleu Similarity : 0.13220\n",
"\t- Sentiment Bias : 0.02677\n",
"6. asian-hispanic\n",
"\t- Cosine Similarity : 0.35655\n",
"\t- RougeL Similarity : 0.16174\n",
"\t- Bleu Similarity : 0.06154\n",
"\t- Sentiment Bias : 0.01745\n"
"\t- Cosine Similarity : 0.53258\n",
"\t- RougeL Similarity : 0.27101\n",
"\t- Bleu Similarity : 0.12291\n",
"\t- Sentiment Bias : 0.03391\n"
]
}
],
@@ -688,7 +689,8 @@
"keys_, count = [], 1\n",
"for group1, group2 in combinations(['white','black','asian','hispanic'], 2):\n",
" keys_.append(f\"{group1}-{group2}\")\n",
" similarity_values[keys_[-1]] = counterfactual.evaluate(race_eval_df[group1 + '_response'],race_eval_df[group2 + '_response'], attribute=\"race\")\n",
" result = counterfactual.evaluate(race_eval_df[group1 + '_response'],race_eval_df[group2 + '_response'], attribute=\"race\")\n",
" similarity_values[keys_[-1]] = result['metrics']\n",
" print(f\"{count}. {group1}-{group2}\")\n",
" for key_ in similarity_values[keys_[-1]]:\n",
" print(f\"\\t- \", key_, \": {:1.5f}\".format(similarity_values[keys_[-1]][key_]))\n",
@@ -1078,15 +1080,15 @@
],
"metadata": {
"environment": {
"kernel": "langfair-test",
"kernel": "langchain",
"name": "workbench-notebooks.m125",
"type": "gcloud",
"uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/workbench-notebooks:m125"
},
"kernelspec": {
"display_name": "langfair-test",
"display_name": "langchain",
"language": "python",
"name": "langfair-test"
"name": "langchain"
},
"language_info": {
"codemirror_mode": {
@@ -1098,7 +1100,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.20"
"version": "3.11.10"
}
},
"nbformat": 4,
36 changes: 27 additions & 9 deletions langfair/metrics/counterfactual/counterfactual.py
@@ -12,18 +12,20 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Union
from typing import Any, Dict, Union

import numpy as np

from langfair.generator.counterfactual import CounterfactualGenerator
from langfair.metrics.counterfactual import metrics
from langfair.metrics.counterfactual.metrics.baseclass.metrics import Metric

MetricType = Union[list[str], list[Metric]]
DefaultMetricObjects = {
"Cosine": metrics.CosineSimilarity(transformer="all-MiniLM-L6-v2"),
"Rougel": metrics.RougelSimilarity(),
"Bleu": metrics.BleuSimilarity(),
"Sentiment Bias": metrics.SentimentBias(),
"Cosine": metrics.CosineSimilarity(transformer="all-MiniLM-L6-v2", how='pairwise'),
"Rougel": metrics.RougelSimilarity(how='pairwise'),
"Bleu": metrics.BleuSimilarity(how='pairwise'),
"Sentiment Bias": metrics.SentimentBias(how='pairwise'),
}
DefaultMetricNames = list(DefaultMetricObjects.keys())

@@ -57,7 +59,13 @@ def __init__(
if self.neutralize_tokens:
self.cf_generator = CounterfactualGenerator()

def evaluate(self, texts1: list, texts2: list, attribute: str = None):
def evaluate(
self,
texts1: list,
texts2: list,
attribute: str = None,
return_data: bool = False
) -> Dict[str, Any]:
"""
This method evaluates the counterfactual metric values for the provided pair of texts.
@@ -75,6 +83,9 @@ def evaluate(self, texts1: list, texts2: list, attribute: str = None):
attribute : {'gender', 'race'}, default='gender'
Specifies whether to use race or gender for neutralization
return_data : bool, default=False
Indicates whether to include response-level counterfactual scores in the results dictionary returned by this method.
Returns
-------
dict
@@ -97,19 +108,26 @@
texts=texts2, attribute=attribute
)
metric_values = {}
response_scores = {"texts1": texts1, "texts2": texts2}
for metric in self.metrics:
if (
metric.name in ["Bleu Similarity", "RougeL Similarity"]
and self.neutralize_tokens
):
metric_values[metric.name] = metric.evaluate(
scores = metric.evaluate(
texts1=masked_texts1, texts2=masked_texts2
)
else:
metric_values[metric.name] = metric.evaluate(
scores = metric.evaluate(
texts1=texts1, texts2=texts2
)
return metric_values
response_scores[metric.name] = scores
metric_values[metric.name] = np.mean(scores)

result = {"metrics": metric_values}
if return_data:
result["data"] = response_scores
return result

def _default_instances(self):
"""Define default metrics."""
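
One follow-on use of the new return value, beyond the scope of this commit: because result["data"] holds the paired texts alongside one score list per metric, it can be loaded into a DataFrame for response-level inspection. A sketch, assuming pandas is available, that `result` came from an earlier evaluate(..., return_data=True) call with plain lists as inputs, and that all lists share the same length as the implementation above implies:

import pandas as pd

score_df = pd.DataFrame(result["data"])
score_df.sort_values("Cosine Similarity").head()   # least similar response pairs first
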
