diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
new file mode 100644
index 0000000..09cf864
--- /dev/null
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -0,0 +1,24 @@
+## Description
+
+
+
+
+## Contributor License Agreement
+
+- [ ] confirm you have signed the [LangFair CLA](https://forms.office.com/pages/responsepage.aspx?id=uGG7-v46dU65NKR_eCuM1xbiih2MIwxBuRvO0D_wqVFUMlFIVFdYVFozN1BJVjVBRUdMUUY5UU9QRS4u&route=shorturl)
+
+## Tests
+
+- [ ] no new tests required
+- [ ] new tests added
+- [ ] existing tests adjusted
+
+## Documentation
+
+- [ ] no documentation changes needed
+- [ ] README updated
+- [ ] API docs added or updated
+- [ ] example notebook added or updated
+
+## Screenshots
+
\ No newline at end of file
diff --git a/README.md b/README.md
index 8bba95a..43d8725 100644
--- a/README.md
+++ b/README.md
@@ -128,7 +128,7 @@ auto_object = AutoEval(
 )
 results = await auto_object.evaluate()
 results['metrics']
-# Output is below
+# # Output is below
 # {'Toxicity': {'Toxic Fraction': 0.0004,
 #  'Expected Maximum Toxicity': 0.013845130120171235,
 #  'Toxicity Probability': 0.01},
@@ -199,7 +199,7 @@ Bias and fairness metrics offered by LangFair are grouped into several categorie
 
 ## 📖 Associated Research
 
-A technical description of LangFair's evaluation metrics and a practitioner's guide for selecting evaluation metrics is contained in **[this paper](https://arxiv.org/abs/2407.10853)**. If you use our framework for selecting evaluation metrics, we would appreciate citations to the following paper:
+A technical description and a practitioner's guide for selecting evaluation metrics are contained in **[this paper](https://arxiv.org/abs/2407.10853)**. If you use our evaluation approach, we would appreciate citations to the following paper:
 
 ```bibtex
 @misc{bouchard2024actionableframeworkassessingbias,
@@ -213,6 +213,20 @@ A technical description of LangFair's evaluation metrics and a practitioner's gu
 }
 ```
 
+A high-level description of LangFair's functionality is contained in **[this paper](https://arxiv.org/abs/2501.03112)**. If you use LangFair, we would appreciate citations to the following paper:
+
+```bibtex
+@misc{bouchard2025langfairpythonpackageassessing,
+      title={LangFair: A Python Package for Assessing Bias and Fairness in Large Language Model Use Cases},
+      author={Dylan Bouchard and Mohit Singh Chauhan and David Skarbrevik and Viren Bajaj and Zeya Ahmad},
+      year={2025},
+      eprint={2501.03112},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2501.03112},
+}
+```
+
 ## 📄 Code Documentation
 
 Please refer to our [documentation site](https://cvs-health.github.io/langfair/) for more details on how to use LangFair.
diff --git a/examples/evaluations/text_generation/auto_eval_demo.ipynb b/examples/evaluations/text_generation/auto_eval_demo.ipynb
index 66fb79d..e091d1d 100644
--- a/examples/evaluations/text_generation/auto_eval_demo.ipynb
+++ b/examples/evaluations/text_generation/auto_eval_demo.ipynb
@@ -11,7 +11,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "This notebook demonstrate the implementation of `AutoEval` class. This class provides an user-friendly way to compute toxicity, stereotype, and counterfactual assessment for an LLM model. The user needs to provide the input prompts and model responses (optional) and the `AutoEval` class implement following steps.\n",
+    "This notebook demonstrates the implementation of the `AutoEval` class. This class provides a user-friendly way to compute toxicity, stereotype, and counterfactual assessments for an LLM use case. The user needs to provide the input prompts and a `langchain` LLM, and the `AutoEval` class implements the following steps.\n",
     "\n",
     "1. Check Fairness Through Awareness (FTU)\n",
     "2. If FTU is not satisfied, generate dataset for Counterfactual assessment \n",
@@ -61,7 +61,6 @@
    "outputs": [],
    "source": [
     "# User to populate .env file with API credentials\n",
-    "repo_path = '/'.join(os.getcwd().split('/')[:-3])\n",
     "load_dotenv(find_dotenv())\n",
     "\n",
     "API_KEY = os.getenv('API_KEY')\n",
diff --git a/langfair/generator/counterfactual.py b/langfair/generator/counterfactual.py
index 6803002..15363fc 100644
--- a/langfair/generator/counterfactual.py
+++ b/langfair/generator/counterfactual.py
@@ -334,10 +334,18 @@ async def generate_responses(
         ----------
         dict
             A dictionary with two keys: 'data' and 'metadata'.
+
             'data' : dict
                 A dictionary containing the prompts and responses.
+
+                'prompt' : list
+                    A list of prompts.
+                'response' : list
+                    A list of responses corresponding to the prompts.
+
             'metadata' : dict
                 A dictionary containing metadata about the generation process.
+
                 'non_completion_rate' : float
                     The rate at which the generation process did not complete.
                 'temperature' : float
@@ -433,16 +441,22 @@ def check_ftu(
         -------
         dict
             A dictionary with two keys: 'data' and 'metadata'.
+
             'data' : dict
-                A dictionary containing the prompts and responses.
+                A dictionary containing the prompts and the attribute words they contain.
+
                 'prompt' : list
                     A list of prompts.
+
                 'attribute_words' : list
                     A list of attribute_words in each prompt.
+
             'metadata' : dict
                 A dictionary containing metadata related to FTU.
+
                 'ftu_satisfied' : boolean
                     Boolean indicator of whether or not prompts satisfy FTU
+
                 'filtered_prompt_count' : int
                     The number of prompts that satisfy FTU.
         """
diff --git a/langfair/generator/generator.py b/langfair/generator/generator.py
index 7393eff..72b7e8f 100644
--- a/langfair/generator/generator.py
+++ b/langfair/generator/generator.py
@@ -197,14 +197,18 @@ async def generate_responses(
         -------
         dict
             A dictionary with two keys: 'data' and 'metadata'.
+
             'data' : dict
                 A dictionary containing the prompts and responses.
+
                 'prompt' : list
                     A list of prompts.
                 'response' : list
                     A list of responses corresponding to the prompts.
+
             'metadata' : dict
                 A dictionary containing metadata about the generation process.
+
                 'non_completion_rate' : float
                     The rate at which the generation process did not complete.
                 'temperature' : float
diff --git a/langfair/metrics/classification/metrics/baseclass/metrics.py b/langfair/metrics/classification/metrics/baseclass/metrics.py
index fe073ce..3882a7f 100644
--- a/langfair/metrics/classification/metrics/baseclass/metrics.py
+++ b/langfair/metrics/classification/metrics/baseclass/metrics.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 from abc import ABC, abstractmethod
-from typing import Optional
+from typing import Optional, List
 
 from numpy.typing import ArrayLike
 
@@ -38,7 +38,24 @@ def evaluate(
         pass
 
     @staticmethod
-    def binary_confusion_matrix(y_true, y_pred):
+    def binary_confusion_matrix(y_true, y_pred) -> List[List[float]]:
+        """
+        Method for computing binary confusion matrix
+
+        Parameters
+        ----------
+        y_true : Array-like
+            Binary labels (ground truth values)
+
+        y_pred : Array-like
+            Binary model predictions
+
+        Returns
+        -------
+        List[List[float]]
+            2x2 confusion matrix
+
+        """
         cm = [[0, 0], [0, 0]]
         for i in range(len(y_pred)):
             if y_pred[i] == y_true[i]:
diff --git a/poetry.lock b/poetry.lock
index 6264f9d..5aab03a 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1010,13 +1010,13 @@ files = [
 
 [[package]]
 name = "jinja2"
-version = "3.1.4"
+version = "3.1.5"
 description = "A very fast and expressive template engine."
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d"},
-    {file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"},
+    {file = "jinja2-3.1.5-py3-none-any.whl", hash = "sha256:aba0f4dc9ed8013c424088f68a5c226f7d6097ed89b246d7749c2ec4175c6adb"},
+    {file = "jinja2-3.1.5.tar.gz", hash = "sha256:8fefff8dc3034e27bb80d67c671eb8a9bc424c0ef4c0826edbff304cceff43bb"},
 ]
 
 [package.dependencies]
diff --git a/pyproject.toml b/pyproject.toml
index e1832f4..d44a206 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "langfair"
-version = "0.3.1"
+version = "0.3.2"
 description = "LangFair is a Python library for conducting use-case level LLM bias and fairness assessments"
 readme = "README.md"
 authors = ["Dylan Bouchard ",