Commit b691308

Fixes: Factual Accuracy and Response Completeness Tests and Add json5 to dependencies (#661)

* Fix response consistency test

* Fix factual accuracy explanation

* Add json5 dependency
Dominastorm authored Mar 20, 2024
1 parent 8d525e7 commit b691308
Showing 3 changed files with 9 additions and 16 deletions.
pyproject.toml (5 changes: 3 additions & 2 deletions)
@@ -3,7 +3,7 @@ requires = ["setuptools", "wheel"]
 
 [project]
 name = "uptrain"
-version = "0.6.8"
+version = "0.6.9"
 description = "UpTrain - tool to evaluate LLM applications on aspects like factual accuracy, response quality, retrieval quality, tonality, etc."
 readme = "README.md"
 maintainers = [{ name = "UpTrain AI Team", email = "[email protected]" }]
@@ -33,7 +33,8 @@ dependencies = [
     "openai>=1.6.1",
     "fsspec",
     "litellm",
-    "pyyaml"
+    "pyyaml",
+    "json5"
 ]
 
 [project.urls]
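A likely reason for the new json5 dependency is that LLM responses often come back as relaxed JSON (unquoted keys, trailing commas) that the standard json module rejects, while json5 parses that superset. That motivation is an inference from the factual-accuracy change below, not something the commit states. A minimal sketch using the json5 PyPI package:

import json
import json5

# Relaxed JSON of the kind an LLM might emit: unquoted keys, trailing commas.
payload = '{Result: [{Judgement: "yes"},],}'

try:
    json.loads(payload)  # the strict stdlib parser rejects this
except json.JSONDecodeError as err:
    print(f"json rejected it: {err}")

parsed = json5.loads(payload)  # json5 accepts the relaxed syntax
print(parsed["Result"][0]["Judgement"])  # -> yes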
tests/test_builtins.py (15 changes: 4 additions & 11 deletions)
@@ -164,25 +164,18 @@ def test_check_response_consistency():
     assert isinstance(output, pl.DataFrame)
     assert (
         "score_response_consistency" in output.columns
-        and "argument_response_consistency" in output.columns
+        and "explanation_response_consistency" in output.columns
     )
     assert (
         output["score_response_consistency"].dtype == pl.Float64
         and len(output["score_response_consistency"])
         - output["score_response_consistency"].null_count() > 0
     )
     assert (
-        output["argument_response_consistency"].dtype == pl.Utf8
-        and len(output["argument_response_consistency"])
-        - output["argument_response_consistency"].null_count() > 0
+        output["explanation_response_consistency"].dtype == pl.Utf8
+        and len(output["explanation_response_consistency"])
+        - output["explanation_response_consistency"].null_count() > 0
     )
-    if settings.eval_type == "cot":
-        assert "reasoning_response_consistency" in output.columns
-        assert (
-            output["reasoning_response_consistency"].dtype == pl.Utf8
-            and len(output["reasoning_response_consistency"])
-            - output["reasoning_response_consistency"].null_count() > 0
-        )
 
 
 response_matching_dataset = pl.DataFrame(
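The rewritten assertions all follow one pattern: the column exists, has the expected dtype, and holds at least one non-null value (length minus null count). A self-contained illustration of that check, with a made-up frame standing in for the evaluation output:

import polars as pl

# Hypothetical evaluation output; one row failed and produced a null.
output = pl.DataFrame(
    {"explanation_response_consistency": ["responses agree on the answer", None]}
)

col = output["explanation_response_consistency"]
# len(col) - col.null_count() counts the non-null entries.
assert col.dtype == pl.Utf8
assert len(col) - col.null_count() > 0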
uptrain/operators/language/factual_accuracy.py (5 changes: 2 additions & 3 deletions)
@@ -230,11 +230,10 @@ def evaluate_local(self, data):
                 "explanation_factual_accuracy": None,
             }
             try:
-                result = parse_json(res.response.choices[0].message.content)["Result"]
-                judgements = [x["Judgement"] for x in result]
+                judgements = [x["Judgement"] for x in parse_json(res.response.choices[0].message.content)["Result"]]
                 score = np.mean([self.score_mapping[x.lower()] for x in judgements])
                 output["score_factual_accuracy"] = float(score)
-                output["explanation_factual_accuracy"] = res.response.choices[0].message.content
+                output["explanation_factual_accuracy"] = res.response.choices[0].message.content
             except Exception:
                 logger.error(
                     f"Error when processing payload at index {idx}: {res.error}"
                 )
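The change keeps the raw model response as the explanation (the parsed "Result" list was stored before) and folds the parse into the judgement comprehension. The scoring step averages per-claim judgements through score_mapping; a minimal sketch of that logic, with a hypothetical mapping and response, since the real parse_json helper and prompt format live elsewhere in the UpTrain codebase:

import numpy as np
import json5

# Hypothetical judgement-to-score table; the operator's actual mapping may differ.
score_mapping = {"yes": 1.0, "unclear": 0.5, "no": 0.0}

# Example model output: one judgement per extracted fact.
content = '{"Result": [{"Judgement": "yes"}, {"Judgement": "no"}, {"Judgement": "yes"}]}'

judgements = [x["Judgement"] for x in json5.loads(content)["Result"]]
score = float(np.mean([score_mapping[x.lower()] for x in judgements]))

print(score)            # 0.666...
explanation = content   # the raw response is kept as the explanation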
