# added creative writing category to category.py and config (#3584)
Showing 4 changed files with 177 additions and 0 deletions.
@@ -0,0 +1,60 @@
## Download dataset
We have pre-generated several category classifier benchmarks and ground truths. You can download them (with [`git-lfs`](https://git-lfs.com) installed) into the `classify/` directory by running
```console
> git clone https://huggingface.co/datasets/lmarena-ai/categories-benchmark-eval
# cd into classify/ and then copy the label_bench directory to the current directory
> cp -r categories-benchmark-eval/label_bench .
```
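
If `git-lfs` was not set up before cloning, the JSON files may come down as small pointer stubs instead of real data. A quick, optional sanity check (a minimal sketch assuming Python with pandas; it reads the same path that `display_score.py` uses):
```python
import os

import pandas as pd

# Load one of the pre-generated ground-truth files and peek at it.
# "creative_writing_bench" is just an example; any downloaded bench works.
test = pd.read_json(os.path.join("label_bench", "creative_writing_bench", "test.json"))
print(f"{len(test)} ground-truth rows")
print(test.head())
```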
Your `label_bench` directory should follow the structure:
```markdown
├── label_bench/
│   ├── creative_writing_bench/
│   │   ├── data/
│   │   │   └── llama-v3p1-70b-instruct.json
│   │   └── test.json
│   ├── ...
│   ├── your_bench_name/
│   │   ├── data/
│   │   │   ├── your_classifier_data_1.json
│   │   │   ├── your_classifier_data_2.json
│   │   │   └── ...
│   │   └── test.json (your ground truth)
└── ...
```
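
The ground-truth `test.json` is read directly by `display_score.py` (shown further below), which expects a `question_id`, a boolean `label`, and a `conversation_a` list whose first message carries the prompt text. A minimal sketch of one record, with made-up values for illustration:
```python
# Hypothetical ground-truth record for label_bench/your_bench_name/test.json.
# Field names are taken from how display_score.py reads the file; the values are invented.
example_ground_truth_row = {
    "question_id": "abc123",
    "label": True,  # whether the prompt truly belongs to your category
    "conversation_a": [
        {"role": "user", "content": "Write a short poem about the sea."},
    ],
}
```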
## How to evaluate your category classifier?

To test your classifier for a new category, first make sure you have created the corresponding category child class in `category.py`. Then, to generate classification labels, make the necessary edits in `config.yaml` and run
```console
python label.py --config config.yaml --testing
```
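
Whatever your category class does internally, the labels that end up in `label_bench/<your_bench>/data/*.json` must be readable by `display_score.py` below: it looks up `category_tag[<tag_version>][<tag_field>]` per `question_id` and reports the `model` column. A sketch of one output record using the creative-writing tags from `tag_names` (the values are made up):
```python
# Hypothetical classifier output record; field names mirror what display_score.py reads.
example_label_row = {
    "question_id": "abc123",
    "model": "llama-v3p1-70b-instruct",
    "category_tag": {
        "creative_writing_v0.1": {"creative_writing": True},
    },
}
```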
Then, add your new category bench to `tag_names` in `display_score.py`. After making sure that you also have a correctly formatted ground-truth `test.json` under `label_bench/<your_bench>/`, you can report the performance of your classifier by running
```console
python display_score.py --bench <your_bench>
```
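
The `tag_names` dictionary in `display_score.py` (reproduced further below) maps each bench name to the tag version and tag field written during labeling, so registering a new bench is a one-line addition. The names below are placeholders; they must match whatever your category class writes into `category_tag`:
```python
tag_names = {
    "creative_writing_bench": ("creative_writing_v0.1", "creative_writing"),  # existing entry
    "your_bench_name": ("your_category_v0.1", "your_category"),  # placeholder new entry
}
```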
If you want to inspect conflicts between your classifier's predictions and the ground truth, use
```console
python display_score.py --bench <your_bench> --display-conflict
```
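
The conflict report simply prints the two disagreement sets between the ground-truth `label` column and the predicted `pred` column. The selection logic in `display_score.py` amounts to the boolean masks below (a simplified sketch with a toy frame):
```python
import pandas as pd

# `test` mirrors the frame built in display_score.py: one boolean label and one
# boolean prediction per question.
test = pd.DataFrame({"label": [True, True, False], "pred": [True, False, True]})

false_negatives = test[test.label & ~test.pred]  # Ground Truth = True; Pred = False
false_positives = test[~test.label & test.pred]  # Ground Truth = False; Pred = True
print(len(false_negatives), "false negatives;", len(false_positives), "false positives")
```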
Example output:
```console
> python display_score.py --bench if_bench --display-conflict
Model: gpt-4o-mini-2024-07-18
Accuracy: 0.967
Precision: 0.684
Recall: 0.918

###### CONFLICT ######

Ground Truth = True; Pred = False
####################
...

Ground Truth = False; Pred = True
####################
...
```
@@ -0,0 +1,76 @@
import argparse
import os
from glob import glob

import pandas as pd
from sklearn.metrics import precision_score, recall_score

# Maps each benchmark name to (tag version, tag field) as written by label.py.
tag_names = {
    "if_bench": ("if_v0.1", "if"),
    "math_bench": ("math_v0.1", "math"),
    "hard_bench": ("criteria_v0.1", "hard"),
    "creative_writing_bench": ("creative_writing_v0.1", "creative_writing"),
}


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--bench", type=str, default="if_bench")
    parser.add_argument("--display-conflict", action="store_true")
    args = parser.parse_args()
    assert args.bench in tag_names, "Not a valid bench argument, add bench if needed."

    # Ground truth: one row per question with a boolean `label` column.
    test = pd.read_json(os.path.join("label_bench", args.bench, "test.json"))

    # Score every classifier output file found under the bench's data/ directory.
    for file in glob(os.path.join("label_bench", args.bench, "data", "*.json")):
        output = pd.read_json(file)

        # Index the classifier's category tags by question_id for fast lookup.
        tag_map = (
            output[["question_id", "category_tag"]]
            .set_index("question_id")
            .to_dict("index")
        )

        # Extract the boolean prediction for this bench's tag from the nested dict.
        tag_1, tag_2 = tag_names[args.bench]
        test["pred"] = test.question_id.map(
            lambda qid: tag_map[qid]["category_tag"][tag_1][tag_2]
        )

        accuracy = (test.label == test.pred).mean()
        recall = recall_score(y_pred=test.pred, y_true=test.label)
        precision = precision_score(y_pred=test.pred, y_true=test.label)

        print(f"Model: {output.model[0]}")
        print(f"Accuracy: {round(accuracy, 3)}")
        print(f"Precision: {round(precision, 3)}")
        print(f"Recall: {round(recall, 3)}")

        if args.display_conflict:
            print()
            print("###### CONFLICT ######")
            print()

            # False negatives: ground truth says the category applies, classifier disagrees.
            # Prints 5 randomly sampled conflicting prompts.
            conflict = test[test.label & ~test.pred]
            print("Ground Truth = True; Pred = False")
            prompts = (
                conflict.conversation_a.map(lambda x: x[0]["content"])
                .sample(n=5)
                .tolist()
            )
            for prompt in prompts:
                print("####################")
                print(prompt)
                print()
                print()

            # False positives: classifier flags the category, ground truth disagrees.
            conflict = test[~test.label & test.pred]
            print("Ground Truth = False; Pred = True")
            prompts = (
                conflict.conversation_a.map(lambda x: x[0]["content"])
                .sample(n=5)
                .tolist()
            )
            for prompt in prompts:
                print("####################")
                print(prompt)
                print()
                print()
            print()