added creative writing category to category.py and config (#3584)
Y-A-Song authored Oct 16, 2024
1 parent dd90e21 commit 7ca0d43
Showing 4 changed files with 177 additions and 0 deletions.
60 changes: 60 additions & 0 deletions fastchat/serve/monitor/classify/README.md
@@ -0,0 +1,60 @@
## Download dataset
We have pre-generated several category classifier benchmarks and ground truths. You can download them (with [`git-lfs`](https://git-lfs.com) installed) to the directory `classify/` by running
```console
> git clone https://huggingface.co/datasets/lmarena-ai/categories-benchmark-eval
# cd into classify/, then copy the label_bench directory into it
> cp -r categories-benchmark-eval/label_bench .
```
Your `label_bench` directory should follow this structure:
```markdown
├── label_bench/
│ ├── creative_writing_bench/
│ │ ├── data/
│ │ │ └── llama-v3p1-70b-instruct.json
│ │ └── test.json
│ ├── ...
│ ├── your_bench_name/
│ │ ├── data/
│ │ │ ├── your_classifier_data_1.json
│ │ │ ├── your_classifier_data_2.json
│ │ │ └── ...
│ │ └── test.json (your ground truth)
└── ...
```

## How to evaluate your category classifier

To evaluate a classifier for a new category, first make sure you have created the corresponding category child class in `category.py` (a minimal skeleton is sketched right after the command below). Then, to generate classification labels, make the necessary edits in `config.yaml` and run
```console
python label.py --config config.yaml --testing
```
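
For orientation, here is a minimal sketch of such a child class, modeled on the `CategoryCreativeWriting` class added to `category.py` in this commit. The `CategoryYourBench` class name and the `your_bench_v0.1`/`your_bench` tags are placeholders, and the real system prompt would contain your judging instructions; remember to also register the new class in `create_category`.
```python
import re

# Hypothetical example: "CategoryYourBench" and the "your_bench_v0.1"/"your_bench" tags
# are placeholders. The class is meant to live in category.py next to the existing
# subclasses, so the Category base class is already in scope there.
class CategoryYourBench(Category):
    def __init__(self):
        super().__init__()
        self.name_tag = "your_bench_v0.1"
        self.pattern = re.compile(r"<decision>(\w+)<\/decision>")
        self.system_prompt = "..."  # judge instructions ending in a <decision>yes/no</decision> verdict
        self.prompt_template = "<user_prompt>\n{PROMPT}\n</user_prompt>"

    def get_score(self, judgment):
        # Accept the verdict only when there is exactly one distinct <decision> match.
        matches = self.pattern.findall(
            judgment.replace("\n", "").replace("[", "").replace("]", "").replace(" ", "").lower()
        )
        matches = [m for m in matches if m != ""]
        return matches[0] if len(set(matches)) == 1 else None

    def pre_process(self, prompt):
        return [
            {"role": "system", "content": self.system_prompt},
            {"role": "user", "content": self.prompt_template.format(PROMPT=prompt)},
        ]

    def post_process(self, judgment):
        score = self.get_score(judgment=judgment)
        return {"your_bench": bool(score == "yes") if score else False}
```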

Next, add your new category bench to `tag_names` in `display_score.py`. Once you also have a correctly formatted ground-truth JSON file (the expected record shapes are sketched right after the command below), you can report your classifier's performance by running
```console
python display_score.py --bench <your_bench>
```
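
The expected file formats are not spelled out here, but judging from how `display_score.py` reads them, `test.json` needs `question_id`, `label`, and `conversation_a` fields, and each classifier output file under `data/` needs `question_id`, `model`, and a nested `category_tag` entry. A hedged sketch with made-up values:
```python
import json

# Illustrative records only: the question_id, prompt, and model name are made up.
# Field names are inferred from display_score.py, which maps each question_id to
# category_tag[<category name_tag>][<output key>] and compares it with `label`.
test_record = {
    "question_id": "q-000001",
    "label": True,  # ground-truth membership in the category
    "conversation_a": [{"role": "user", "content": "Write a short poem about rain."}],
}

classifier_record = {
    "question_id": "q-000001",
    "model": "llama-v3p1-70b-instruct",
    "category_tag": {"creative_writing_v0.1": {"creative_writing": True}},
}

# Both files are JSON arrays of such records, so pandas can load them as DataFrames.
with open("example_test.json", "w") as f:
    json.dump([test_record], f, indent=2)
```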

To inspect conflicts between your classifier's predictions and the ground truth, use
```console
python display_score.py --bench <your_bench> --display-conflict
```

Example output:
```console
> python display_score.py --bench if_bench --display-conflict
Model: gpt-4o-mini-2024-07-18
Accuracy: 0.967
Precision: 0.684
Recall: 0.918

###### CONFLICT ######

Ground Truth = True; Pred = False
####################
...

Ground Truth = False; Pred = True
####################
...
```

40 changes: 40 additions & 0 deletions fastchat/serve/monitor/classify/category.py
@@ -24,6 +24,8 @@ def create_category(name):
return CategoryIF()
elif name == "math_v0.1":
return CategoryMath()
elif name == "creative_writing_v0.1":
return CategoryCreativeWriting()

raise Exception(f"Category name is incorrect: {name}")

@@ -134,3 +136,41 @@ def pre_process(self, prompt):
def post_process(self, judgment):
score = self.get_score(judgment=judgment)
return {"math": bool(score == "yes") if score else False}


class CategoryCreativeWriting(Category):
def __init__(self):
super().__init__()
self.name_tag = "creative_writing_v0.1"
self.pattern = re.compile(r"<decision>(\w+)<\/decision>")
self.system_prompt = 'You are tasked with determining whether a given user prompt is asking for creative writing. Creative writing is defined as any form of writing that goes beyond standard professional, journalistic, academic, or technical literature. It typically involves imagination, originality, and expression of thoughts and emotions. Creative writing can include, but is not limited to, the following formats:\n- Fiction (e.g., short stories, novels)\n- Poetry (e.g., sonnets, free verse)\n- Dramatic writing (e.g., screenplays, monologues, scripts)\n- Personal essays (focusing on subjective experiences or narrative storytelling)\n- Songs and lyrics\n\nCarefully analyze the user prompt and consider whether it primarily requires creative writing. Think about the following aspects:\n1. Does the prompt ask for fictional content, speculative scenarios, or the use of imagination to construct narratives?\n2. Does it encourage the expression of thoughts, emotions, or personal experiences beyond mere factual reporting or analysis?\n3. Is it asking for writing in a specific creative format (e.g., story, poem, script, etc)?\n4. Is the primary purpose of the prompt to foster creative expression or originality rather than information delivery, technical documentation, or analytical reasoning?\n5. Does the prompt request stylistic or rhetorical elements often associated with creative writing, such as metaphor, imagery, dialogue, etc?\n6. Does the prompt expect a response in natural language (e.g., sentences, paragraphs) rather than visual, mathematical, or non-linguistic output?\n\nOutput your verdict as either "yes" or "no"in the following format:\n<decision>\n[yes/no]\n</decision>. Do NOT explain.'
self.prompt_template = "<user_prompt>\n{PROMPT}\n</user_prompt>"

def get_score(self, judgment):
matches = self.pattern.findall(
judgment.replace("\n", "")
.replace("[", "")
.replace("]", "")
.replace(" ", "")
.lower()
)
matches = [m for m in matches if m != ""]
if len(set(matches)) == 0:
return None
elif len(set(matches)) == 1:
return matches[0]
else:
return None

def pre_process(self, prompt):
args = {"PROMPT": prompt}
conv = [
{"role": "system", "content": self.system_prompt},
{"role": "user", "content": self.prompt_template.format(**args)},
]
return conv

def post_process(self, judgment):
score = self.get_score(judgment=judgment)
bool_score = bool(score == "yes") if score else False
return {"creative_writing": bool_score, "score": score}
1 change: 1 addition & 0 deletions fastchat/serve/monitor/classify/config.yaml
@@ -10,6 +10,7 @@ task_name:
- criteria_v0.1
- if_v0.1
- math_v0.1
- creative_writing_v0.1

model_name: null
name: llama-3-70b-instruct
76 changes: 76 additions & 0 deletions fastchat/serve/monitor/classify/display_score.py
@@ -0,0 +1,76 @@
import pandas as pd
import argparse
import os
from glob import glob
from sklearn.metrics import recall_score, precision_score

tag_names = {
"if_bench": ("if_v0.1", "if"),
"math_bench": ("math_v0.1", "math"),
"hard_bench": ("criteria_v0.1", "hard"),
"creative_writing_bench": ("creative_writing_v0.1", "creative_writing"),
}


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--bench", type=str, default="if_bench")
parser.add_argument("--display-conflict", action="store_true")
args = parser.parse_args()
assert args.bench in tag_names, "Not valid bench argument, add bench if needed."

test = pd.read_json(os.path.join("label_bench", args.bench, "test.json"))

for file in glob(os.path.join("label_bench", args.bench, "data", "*.json")):
output = pd.read_json(file)

tag_map = (
output[["question_id", "category_tag"]]
.set_index("question_id")
.to_dict("index")
)

tag_1, tag_2 = tag_names[args.bench]
test["pred"] = test.question_id.map(
lambda id: tag_map[id]["category_tag"][tag_1][tag_2]
)

accuracy = (test.label == test.pred).mean()
recall = recall_score(y_pred=test.pred, y_true=test.label)
precision = precision_score(y_pred=test.pred, y_true=test.label)

print(f"Model: {output.model[0]}")
print(f"Accuracy: {round(accuracy, 3)}")
print(f"Precision: {round(precision, 3)}")
print(f"Recall: {round(recall, 3)}")

if args.display_conflict:
print()
print("###### CONFLICT ######")
print()
conflict = test[test.label & ~test.pred]
print("Ground Truth = True; Pred = False")
prompts = (
conflict.conversation_a.map(lambda x: x[0]["content"])
.sample(n=5)
.tolist()
)
for prompt in prompts:
print("####################")
print(prompt)
print()
print()

conflict = test[~test.label & test.pred]
print("Ground Truth = False; Pred = True")
prompts = (
conflict.conversation_a.map(lambda x: x[0]["content"])
.sample(n=5)
.tolist()
)
for prompt in prompts:
print("####################")
print(prompt)
print()
print()
print()
