-
Notifications
You must be signed in to change notification settings - Fork 4.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'main' into update_preset
- Loading branch information
Showing
12 changed files
with
244 additions
and
28 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
import pandas as pd | ||
import re | ||
import argparse | ||
|
||
from tqdm import tqdm | ||
|
||
tqdm.pandas() | ||
|
||
|
||
def count_markdown_elements(markdown_text, suffix): | ||
counters = { | ||
f"header_count{suffix}": { | ||
"h1": len(re.findall(r"^#{1}\s", markdown_text, re.MULTILINE)), | ||
"h2": len(re.findall(r"^#{2}\s", markdown_text, re.MULTILINE)), | ||
"h3": len(re.findall(r"^#{3}\s", markdown_text, re.MULTILINE)), | ||
"h4": len(re.findall(r"^#{4}\s", markdown_text, re.MULTILINE)), | ||
"h5": len(re.findall(r"^#{5}\s", markdown_text, re.MULTILINE)), | ||
"h6": len(re.findall(r"^#{6}\s", markdown_text, re.MULTILINE)), | ||
}, | ||
f"list_count{suffix}": { | ||
"ordered": len(re.findall(r"^\s*\d+\.\s", markdown_text, re.MULTILINE)), | ||
"unordered": len(re.findall(r"^\s*[-*+]\s", markdown_text, re.MULTILINE)), | ||
}, | ||
f"bold_count{suffix}": { | ||
"**": len(re.findall(r"\*\*[^*\n]+\*\*", markdown_text)), | ||
"__": len(re.findall(r"__[^_\n]+__", markdown_text)), | ||
}, | ||
} | ||
return counters | ||
|
||
|
||
def remove_pattern(answer, pattern): | ||
blocks = pattern.findall(answer) | ||
for block in blocks: | ||
answer = answer.replace(block, "") | ||
return answer | ||
|
||
|
||
def get_element_counts(df, column): | ||
pattern = re.compile("```([^`]*)```") | ||
answers = df[column].map( | ||
lambda convo: "\n".join( | ||
[turn["content"] for turn in convo if turn["role"] == "assistant"] | ||
) | ||
) | ||
results = answers.progress_map( | ||
lambda answer: count_markdown_elements( | ||
remove_pattern(answer, pattern), | ||
suffix=column[-2:], # Remove code block first | ||
) | ||
) | ||
|
||
return results.tolist() | ||
|
||
|
||
def add_markdown_meta(row): | ||
conv_meta = {k: v for k, v in row["conv_metadata"].items()} | ||
return conv_meta | row["markdown_meta_a"] | row["markdown_meta_b"] | ||
|
||
|
||
if __name__ == "__main__": | ||
parser = argparse.ArgumentParser() | ||
parser.add_argument("--input-file", type=str, required=True) | ||
parser.add_argument("--output-file", type=str, required=True) | ||
args = parser.parse_args() | ||
|
||
print("loading file...") | ||
data = pd.read_json(args.input_file) | ||
|
||
assert "conv_metadata" in data.columns | ||
|
||
temp = data[["question_id", "conv_metadata"]].copy() | ||
|
||
print("Processing conversation_a") | ||
temp["markdown_meta_a"] = get_element_counts(data, column="conversation_a") | ||
|
||
print("Processing conversation_b") | ||
temp["markdown_meta_b"] = get_element_counts(data, column="conversation_b") | ||
|
||
print("Post-processing...") | ||
data["conv_metadata"] = temp.apply(add_markdown_meta, axis=1) | ||
|
||
print("Saving to file...") | ||
data.to_json(args.output_file, orient="records", indent=4, force_ascii=False) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.