Commit 2669ec3

format

1 parent 02943c7 commit 2669ec3

12 files changed, +267 −128 lines changed
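Every hunk below is a formatting-only change: statements that overflow the line-length limit are wrapped, trailing commas are added to multi-line literals, operator spacing is normalized (e.g. "anony_"+row["type"] becomes "anony_" + row["type"]), and top-level definitions gain the two blank lines PEP 8 asks for. The changes are consistent with a run of the Black formatter, though the commit message only says "format", so the exact tool is an assumption. A few -/+ pairs below show identical text; those hunks apparently change only whitespace (trailing spaces or spacing before inline comments), which this plain-text rendering cannot display. A minimal sketch of the same rewrite, assuming the black package is available:

# Hedged sketch: feeding one of the long lines from clean_battle_data.py
# through Black's Python API reproduces the wrapping seen in this commit.
# Using `black` is an assumption; the commit does not name its formatter.
import black

src = (
    "def f(rows, remove_html):\n"
    "    for row in rows:\n"
    '        if not isinstance(row["models"][0], str) or not isinstance(row["models"][1], str):\n'
    "            continue\n"
)
# The overlong condition (past the default 88-column limit) is split across
# lines, exactly as in the diff below.
print(black.format_str(src, mode=black.Mode(line_length=88)))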

fastchat/serve/monitor/basic_stats.py

+1 −2

@@ -142,7 +142,7 @@ def report_basic_stats(log_files):
         height=300,
         width=1200,
     )
-
+
     # calculate conversion rate for each day (vote / chat)
     conversion_rate = {}
     for date in chat_dates_counts.index:
@@ -163,7 +163,6 @@ def report_basic_stats(log_files):
         width=1200,
     )
 
-
     # Model call counts
     model_hist_all = df_all[df_all["type"] == "chat"]["model"].value_counts()
     model_hist_1_day = df_1_day[df_1_day["type"] == "chat"]["model"].value_counts()

fastchat/serve/monitor/clean_battle_data.py

+66 −26

@@ -146,7 +146,7 @@ def replace_model_name(old_name, tstamp):
         "dumbledore-v3": "llama-3.2-vision-90b-instruct",
         "potter-v1": "llama-3.2-vision-11b-instruct",
         "sharp-game-player-v1": "llama-3.2-3b-instruct",
-        "zeus-flare-thunder-v1": "llama-3.2-1b-instruct",
+        "zeus-flare-thunder-v1": "llama-3.2-1b-instruct",
         "qwen-vl-max-0809": "qwen2-vl-72b",
         "gemini-1.5-pro-002-test-sp": "gemini-1.5-pro-002",
         "gemini-1.5-flash-test-5": "gemini-1.5-flash-002",
@@ -185,7 +185,7 @@ def replace_model_name(old_name, tstamp):
             return old_name
         else:
             return old_name + "-old"
-    if old_name == "eureka-chatbot":
+    if old_name == "eureka-chatbot":
         if tstamp > 1721651521:
             return "eureka-chatbot-v2"
         else:
@@ -293,12 +293,17 @@ def process_data_txt2img(
         if row["models"][0] is None or row["models"][1] is None:
             count_dict["invalid"] += 1
             continue
-        if not isinstance(row["models"][0], str) or not isinstance(row["models"][1], str):
+        if not isinstance(row["models"][0], str) or not isinstance(
+            row["models"][1], str
+        ):
             count_dict["invalid"] += 1
             continue
 
         # Resolve model names
-        models_public = [remove_html(row["models"][0]), remove_html(row["models"][1])]
+        models_public = [
+            remove_html(row["models"][0]),
+            remove_html(row["models"][1]),
+        ]
         if "model_name" in row["states"][0]:
             models_hidden = [
                 row["states"][0]["model_name"],
@@ -356,12 +361,12 @@ def process_data_txt2img(
                 "anony_bothbad_vote": 0,
                 "anony_leftvote": 0,
                 "anony_rightvote": 0,
-                "sanitized_id": shortuuid.uuid()
+                "sanitized_id": shortuuid.uuid(),
             }
         all_ips[ip]["count"] += 1
         if flag_anony:
             all_ips[ip]["anony_count"] += 1
-            all_ips[ip]["anony_"+row["type"]] += 1
+            all_ips[ip]["anony_" + row["type"]] += 1
 
         if sanitize_ip:
             user_id = f"{all_ips[ip]['sanitized_id']}"
@@ -389,6 +394,7 @@ def process_data_txt2img(
     )
     return battles, count_dict, count_leak, all_ips
 
+
 def process_data(
     data,
     exclude_model_names,
@@ -433,12 +439,17 @@ def process_data(
         if row["models"][0] is None or row["models"][1] is None:
             count_dict["invalid"] += 1
             continue
-        if not isinstance(row["models"][0], str) or not isinstance(row["models"][1], str):
+        if not isinstance(row["models"][0], str) or not isinstance(
+            row["models"][1], str
+        ):
             count_dict["invalid"] += 1
             continue
 
         # Resolve model names
-        models_public = [remove_html(row["models"][0]), remove_html(row["models"][1])]
+        models_public = [
+            remove_html(row["models"][0]),
+            remove_html(row["models"][1]),
+        ]
         if "model_name" in row["states"][0]:
             models_hidden = [
                 row["states"][0]["model_name"],
@@ -484,7 +495,6 @@ def process_data(
                 print(state["messages"][state["offset"]][1])
                 raise ValueError
 
-
         # Drop conversations if the model names are leaked
         messages = ""
         for i in range(2):
@@ -576,12 +586,12 @@ def process_data(
                 "anony_bothbad_vote": 0,
                 "anony_leftvote": 0,
                 "anony_rightvote": 0,
-                "sanitized_id": shortuuid.uuid()
+                "sanitized_id": shortuuid.uuid(),
             }
         all_ips[ip]["count"] += 1
         if flag_anony:
             all_ips[ip]["anony_count"] += 1
-            all_ips[ip]["anony_"+row["type"]] += 1
+            all_ips[ip]["anony_" + row["type"]] += 1
 
         if sanitize_ip:
             user_id = f"{all_ips[ip]['sanitized_id']}"
@@ -607,13 +617,25 @@ def process_data(
         )
 
         user_tokens = sum(
-            [conv["num_tokens"] for conv in conversation_a if conv["role"] == "user"]
+            [
+                conv["num_tokens"]
+                for conv in conversation_a
+                if conv["role"] == "user"
+            ]
         )
         assistant_a_tokens = sum(
-            [conv["num_tokens"] for conv in conversation_a if conv["role"] == "assistant"]
+            [
+                conv["num_tokens"]
+                for conv in conversation_a
+                if conv["role"] == "assistant"
+            ]
         )
         assistant_b_tokens = sum(
-            [conv["num_tokens"] for conv in conversation_b if conv["role"] == "assistant"]
+            [
+                conv["num_tokens"]
+                for conv in conversation_b
+                if conv["role"] == "assistant"
+            ]
         )
         context_tokens_a = sum([conv["num_tokens"] for conv in conversation_a[:-1]])
         context_tokens_b = sum([conv["num_tokens"] for conv in conversation_b[:-1]])
@@ -702,30 +724,36 @@ def clean_battle_data(
             all_ips[ip]["count"] += sub_all_ips[ip]["count"]
             all_ips[ip]["anony_count"] += sub_all_ips[ip]["anony_count"]
             all_ips[ip]["anony_tievote"] += sub_all_ips[ip]["anony_tievote"]
-            all_ips[ip]["anony_bothbad_vote"] += sub_all_ips[ip]["anony_bothbad_vote"]
+            all_ips[ip]["anony_bothbad_vote"] += sub_all_ips[ip][
+                "anony_bothbad_vote"
+            ]
             all_ips[ip]["anony_leftvote"] += sub_all_ips[ip]["anony_leftvote"]
             all_ips[ip]["anony_rightvote"] += sub_all_ips[ip]["anony_rightvote"]
 
     battles.sort(key=lambda x: x["tstamp"])
     last_updated_tstamp = battles[-1]["tstamp"]
-
+
     battles = pd.DataFrame(battles)
-
+
     # drop rows with same question_id
     print(f"before drop dups #battles: {len(battles)}")
     battles = battles.drop_duplicates(subset=["question_id"], keep="first")
     battles = battles.reset_index(drop=True)
     print(f"#battles: {len(battles)}")
 
-    battles = battles[battles["anony"]].reset_index(drop=True) if anony_only else battles
+    battles = (
+        battles[battles["anony"]].reset_index(drop=True) if anony_only else battles
+    )
     if run_dedup and not (vision or txt2img):
         print("Running deduplication...")
         battles = utils.dedup_process(battles)
         num_dedup_battles = sum(battles["dedup_tag"].apply(lambda x: x["sampled"]))
         print(f"#dedup_battles: {num_dedup_battles}")
     else:
         print("Skip deduplication...")
-        dedup_tags = np.array([{"high_freq": False, "sampled": True} for _ in range(len(battles))])
+        dedup_tags = np.array(
+            [{"high_freq": False, "sampled": True} for _ in range(len(battles))]
+        )
         battles["dedup_tag"] = dedup_tags
 
     last_updated_datetime = datetime.datetime.fromtimestamp(
@@ -746,7 +774,9 @@ def clean_battle_data(
         for votetype in ["tievote", "bothbad_vote", "leftvote", "rightvote"]:
             vote_key = "anony_" + votetype
             userid_key = "sanitized_id" if sanitize_ip else "ip"
-            top_30_users = sorted(all_ips.values(), key=lambda x: x[vote_key], reverse=True)[:30]
+            top_30_users = sorted(
+                all_ips.values(), key=lambda x: x[vote_key], reverse=True
+            )[:30]
             top_30_ip_id = ["arena_user_" + ip[userid_key] for ip in top_30_users]
             battles_top_30_ips = battles[battles["judge"].isin(top_30_ip_id)]
             print(f"Top 30 IPs #battles: {len(battles_top_30_ips)}")
@@ -755,13 +785,15 @@ def clean_battle_data(
             for user in top_30_users:
                 user_ip = user["ip"]
                 user_id = "arena_user_" + user[userid_key]
-
+
                 ip_battles = battles_top_30_ips[battles_top_30_ips["judge"] == user_id]
                 win_count = len(ip_battles[ip_battles["winner"] == "model_a"])
                 tie_count = len(ip_battles[ip_battles["winner"] == "tie"])
                 loss_count = len(ip_battles[ip_battles["winner"] == "model_b"])
-                print(f"{user_id}: model_a {win_count}, tie {tie_count}, mobel_b {loss_count}, {user_ip}")
-
+                print(
+                    f"{user_id}: model_a {win_count}, tie {tie_count}, mobel_b {loss_count}, {user_ip}"
+                )
+
     return battles
 
 
@@ -784,8 +816,14 @@ def clean_battle_data(
     ban_ip_list = json.load(open(args.ban_ip_file)) if args.ban_ip_file else None
 
     battles = clean_battle_data(
-        log_files, args.exclude_model_names or [], ban_ip_list, args.sanitize_ip, anony_only=args.anony_only,
-        run_dedup=args.run_dedup, vision=args.vision, txt2img=args.txt2img
+        log_files,
+        args.exclude_model_names or [],
+        ban_ip_list,
+        args.sanitize_ip,
+        anony_only=args.anony_only,
+        run_dedup=args.run_dedup,
+        vision=args.vision,
+        txt2img=args.txt2img,
     )
     last_updated_tstamp = battles.iloc[-1]["tstamp"]
    cutoff_date = datetime.datetime.fromtimestamp(
@@ -801,7 +839,9 @@ def clean_battle_data(
     print(f"Write cleaned data to {output}")
 
     if not args.txt2img:
-        battles = battles.drop(columns=["conversation_a", "conversation_b", "question_id"])
+        battles = battles.drop(
+            columns=["conversation_a", "conversation_b", "question_id"]
+        )
     print("Samples:")
     print(battles[:5])
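Several hunks above touch the per-IP vote counter built in process_data and process_data_txt2img. As orientation, a hedged, self-contained sketch of that structure; the key names come from the hunks above, everything else is simplified:

import shortuuid

# Simplified sketch of the all_ips bookkeeping in clean_battle_data.py.
# record_vote() is a made-up wrapper; the real code runs inline in a loop.
all_ips = {}

def record_vote(ip: str, vote_type: str, flag_anony: bool) -> None:
    if ip not in all_ips:
        all_ips[ip] = {
            "count": 0,
            "anony_count": 0,
            "anony_tievote": 0,
            "anony_bothbad_vote": 0,
            "anony_leftvote": 0,
            "anony_rightvote": 0,
            "sanitized_id": shortuuid.uuid(),  # stable pseudonym for the IP
        }
    all_ips[ip]["count"] += 1
    if flag_anony:
        all_ips[ip]["anony_count"] += 1
        all_ips[ip]["anony_" + vote_type] += 1  # vote_type: tievote, leftvote, ...

record_vote("1.2.3.4", "leftvote", True)
print(all_ips["1.2.3.4"]["anony_leftvote"])  # 1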

fastchat/serve/monitor/clean_chat_data.py

+15 −9

@@ -41,7 +41,9 @@ def get_log_files(max_num_files=None, is_vision=False):
             prefix = ""
             if is_vision:
                 prefix = "vision-tmp-"
-            name = os.path.expanduser(f"~/fastchat_logs/server{i}/{prefix}{d}-conv.json")
+            name = os.path.expanduser(
+                f"~/fastchat_logs/server{i}/{prefix}{d}-conv.json"
+            )
             if os.path.exists(name):
                 filenames.append(name)
     max_num_files = max_num_files or len(filenames)
@@ -120,9 +122,7 @@ def clean_chat_data(log_files, action_type, remove_prompt=False, exclude_models=
             msg = x["content"]
             if isinstance(x["content"], list):
                 msg = x["content"][0]
-            x["num_tokens"] = len(
-                encoding.encode(msg, allowed_special="all")
-            )
+            x["num_tokens"] = len(encoding.encode(msg, allowed_special="all"))
             messages_concat += msg.lower()
 
         if remove_prompt:
@@ -185,21 +185,27 @@ def clean_chat_data(log_files, action_type, remove_prompt=False, exclude_models=
     parser.add_argument("--action-type", type=str, default="chat")
     parser.add_argument("--max-num-files", type=int)
     parser.add_argument("--vision", action="store_true")
-    parser.add_argument("--start-time", type=str) # example: 2024-08-01
-    parser.add_argument("--end-time", type=str) # example: 2024-08-01
+    parser.add_argument("--start-time", type=str)  # example: 2024-08-01
+    parser.add_argument("--end-time", type=str)  # example: 2024-08-01
     parser.add_argument("--remove-prompt", action="store_true")
     parser.add_argument("--exclude-models", type=str, nargs="+", default=[])
     args = parser.parse_args()
 
     log_files = get_log_files(args.max_num_files, args.vision)
     # print(log_files)
-    chats = clean_chat_data(log_files, args.action_type, args.remove_prompt, args.exclude_models)
+    chats = clean_chat_data(
+        log_files, args.action_type, args.remove_prompt, args.exclude_models
+    )
     print(len(chats))
     # convert to dataframe
     chats = pd.DataFrame(chats)
     if args.start_time is not None:
-        chats = chats[pd.to_datetime(chats["tstamp"], unit="s") >= pd.to_datetime(args.start_time)]
-        chats = chats[pd.to_datetime(chats["tstamp"], unit='s') < pd.to_datetime(args.end_time)]
+        chats = chats[
+            pd.to_datetime(chats["tstamp"], unit="s") >= pd.to_datetime(args.start_time)
+        ]
+        chats = chats[
+            pd.to_datetime(chats["tstamp"], unit="s") < pd.to_datetime(args.end_time)
+        ]
     print(len(chats))
 
     last_updated_tstamp = chats.iloc[-1]["tstamp"]
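The reflowed --start-time/--end-time block filters rows by converting epoch-second timestamps to pandas datetimes before comparing. A small runnable sketch of the same pattern with made-up data (only the tstamp column name and the date format mirror the script):

import pandas as pd

# Same filter shape as clean_chat_data.py: keep rows with
# start <= tstamp < end, where tstamp is in epoch seconds (UTC).
chats = pd.DataFrame({"tstamp": [1722470400, 1725148800, 1727740800]})
start, end = "2024-08-01", "2024-09-01"
chats = chats[pd.to_datetime(chats["tstamp"], unit="s") >= pd.to_datetime(start)]
chats = chats[pd.to_datetime(chats["tstamp"], unit="s") < pd.to_datetime(end)]
print(len(chats))  # 1: only the 2024-08-01 row falls in [start, end)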

fastchat/serve/monitor/dataset_release_scripts/lmsys_chat_1m/compute_stats.py

+2

@@ -98,11 +98,13 @@ def to_remove(x):
 # "lmsys/vicuna-7b-v1.5", use_fast=False
 # )
 
+
 def num_tokens_from_string(string: str) -> int:
     encoding = tiktoken.encoding_for_model("gpt-4")
     num_tokens = len(encoding.encode(string))
     return num_tokens
 
+
 prompts = []
 responses = []
 for conv in df["conversation"]:
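num_tokens_from_string, now padded with the two blank lines PEP 8 requires around top-level definitions, counts GPT-4 tokens via tiktoken. A usage sketch (assumes the tiktoken package is installed; encoding_for_model("gpt-4") resolves to the cl100k_base encoding):

import tiktoken

# Same body as in compute_stats.py above: encode with the GPT-4 tokenizer
# and count the resulting tokens.
def num_tokens_from_string(string: str) -> int:
    encoding = tiktoken.encoding_for_model("gpt-4")
    num_tokens = len(encoding.encode(string))
    return num_tokens

print(num_tokens_from_string("Hello, world!"))  # 4 tokens under cl100k_base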
