From c2e52fde8b9dd8b922e1c1781a0a7f3501f96827 Mon Sep 17 00:00:00 2001 From: ljvmiranda921 Date: Fri, 11 Oct 2024 20:58:03 -0700 Subject: [PATCH 1/9] [WIP] Update --- analysis/avg_agreement_final.py | 118 +++++++++++++------------------- 1 file changed, 48 insertions(+), 70 deletions(-) diff --git a/analysis/avg_agreement_final.py b/analysis/avg_agreement_final.py index cb93b89..0ff6d4f 100644 --- a/analysis/avg_agreement_final.py +++ b/analysis/avg_agreement_final.py @@ -2,63 +2,39 @@ import matplotlib.pyplot as plt import numpy as np +FONT_SIZES = {"small": 12, "medium": 16, "large": 18} + +PLOT_PARAMS = { + "font.family": "serif", + "font.serif": ["Times New Roman", "STIX"], + "font.size": FONT_SIZES.get("medium"), + "axes.titlesize": FONT_SIZES.get("large"), + "axes.labelsize": FONT_SIZES.get("large"), + "xtick.labelsize": FONT_SIZES.get("large"), + "ytick.labelsize": FONT_SIZES.get("large"), + "legend.fontsize": FONT_SIZES.get("medium"), + "figure.titlesize": FONT_SIZES.get("medium"), + "text.usetex": False, +} + +plt.rcParams.update(PLOT_PARAMS) + + data = { - "meta-llama/Meta-Llama-3.1-8B-Instruct": [ - 0.3533086666014079, - 0.052422082615756406 - ], - "cohere/c4ai-aya-23-35b": [ - 0.43767196047824003, - 0.026040919354464294 - ], - "cohere/c4ai-aya-23-8b": [ - 0.013483014909052663, - 0.03363706833599835 - ], - "cohere/command-r-08-2024": [ - 0.374457668650282, - 0.02926089754079793 - ], - "cohere/command-r-plus-08-2024": [ - 0.3830841816733316, - 0.020185255968455686 - ], - "google/gemma-1.1-7b-it": [ - 0.5190375637539242, - 0.027757722654111305 - ], - "google/gemma-2-9b-it": [ - 0.5181663123111222, - 0.031090119385244894 - ], - "meta-llama/Meta-Llama-3-70B-Instruct": [ - 0.5685224105896568, - 0.04853344616275034 - ], - "meta-llama/Meta-Llama-3-8B-Instruct": [ - 0.37936948540837095, - 0.032172769265151994 - ], - "meta-llama/Meta-Llama-3.1-70B-Instruct": [ - 0.603536768244583, - 0.027191895488989915 - ], - "mistralai/Mistral-7B-Instruct-v0.2": [ - 
0.4071166722276529, - 0.04577594028555328 - ], - "mistralai/Mistral-7B-Instruct-v0.3": [ - 0.41195018984687265, - 0.056184679972755454 - ], - "openai/gpt-4-turbo-2024-04-09": [ - 0.6106943361444249, - 0.02932446842558468 - ], - "openai/gpt-4o-2024-05-13": [ - 0.5833874065757011, - 0.023695391445384514 - ] + "LlaMa 3.1 8B": [0.3533086666014079, 0.052422082615756406], + "Aya 23 35B": [0.43767196047824003, 0.026040919354464294], + # "Aya 23 8B": [0.013483014909052663, 0.03363706833599835], + "Command R": [0.374457668650282, 0.02926089754079793], + "Command R+": [0.3830841816733316, 0.020185255968455686], + "Gemma 1.1 7B": [0.5190375637539242, 0.027757722654111305], + "Gemma 2 9B": [0.5181663123111222, 0.031090119385244894], + "LlaMa 3 70B": [0.5685224105896568, 0.04853344616275034], + "LlaMa 3 8B": [0.37936948540837095, 0.032172769265151994], + "LlaMa 3.1 70B": [0.603536768244583, 0.027191895488989915], + "Mistral 7B v0.2": [0.4071166722276529, 0.04577594028555328], + "Mistral 7B v0.3": [0.41195018984687265, 0.056184679972755454], + "GPT-4 Turbo": [0.6106943361444249, 0.02932446842558468], + "GPT-4o": [0.5833874065757011, 0.023695391445384514], } sorted_data = dict(sorted(data.items(), key=lambda item: item[1][0])) @@ -66,27 +42,29 @@ means_sorted = [v[0] for v in sorted_data.values()] std_devs_sorted = [v[1] for v in sorted_data.values()] -sns.set(style="whitegrid") -palette = sns.color_palette("coolwarm", len(labels_sorted)) +# sns.set(style="whitegrid") +# palette = sns.color_palette("coolwarm", len(labels_sorted)) -plt.figure(figsize=(10, 6)) +plt.figure(figsize=(10, 5)) x_pos_sorted = np.arange(len(labels_sorted)) -ax1 = sns.barplot(x=x_pos_sorted, y=means_sorted, palette=palette, errorbar=None) -plt.errorbar(x_pos_sorted, means_sorted, yerr=std_devs_sorted, fmt='none', c='black', capsize=5) +ax1 = sns.barplot(x=x_pos_sorted, y=means_sorted, errorbar=None, color="green") +plt.errorbar(x_pos_sorted, means_sorted, yerr=std_devs_sorted, fmt="none", c="black", 
capsize=5) -ax1.spines['top'].set_color('black') -ax1.spines['right'].set_color('black') -ax1.spines['left'].set_color('black') -ax1.spines['bottom'].set_color('black') -for spine in ax1.spines.values(): - spine.set_linewidth(2) # Make the border thicker +# ax1.spines["top"].set_color("black") +# ax1.spines["right"].set_color("black") +# ax1.spines["left"].set_color("black") +# ax1.spines["bottom"].set_color("black") +# for spine in ax1.spines.values(): +# spine.set_linewidth(2) # Make the border thicker +plt.grid(color="gray", axis="y", alpha=0.2) plt.ylim(0, 0.8) +plt.gca().set_axisbelow(True) -plt.xticks(x_pos_sorted, labels_sorted, rotation=90) +plt.xticks(x_pos_sorted, labels_sorted, rotation=45, ha="right") plt.ylabel("Cohen's Kappa") -plt.title('Average Inner-Model Agreement Across Languages') +plt.title("Average Inner-Model Agreement Across Languages") plt.tight_layout() -plt.savefig(f"./innermodel_agreement.pdf", bbox_inches='tight') \ No newline at end of file +plt.savefig("plots/innermodel_agreement_green_oracle.pdf", bbox_inches="tight") From 2fb0bed4d74b3d5a1fb07cd08130a4db3eff0fa7 Mon Sep 17 00:00:00 2001 From: ljvmiranda921 Date: Fri, 11 Oct 2024 23:20:26 -0700 Subject: [PATCH 2/9] [WIP] Update --- analysis/avg_agreement_final.py | 2 +- analysis/plot_results.py | 38 +++++++++++++++++++++++++++------ 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/analysis/avg_agreement_final.py b/analysis/avg_agreement_final.py index 0ff6d4f..0b5dd3e 100644 --- a/analysis/avg_agreement_final.py +++ b/analysis/avg_agreement_final.py @@ -45,7 +45,7 @@ # sns.set(style="whitegrid") # palette = sns.color_palette("coolwarm", len(labels_sorted)) -plt.figure(figsize=(10, 5)) +plt.figure(figsize=(7, 7)) x_pos_sorted = np.arange(len(labels_sorted)) ax1 = sns.barplot(x=x_pos_sorted, y=means_sorted, errorbar=None, color="green") diff --git a/analysis/plot_results.py b/analysis/plot_results.py index 2fbb64f..0e130fc 100644 --- a/analysis/plot_results.py +++ 
b/analysis/plot_results.py @@ -13,7 +13,7 @@ PLOT_PARAMS = { "font.family": "serif", - "font.serif": ["Times New Roman", "STIX"], + "font.serif": ["Times", "Times New Roman", "STIX"], "font.size": FONT_SIZES.get("medium"), "axes.titlesize": FONT_SIZES.get("large"), "axes.labelsize": FONT_SIZES.get("large"), @@ -122,6 +122,7 @@ def plot_main_heatmap( df = pd.read_csv(input_path) # Remove unnecessary column df.pop("eng_Latn") + df.pop("Family") df = df.sort_values(by="Avg_Multilingual", ascending=False).head(10).reset_index(drop=True) data = df[[col for col in df.columns if col not in ["Model_Type"]]].rename(columns={"Avg_Multilingual": "Avg"}) @@ -133,14 +134,37 @@ def plot_main_heatmap( data.pop("zho_Hant") data = data[sorted(data.columns)] data.columns = [col.split("_")[0] for col in data.columns] + data["Var"] = data[list(LANG_STANDARDIZATION.keys())].var(axis=1) data = data.rename(columns=LANG_STANDARDIZATION) - fig, ax = plt.subplots(1, 1, figsize=figsize) - sns.heatmap(data, ax=ax, cmap="YlGn", annot=True, annot_kws={"size": 16}, fmt=".2f", cbar=False) - ax.xaxis.set_ticks_position("top") - ax.tick_params(axis="x") - ax.set_ylabel("") - ax.set_yticklabels([f"{model} " for model in data.index]) + lang_results = data[list(LANG_STANDARDIZATION.values())] + avg = data[["Avg"]] + var = data[["Var"]] + + fig, axs = plt.subplots(ncols=3, figsize=figsize, gridspec_kw={"width_ratios": [0.5, 0.5, 9]}, sharey=True) + + sns.heatmap(avg, ax=axs[0], cmap="YlGn", annot=True, annot_kws={"size": 16}, fmt=".2f", cbar=False) + axs[0].xaxis.set_ticks_position("top") + axs[0].set_xticklabels(avg.columns, fontsize=20) + axs[0].tick_params(axis="x") + axs[0].set_ylabel("") + axs[0].set_yticklabels([f"{model} " for model in avg.index], fontsize=20) + + sns.heatmap(var, ax=axs[1], cmap="YlGn", annot=True, annot_kws={"size": 16}, fmt=".2f", cbar=False) + axs[1].xaxis.set_ticks_position("top") + axs[1].set_xticklabels(var.columns, fontsize=20) + axs[1].tick_params(axis="x") + 
axs[1].set_ylabel("") + axs[1].tick_params(axis="y", length=0) + axs[1].set_yticklabels([f"{model} " for model in var.index], fontsize=20) + + sns.heatmap(lang_results, ax=axs[2], cmap="YlGn", annot=True, annot_kws={"size": 16}, fmt=".2f", cbar=False) + axs[2].xaxis.set_ticks_position("top") + axs[2].set_xticklabels(lang_results.columns, fontsize=20) + axs[2].tick_params(axis="x") + axs[2].tick_params(axis="y", length=0) + axs[2].set_ylabel("") + axs[2].set_yticklabels([f"{model} " for model in lang_results.index], fontsize=20) plt.tight_layout() fig.savefig(output_path, bbox_inches="tight") From 9d6e82c0a8c2402f41ef534c8df5cbc6c8130550 Mon Sep 17 00:00:00 2001 From: ljvmiranda921 Date: Fri, 11 Oct 2024 23:46:32 -0700 Subject: [PATCH 3/9] [WIP] Update --- analysis/plot_results.py | 43 +++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/analysis/plot_results.py b/analysis/plot_results.py index 0e130fc..21bbd02 100644 --- a/analysis/plot_results.py +++ b/analysis/plot_results.py @@ -179,7 +179,7 @@ def plot_eng_drop_line( from scipy.stats import pearsonr, spearmanr df = pd.read_csv(input_path) - df = df[["Model", "Model_Type", "eng_Latn", "Avg_Multilingual"]] + df = df[["Model", "Model_Type", "Family", "eng_Latn", "Avg_Multilingual"]] df = df.sort_values(by="Avg_Multilingual", ascending=False).reset_index(drop=True) data = df.set_index("Model").dropna() data[data.select_dtypes(include="number").columns] = data.select_dtypes(include="number") * 100 @@ -191,6 +191,16 @@ def plot_eng_drop_line( fig, ax = plt.subplots(figsize=figsize) colors = ["red", "green", "blue"] + family = { + "Independent": "o", + "Qwen": "x", + "Skywork": "P", + "Cohere": "*", + "OpenAI": "s", + "AllenAI": "D", + "OpenBMB": "H", + "Meta": "^", + } for (label, group), color in zip(data.groupby("Model_Type"), colors): mrewardbench_scores = group["Avg_Multilingual"] rewardbench_scores = group["eng_Latn"] @@ -212,22 +222,23 @@ def 
plot_eng_drop_line( ax.set_aspect("equal") ax.legend(frameon=False, handletextpad=0.2, fontsize=12) - model_names = [MODEL_STANDARDIZATION[model] for model in data.index] - texts = [ - ax.text( - rewardbench_scores[idx], - mrewardbench_scores[idx], - model_names[idx], - fontsize=14, + if top_n: + model_names = [MODEL_STANDARDIZATION[model] for model in data.index] + texts = [ + ax.text( + rewardbench_scores[idx], + mrewardbench_scores[idx], + model_names[idx], + fontsize=14, + ) + for idx in range(len(data)) + ] + adjust_text( + texts, + ax=ax, + force_static=0.15, + arrowprops=dict(arrowstyle="->", color="gray"), ) - for idx in range(len(data)) - ] - adjust_text( - texts, - ax=ax, - # force_static=0.15, - arrowprops=dict(arrowstyle="->", color="gray"), - ) # ax.text( # 0.6, From ae5d9018479cc59ff15056e96f97143b173e7cb1 Mon Sep 17 00:00:00 2001 From: ljvmiranda921 Date: Sat, 12 Oct 2024 12:43:52 -0700 Subject: [PATCH 4/9] [wip] Update --- analysis/plot_results.py | 44 ++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/analysis/plot_results.py b/analysis/plot_results.py index 0e130fc..13cf511 100644 --- a/analysis/plot_results.py +++ b/analysis/plot_results.py @@ -142,15 +142,17 @@ def plot_main_heatmap( var = data[["Var"]] fig, axs = plt.subplots(ncols=3, figsize=figsize, gridspec_kw={"width_ratios": [0.5, 0.5, 9]}, sharey=True) + cmap = "Greys" + fmt = ".1f" - sns.heatmap(avg, ax=axs[0], cmap="YlGn", annot=True, annot_kws={"size": 16}, fmt=".2f", cbar=False) + sns.heatmap(avg, ax=axs[0], cmap=cmap, annot=True, annot_kws={"size": 16}, fmt=fmt, cbar=False) axs[0].xaxis.set_ticks_position("top") axs[0].set_xticklabels(avg.columns, fontsize=20) axs[0].tick_params(axis="x") axs[0].set_ylabel("") axs[0].set_yticklabels([f"{model} " for model in avg.index], fontsize=20) - sns.heatmap(var, ax=axs[1], cmap="YlGn", annot=True, annot_kws={"size": 16}, fmt=".2f", cbar=False) + sns.heatmap(var, ax=axs[1], cmap=cmap, 
annot=True, annot_kws={"size": 16}, fmt=fmt, cbar=False) axs[1].xaxis.set_ticks_position("top") axs[1].set_xticklabels(var.columns, fontsize=20) axs[1].tick_params(axis="x") @@ -158,7 +160,7 @@ def plot_main_heatmap( axs[1].tick_params(axis="y", length=0) axs[1].set_yticklabels([f"{model} " for model in var.index], fontsize=20) - sns.heatmap(lang_results, ax=axs[2], cmap="YlGn", annot=True, annot_kws={"size": 16}, fmt=".2f", cbar=False) + sns.heatmap(lang_results, ax=axs[2], cmap=cmap, annot=True, annot_kws={"size": 16}, fmt=fmt, cbar=False) axs[2].xaxis.set_ticks_position("top") axs[2].set_xticklabels(lang_results.columns, fontsize=20) axs[2].tick_params(axis="x") @@ -191,10 +193,11 @@ def plot_eng_drop_line( fig, ax = plt.subplots(figsize=figsize) colors = ["red", "green", "blue"] - for (label, group), color in zip(data.groupby("Model_Type"), colors): + markers = ["o", "*", "D"] + for (label, group), marker in zip(data.groupby("Model_Type"), markers): mrewardbench_scores = group["Avg_Multilingual"] rewardbench_scores = group["eng_Latn"] - ax.scatter(rewardbench_scores, mrewardbench_scores, marker="o", s=40, label=label, color=color) + ax.scatter(rewardbench_scores, mrewardbench_scores, marker=marker, s=60, label=label, color="k") mrewardbench_scores = data["Avg_Multilingual"] rewardbench_scores = data["eng_Latn"] @@ -212,22 +215,23 @@ def plot_eng_drop_line( ax.set_aspect("equal") ax.legend(frameon=False, handletextpad=0.2, fontsize=12) - model_names = [MODEL_STANDARDIZATION[model] for model in data.index] - texts = [ - ax.text( - rewardbench_scores[idx], - mrewardbench_scores[idx], - model_names[idx], - fontsize=14, + if top_n: + model_names = [MODEL_STANDARDIZATION[model] for model in data.index] + texts = [ + ax.text( + rewardbench_scores[idx], + mrewardbench_scores[idx], + model_names[idx], + fontsize=14, + ) + for idx in range(len(data)) + ] + adjust_text( + texts, + ax=ax, + # force_static=0.15, + arrowprops=dict(arrowstyle="->", color="gray"), ) - for idx 
in range(len(data)) - ] - adjust_text( - texts, - ax=ax, - # force_static=0.15, - arrowprops=dict(arrowstyle="->", color="gray"), - ) # ax.text( # 0.6, From 47026a18e1f9ff89e6a0e8e45fa6285a929f4535 Mon Sep 17 00:00:00 2001 From: ljvmiranda921 Date: Sat, 12 Oct 2024 12:59:29 -0700 Subject: [PATCH 5/9] [wip] Update --- analysis/plot_results.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/analysis/plot_results.py b/analysis/plot_results.py index 13cf511..5fd586a 100644 --- a/analysis/plot_results.py +++ b/analysis/plot_results.py @@ -298,7 +298,8 @@ def plot_ling_dims( y=dim, data=lingdf, ax=ax, - color="green", + color="gray", + edgecolor="k", width=0.4 if dim == "Resource Availability" else 0.7, ) ax.set_title(dim) From e422fce11b20daadf7c8490880ca2263060020ed Mon Sep 17 00:00:00 2001 From: ljvmiranda921 Date: Sat, 12 Oct 2024 13:02:23 -0700 Subject: [PATCH 6/9] [wip] Update --- analysis/avg_agreement_final.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analysis/avg_agreement_final.py b/analysis/avg_agreement_final.py index 0b5dd3e..57cb36f 100644 --- a/analysis/avg_agreement_final.py +++ b/analysis/avg_agreement_final.py @@ -48,7 +48,7 @@ plt.figure(figsize=(7, 7)) x_pos_sorted = np.arange(len(labels_sorted)) -ax1 = sns.barplot(x=x_pos_sorted, y=means_sorted, errorbar=None, color="green") +ax1 = sns.barplot(x=x_pos_sorted, y=means_sorted, errorbar=None, color="gray") plt.errorbar(x_pos_sorted, means_sorted, yerr=std_devs_sorted, fmt="none", c="black", capsize=5) # ax1.spines["top"].set_color("black") From a77432dead38281972d566f87ba1748673caffab Mon Sep 17 00:00:00 2001 From: ljvmiranda921 Date: Sat, 12 Oct 2024 13:02:44 -0700 Subject: [PATCH 7/9] [wip] Update --- analysis/avg_agreement_final.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analysis/avg_agreement_final.py b/analysis/avg_agreement_final.py index 57cb36f..fb33d66 100644 --- a/analysis/avg_agreement_final.py +++ 
b/analysis/avg_agreement_final.py @@ -48,7 +48,7 @@ plt.figure(figsize=(7, 7)) x_pos_sorted = np.arange(len(labels_sorted)) -ax1 = sns.barplot(x=x_pos_sorted, y=means_sorted, errorbar=None, color="gray") +ax1 = sns.barplot(x=x_pos_sorted, y=means_sorted, errorbar=None, color="gray", edgecolor="k") plt.errorbar(x_pos_sorted, means_sorted, yerr=std_devs_sorted, fmt="none", c="black", capsize=5) # ax1.spines["top"].set_color("black") From 12d649235962514314d229318dd53a24640aa773 Mon Sep 17 00:00:00 2001 From: ljvmiranda921 Date: Sat, 12 Oct 2024 18:01:48 -0700 Subject: [PATCH 8/9] [WIP] Update --- analysis/plot_results.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/analysis/plot_results.py b/analysis/plot_results.py index c155515..9b58f03 100644 --- a/analysis/plot_results.py +++ b/analysis/plot_results.py @@ -66,6 +66,8 @@ "zho": "zh", } +COLORS = {"green": "#355145", "purple": "#d8a6e5", "orange": "#fe7759"} + def get_args(): # fmt: off @@ -192,12 +194,19 @@ def plot_eng_drop_line( fig, ax = plt.subplots(figsize=figsize) - colors = ["red", "green", "blue"] + colors = [COLORS.get("green"), COLORS.get("purple"), COLORS.get("orange")] markers = ["o", "*", "D"] - for (label, group), marker in zip(data.groupby("Model_Type"), markers): + for (label, group), color in zip(data.groupby("Model_Type"), colors): mrewardbench_scores = group["Avg_Multilingual"] rewardbench_scores = group["eng_Latn"] - ax.scatter(rewardbench_scores, mrewardbench_scores, marker=marker, s=60, label=label, color="k") + ax.scatter( + rewardbench_scores, + mrewardbench_scores, + marker="o", + s=60, + label=label, + color=color, + ) mrewardbench_scores = data["Avg_Multilingual"] rewardbench_scores = data["eng_Latn"] From 0ab3aaa1a9920ae81dad73ef73f58fc86ad831f5 Mon Sep 17 00:00:00 2001 From: ljvmiranda921 Date: Sat, 12 Oct 2024 18:08:09 -0700 Subject: [PATCH 9/9] [WIP] Update --- analysis/avg_agreement_final.py | 9 ++++++++- analysis/plot_results.py | 4 ++-- 2 
files changed, 10 insertions(+), 3 deletions(-) diff --git a/analysis/avg_agreement_final.py b/analysis/avg_agreement_final.py index fb33d66..25ed807 100644 --- a/analysis/avg_agreement_final.py +++ b/analysis/avg_agreement_final.py @@ -3,6 +3,7 @@ import numpy as np FONT_SIZES = {"small": 12, "medium": 16, "large": 18} +COLORS = {"green": "#355145", "purple": "#d8a6e5", "orange": "#fe7759"} PLOT_PARAMS = { "font.family": "serif", @@ -48,7 +49,13 @@ plt.figure(figsize=(7, 7)) x_pos_sorted = np.arange(len(labels_sorted)) -ax1 = sns.barplot(x=x_pos_sorted, y=means_sorted, errorbar=None, color="gray", edgecolor="k") +ax1 = sns.barplot( + x=x_pos_sorted, + y=means_sorted, + errorbar=None, + color=COLORS.get("orange"), + edgecolor=COLORS.get("green"), +) plt.errorbar(x_pos_sorted, means_sorted, yerr=std_devs_sorted, fmt="none", c="black", capsize=5) # ax1.spines["top"].set_color("black") diff --git a/analysis/plot_results.py b/analysis/plot_results.py index 9b58f03..1ec28c4 100644 --- a/analysis/plot_results.py +++ b/analysis/plot_results.py @@ -307,8 +307,8 @@ def plot_ling_dims( y=dim, data=lingdf, ax=ax, - color="gray", - edgecolor="k", + color=COLORS.get("orange"), + edgecolor=COLORS.get("green"), width=0.4 if dim == "Resource Availability" else 0.7, ) ax.set_title(dim)