From 9a9d54ca8a0e53e264172e3f084e8f996873d4ab Mon Sep 17 00:00:00 2001
From: Daniel Khashabi <danielk@allenai.org>
Date: Thu, 11 Nov 2021 13:22:53 -0800
Subject: [PATCH] analysis scripts.

---
 aggregate.py | 229 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 229 insertions(+)
 create mode 100644 aggregate.py

diff --git a/aggregate.py b/aggregate.py
new file mode 100644
index 0000000..899f79e
--- /dev/null
+++ b/aggregate.py
@@ -0,0 +1,229 @@
+import csv
+import statistics
+
+taskA_idx = 13
+taskB_idx = 14
+optimize_against_A_idx = 9
+target_prompt_idx = 12
+gamma_idx = 6
+
+continuous_prompt_f1_idx = 5
+continuous_prompt_f1_sem_idx = 11
+
+mapped_continuous_prompt_f1_idx = 4
+mapped_continuous_prompt_f1_sem_idx = 10
+
+
+all_rows = []
+with open("/Users/danielk/ideaProjects/Channel-LM-Prompting/continuous_prompt_experiments_nov11.csv", 'r') as csvfile:
+    csvreader = csv.reader(csvfile)
+    for row in csvreader:
+        all_rows.append(row)
+
+def filter(taskA, optimized_against_A, gamma=None, taskB=None, target_prompt_length=None):
+    filtered_rows = []
+    for row in all_rows:
+        if target_prompt_length != None and len(row[target_prompt_idx].split(" ")) != target_prompt_length:
+            continue
+
+        if taskB and taskB not in row[taskB_idx]:
+            continue
+
+        if taskA and row[taskA_idx] != taskA:
+            continue
+
+        if optimized_against_A != None and row[optimize_against_A_idx] != optimized_against_A:
+            continue
+
+        if gamma != None and float(row[gamma_idx]) != gamma:
+            continue
+
+        filtered_rows.append(row)
+
+    count = len(filtered_rows)
+
+    if count > 0:
+        avg_f1 = statistics.mean([float(row[continuous_prompt_f1_idx]) for row in filtered_rows])
+        avg_f1_sem = statistics.mean([float(row[continuous_prompt_f1_sem_idx]) for row in filtered_rows])
+
+        mapped_avg_f1 = statistics.mean([float(row[mapped_continuous_prompt_f1_idx]) for row in filtered_rows])
+        mapped_avg_f1_sem = statistics.mean([float(row[mapped_continuous_prompt_f1_sem_idx]) for row in filtered_rows])
+
+        return {
+            'count': count,
+            'avg_f1': avg_f1,
+            'avg_f1_sem': avg_f1_sem,
+            'mapped_avg_f1': mapped_avg_f1,
+            'mapped_avg_f1_sem': mapped_avg_f1_sem
+        }
+    else:
+        return None
+
+all_tasks = ['agnews', 'SST-2', 'trec', 'subj', 'sst-5']
+
+if True:
+    ## aggregated scores for tasks
+    f1_list = []
+    f1_sem_list = []
+    for taskA in all_tasks:
+        r1 = filter(taskA=taskA, optimized_against_A='false', gamma=0)
+        r2 = filter(taskA=taskA, optimized_against_A='false', gamma=0.01, taskB='task')
+        r3 = filter(taskA=taskA, optimized_against_A='false', gamma=0.01, taskB='prompt')
+        f1_list.append([r1['avg_f1'], r2['avg_f1'], r3['avg_f1']])
+        f1_sem_list.append([r1['avg_f1_sem'], r2['avg_f1_sem'], r3['avg_f1_sem']])
+
+        print(f"{taskA} \t "
+              f"{r1['avg_f1']} \t {r1['avg_f1_sem']} \t "
+              f"{r2['avg_f1']} \t {r2['avg_f1_sem']} \t "
+              f"{r3['avg_f1']} \t {r3['avg_f1_sem']} \t "
+              f"{r1['mapped_avg_f1']} \t {r1['mapped_avg_f1_sem']} \t "
+              f"{r2['mapped_avg_f1']} \t {r2['mapped_avg_f1_sem']} \t "
+              f"{r3['mapped_avg_f1']} \t {r3['mapped_avg_f1_sem']} \t ")
+        print(f"{taskA} (count) \t "
+              f"{r1['count']} \t {r1['count']} \t "
+              f"{r2['count']} \t {r2['count']} \t "
+              f"{r3['count']} \t {r3['count']} \t "
+              f"{r1['count']} \t {r1['count']} \t "
+              f"{r2['count']} \t {r2['count']} \t "
+              f"{r3['count']} \t {r3['count']} \t ")
+
+
+    import numpy as np
+    import matplotlib.pyplot as plt
+
+    data = np.array(f1_list)
+    data_err = np.array(f1_sem_list)
+    length = len(data)
+
+    # Set plot parameters
+    fig, ax = plt.subplots()
+    width = 0.2
+    x = np.arange(length)
+
+    ax.bar(x, data[:,0], width, color='blue', label='no projection', yerr=data_err[:, 0])
+    ax.bar(x + width, data[:,1], width, color='red', label='projection=other tasks', yerr=data_err[:, 1])
+    ax.bar(x + (2 * width), data[:,2], width, color='orange', label='projection=random string', yerr=data_err[:, 2])
+
+    ax.set_ylabel('F1')
+    ax.set_ylim(0,1.2)
+    ax.set_xticks(x + width + width/2)
+    ax.set_xticklabels(all_tasks)
+    ax.set_xlabel('Downstream tasks')
+    # ax.set_title('Title')
+    ax.legend()
+    plt.grid(True, 'major', 'y', ls='--', lw=.5, c='k', alpha=.3)
+
+    fig.tight_layout()
+    plt.show()
+
+
+if True:
+    ## aggregated scores for tasks
+    x_labels = []
+    f1_list = []
+    f1_sem_list = []
+    for length in range(2, 45):
+        for taskA in all_tasks:
+            r1 = filter(taskA, optimized_against_A='false', gamma=0, target_prompt_length=length)
+            r2 = filter(taskA, optimized_against_A='false', gamma=0.01, taskB='task', target_prompt_length=length)
+            r3 = filter(taskA, optimized_against_A='false', gamma=0.01, taskB='prompt', target_prompt_length=length)
+            if not r1 or not r2 or not r3:
+                continue
+
+            scores = [r1['avg_f1']]
+            sem = [r1['avg_f1_sem']]
+            if r2 != None:
+                scores.append(r2['avg_f1'])
+                sem.append(r2['avg_f1_sem'])
+            else:
+                scores.append(0.0)
+                sem.append(0.0)
+
+            if r3 != None:
+                scores.append(r3['avg_f1'])
+                sem.append(r3['avg_f1_sem'])
+            else:
+                scores.append(0.0)
+                sem.append(0.0)
+
+            f1_list.append(scores)
+            f1_sem_list.append(sem)
+            x_labels.append(f'{taskA}_{length}')
+
+            # print(f"{taskA} \t "
+            #       f"{r1['avg_f1']} \t {r1['avg_f1_sem']} \t "
+            #       f"{r2['avg_f1']} \t {r2['avg_f1_sem']} \t "
+            #       f"{r3['avg_f1']} \t {r3['avg_f1_sem']} \t "
+            #       f"{r1['mapped_avg_f1']} \t {r1['mapped_avg_f1_sem']} \t "
+            #       f"{r2['mapped_avg_f1']} \t {r2['mapped_avg_f1_sem']} \t "
+            #       f"{r3['mapped_avg_f1']} \t {r3['mapped_avg_f1_sem']} \t ")
+            # print(f"{taskA} (count) \t "
+            #       f"{r1['count']} \t {r1['count']} \t "
+            #       f"{r2['count']} \t {r2['count']} \t "
+            #       f"{r3['count']} \t {r3['count']} \t "
+            #       f"{r1['count']} \t {r1['count']} \t "
+            #       f"{r2['count']} \t {r2['count']} \t "
+            #       f"{r3['count']} \t {r3['count']} \t ")
+
+    import numpy as np
+    import matplotlib.pyplot as plt
+    from matplotlib.pyplot import figure
+
+    data = np.array(f1_list)
+    data_err = np.array(f1_sem_list)
+    print(f1_list)
+    print(f1_sem_list)
+    length = len(data)
+
+    # Set plot parameters
+    fig, ax = plt.subplots()
+    fig.set_figwidth(40)
+    # plt.figure(figsize=(40,10))
+    width = 0.2
+    x = np.arange(length)
+
+    ax.bar(x, data[:,0], width, color='blue', label='no projection', yerr=data_err[:, 0])
+    ax.bar(x + width, data[:,1], width, color='red', label='projection=other tasks', yerr=data_err[:, 1])
+    ax.bar(x + (2 * width), data[:,2], width, color='orange', label='projection=random string', yerr=data_err[:, 2])
+    ax.set_xticklabels(ax.get_xticks(), rotation = 45)
+
+
+    ax.set_ylabel('F1')
+    ax.set_ylim(0,1)
+    ax.set_xticks(x + width + width/2)
+    ax.set_xticklabels(x_labels)
+    ax.set_xlabel('Downstream tasks')
+    # ax.set_title('Title')
+    ax.legend()
+    plt.grid(True, 'major', 'y', ls='--', lw=.5, c='k', alpha=.3)
+
+    # fig.tight_layout()
+    plt.show()
+
+
+if False:
+    ## accuracy as a function of prompt length
+    import matplotlib.pyplot as plt
+    import pandas as pd
+
+    for taskA in all_tasks:
+        length_values = []
+        f1_values = []
+        for length in range(1, 50):
+            r1 = filter(taskA=taskA, optimized_against_A='false', target_prompt_length=length)
+            if r1:
+                length_values.append(length)
+                f1_values.append(r1['avg_f1'])
+
+        # Data
+        df=pd.DataFrame({'prompt length': length_values, 'f1': f1_values })
+
+        # multiple line plots
+        plt.plot( 'prompt length', 'f1', data=df, marker='o', markerfacecolor='blue', markersize=12, color='skyblue', linewidth=4)
+        plt.title(f'{taskA} F1 as a function of promot length (tokens)')
+
+        # show legend
+        plt.legend()
+
+        # show graph
+        plt.show()