updated input path in partition

gicraveiro · Feb 8, 2022 · 1fa79ea · 1fa79ea
1 parent 1835b16
commit 1fa79ea
Showing 1 changed file with 9 additions and 14 deletions.
diff --git a/partition.py b/partition.py
@@ -1,3 +1,4 @@
+# Divides the annotated dataset into train, dev and test sets
 import pandas as pd
 import re
 from sklearn.model_selection import train_test_split
@@ -95,7 +96,7 @@ def remove_empty_sentences(sents, labels):
 # MAIN
 
 # Reads the annotation table from a locally saved .csv file and creates the sentences and labels lists
-annotation = pd.read_csv("data/Privacy/Facebook/Annotated/AnnotatedMultiLabelDataset.csv")
+annotation = pd.read_csv("data/Facebook/Annotated/AnnotatedMultiLabelDataset.csv")
 sents = annotation['Sentences'].values
 labels1 = annotation['Primary Label'].values
 labels2 = annotation['Secondary Label'].values
@@ -115,32 +116,31 @@ def remove_empty_sentences(sents, labels):
 # Partitions remaining 20% into dev set (10%) and test set (10%)
 sents_test, sents_dev, labels_test, labels_dev = train_test_split(sents_test,labels_test, test_size=0.5, stratify=labels_test, random_state=1111111)
 
-nlp = spacy.load('en_core_web_lg',disable=['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']) 
+nlp = spacy.load('en_core_web_lg',disable=['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']) # this was to save memory
 
 # Preprocessing
 # Removes empty sentences
 sents_train, labels_train = remove_empty_sentences(sents_train, labels_train)
 sents_dev, labels_dev = remove_empty_sentences(sents_dev, labels_dev)
 sents_test, labels_test = remove_empty_sentences(sents_test, labels_test)
-# save a json, separate labels and sents, use a dictionary in python
+# Builds, for each partition, a list of dictionaries pairing each sentence with its labels
 train_dict = create_sent_label_dict(sents_train, labels_train)
 dev_dict = create_sent_label_dict(sents_dev, labels_dev)
 test_dict = create_sent_label_dict(sents_test, labels_test)
 total_dict = train_dict + dev_dict + test_dict
 
-# create output files and write sentences with labels
+# Writes one JSON output file per partition with the reference sentences and their labels
 write_partition_file(train_dict, 'train')
 write_partition_file(dev_dict, 'dev')
 write_partition_file(test_dict, 'test')
 
-# FLAG - Check if files were written correctly - CHECKED
-
+# Collects the reference labels of the whole dataset and of each partition into lists
 total_labels_ref_list = [sent['label'] for sent in total_dict]
 train_labels_ref_list = [sent['label'] for sent in train_dict]
 dev_labels_ref_list = [sent['label'] for sent in dev_dict]
 test_labels_ref_list = [sent['label'] for sent in test_dict]
 
-# Multilabel distribution count + chart
+# Multilabel distribution counts + graphs
 path= 'output/partition/multilabelDistribution.txt'
 os.makedirs(os.path.dirname(path), exist_ok=True)
 with open(path, 'w') as file:
@@ -158,15 +158,13 @@ def remove_empty_sentences(sents, labels):
 plot_distribution(counter, "Test", "multilabel")
 write_distribution(path, counter, "Test")
 
-# FLAG - CHECK IF DISTRIBUTION IS BEING MEASURED CORRECTLY - Checked, total distribution is the same as the spreadsheet ones 
-
-# Single label distribution count + chart
+# Keeps only the primary (first) label of each sentence
 total_ref_primary_label = [label[0] for label in total_labels_ref_list]
 train_ref_primary_label = [label[0] for label in train_labels_ref_list]
 dev_ref_primary_label = [label[0] for label in dev_labels_ref_list]
 test_ref_primary_label = [label[0] for label in test_labels_ref_list]
 
-### OTHER APPROACHES FOR CHOOSING THE LABEL FOR EVALUATION -- check start of implementation at the end of the code
+# Primary-label-only distribution counts + graphs
 path= 'output/partition/1labelDistribution.txt'
 os.makedirs(os.path.dirname(path), exist_ok=True)
 with open(path, 'w') as file:
@@ -183,6 +181,3 @@ def remove_empty_sentences(sents, labels):
 counter = Counter(test_ref_primary_label)
 plot_distribution(counter,"Test", "1label")
 write_distribution(path, counter, "Test")
-
-# FLAG - CHECK IF DISTRIBUTION IS BEING DONE AND MEASURED CORRECTLY
-# FLAG - in theory checked, but RECHECK rechecked