From 1fa79ea000c45874c4c389d31b721e63c6702360 Mon Sep 17 00:00:00 2001 From: gicraveiro Date: Tue, 8 Feb 2022 13:33:19 +0100 Subject: [PATCH] updated input path in partition --- partition.py | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/partition.py b/partition.py index 80d40ca..75547af 100644 --- a/partition.py +++ b/partition.py @@ -1,3 +1,4 @@ +# Divides annotated dataset in train set, dev set and test set import pandas as pd import re from sklearn.model_selection import train_test_split @@ -95,7 +96,7 @@ def remove_empty_sentences(sents, labels): # MAIN # Reads annotation table from file .csv saved locally and creates labels and senences list -annotation = pd.read_csv("data/Privacy/Facebook/Annotated/AnnotatedMultiLabelDataset.csv") +annotation = pd.read_csv("data/Facebook/Annotated/AnnotatedMultiLabelDataset.csv") sents = annotation['Sentences'].values labels1 = annotation['Primary Label'].values labels2 = annotation['Secondary Label'].values @@ -115,32 +116,31 @@ def remove_empty_sentences(sents, labels): # Partitions remaining 20% into dev set (10%) and test set (10%) sents_test, sents_dev, labels_test, labels_dev = train_test_split(sents_test,labels_test, test_size=0.5, stratify=labels_test, random_state=1111111) -nlp = spacy.load('en_core_web_lg',disable=['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']) +nlp = spacy.load('en_core_web_lg',disable=['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']) # this was to save memory # Preprocessing # Removes empty sentences sents_train, labels_train = remove_empty_sentences(sents_train, labels_train) sents_dev, labels_dev = remove_empty_sentences(sents_dev, labels_dev) sents_test, labels_test = remove_empty_sentences(sents_test, labels_test) -# save a json, separate labels and sents, use a dictionary in python +# creates a dictionary with sentences and their labels train_dict = create_sent_label_dict(sents_train, labels_train) dev_dict = create_sent_label_dict(sents_dev, labels_dev) test_dict = create_sent_label_dict(sents_test, labels_test) total_dict = train_dict + dev_dict + test_dict -# create output files and write sentences with labels +# creates json files with reference sentences and their labels as output write_partition_file(train_dict, 'train') write_partition_file(dev_dict, 'dev') write_partition_file(test_dict, 'test') -# FLAG - Check if files were written correctly - CHECKED - +# Get reference labels list total_labels_ref_list = [sent['label'] for sent in total_dict] train_labels_ref_list = [sent['label'] for sent in train_dict] dev_labels_ref_list = [sent['label'] for sent in dev_dict] test_labels_ref_list = [sent['label'] for sent in test_dict] -# Multilabel distribution count + chart +# Multilabel distribution counts + graphs path= 'output/partition/multilabelDistribution.txt' os.makedirs(os.path.dirname(path), exist_ok=True) with open(path, 'w') as file: @@ -158,15 +158,13 @@ def remove_empty_sentences(sents, labels): plot_distribution(counter, "Test", "multilabel") write_distribution(path, counter, "Test") -# FLAG - CHECK IF DISTRIBUTION IS BEING MEASURED CORRECTLY - Checked, total distribution is the same as the spreadsheet ones - -# Single label distribution count + chart +# Filter labels keeping only primary ones total_ref_primary_label = [label[0] for label in total_labels_ref_list] train_ref_primary_label = [label[0] for label in train_labels_ref_list] dev_ref_primary_label = [label[0] for label in dev_labels_ref_list] test_ref_primary_label = [label[0] for label in test_labels_ref_list] -### OTHER APPROACHES FOR CHOOSING THE LABEL FOR EVALUATION -- check start of implementation at the end of the code +# Only primary label distribution counts + graphs path= 'output/partition/1labelDistribution.txt' os.makedirs(os.path.dirname(path), exist_ok=True) with open(path, 'w') as file: @@ -183,6 +181,3 @@ def remove_empty_sentences(sents, labels): counter = Counter(test_ref_primary_label) plot_distribution(counter,"Test", "1label") write_distribution(path, counter, "Test") - -# FLAG - CHECK IF DISTRIBUTION IS BEING DONE AND MEASURED CORRECTLY -# FLAG - in theory checked, but RECHECK rechecked \ No newline at end of file