Skip to content

Commit

Permalink
updated input path in partition
Browse files Browse the repository at this point in the history
  • Loading branch information
gicraveiro committed Feb 8, 2022
1 parent 1835b16 commit 1fa79ea
Showing 1 changed file with 9 additions and 14 deletions.
23 changes: 9 additions & 14 deletions partition.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# Divides the annotated dataset into a train set, a dev set and a test set
import pandas as pd
import re
from sklearn.model_selection import train_test_split
Expand Down Expand Up @@ -95,7 +96,7 @@ def remove_empty_sentences(sents, labels):
# MAIN

# Reads the annotation table from a locally saved .csv file and creates the labels and sentences lists
annotation = pd.read_csv("data/Privacy/Facebook/Annotated/AnnotatedMultiLabelDataset.csv")
annotation = pd.read_csv("data/Facebook/Annotated/AnnotatedMultiLabelDataset.csv")
sents = annotation['Sentences'].values
labels1 = annotation['Primary Label'].values
labels2 = annotation['Secondary Label'].values
Expand All @@ -115,32 +116,31 @@ def remove_empty_sentences(sents, labels):
# Partitions remaining 20% into dev set (10%) and test set (10%)
sents_test, sents_dev, labels_test, labels_dev = train_test_split(sents_test,labels_test, test_size=0.5, stratify=labels_test, random_state=1111111)

nlp = spacy.load('en_core_web_lg',disable=['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
nlp = spacy.load('en_core_web_lg',disable=['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']) # this was to save memory

# Preprocessing
# Removes empty sentences
sents_train, labels_train = remove_empty_sentences(sents_train, labels_train)
sents_dev, labels_dev = remove_empty_sentences(sents_dev, labels_dev)
sents_test, labels_test = remove_empty_sentences(sents_test, labels_test)
# save a json, separate labels and sents, use a dictionary in python
# creates a dictionary with sentences and their labels
train_dict = create_sent_label_dict(sents_train, labels_train)
dev_dict = create_sent_label_dict(sents_dev, labels_dev)
test_dict = create_sent_label_dict(sents_test, labels_test)
total_dict = train_dict + dev_dict + test_dict

# create output files and write sentences with labels
# creates json files with reference sentences and their labels as output
write_partition_file(train_dict, 'train')
write_partition_file(dev_dict, 'dev')
write_partition_file(test_dict, 'test')

# FLAG - Check if files were written correctly - CHECKED

# Get reference labels list
total_labels_ref_list = [sent['label'] for sent in total_dict]
train_labels_ref_list = [sent['label'] for sent in train_dict]
dev_labels_ref_list = [sent['label'] for sent in dev_dict]
test_labels_ref_list = [sent['label'] for sent in test_dict]

# Multilabel distribution count + chart
# Multilabel distribution counts + graphs
path= 'output/partition/multilabelDistribution.txt'
os.makedirs(os.path.dirname(path), exist_ok=True)
with open(path, 'w') as file:
Expand All @@ -158,15 +158,13 @@ def remove_empty_sentences(sents, labels):
plot_distribution(counter, "Test", "multilabel")
write_distribution(path, counter, "Test")

# FLAG - CHECK IF DISTRIBUTION IS BEING MEASURED CORRECTLY - Checked, total distribution is the same as the spreadsheet ones

# Single label distribution count + chart
# Filter labels keeping only primary ones
total_ref_primary_label = [label[0] for label in total_labels_ref_list]
train_ref_primary_label = [label[0] for label in train_labels_ref_list]
dev_ref_primary_label = [label[0] for label in dev_labels_ref_list]
test_ref_primary_label = [label[0] for label in test_labels_ref_list]

### OTHER APPROACHES FOR CHOOSING THE LABEL FOR EVALUATION -- check start of implementation at the end of the code
# Only primary label distribution counts + graphs
path= 'output/partition/1labelDistribution.txt'
os.makedirs(os.path.dirname(path), exist_ok=True)
with open(path, 'w') as file:
Expand All @@ -183,6 +181,3 @@ def remove_empty_sentences(sents, labels):
counter = Counter(test_ref_primary_label)
plot_distribution(counter,"Test", "1label")
write_distribution(path, counter, "Test")

# FLAG - CHECK IF DISTRIBUTION IS BEING DONE AND MEASURED CORRECTLY
# FLAG - in theory checked, but RECHECK rechecked

0 comments on commit 1fa79ea

Please sign in to comment.