Skip to content

Commit

Permalink
Rerun the mixed dictionary with updated features, but it went terribly
Browse files Browse the repository at this point in the history
  • Loading branch information
gicraveiro committed Feb 5, 2022
1 parent 396f888 commit cab21a6
Show file tree
Hide file tree
Showing 6 changed files with 122 additions and 93 deletions.
47 changes: 23 additions & 24 deletions AIclassifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,8 @@ def create_vectors_list(sents, conversion_dict):
mixed_vector.append(sent_mixed_vector)

#return vectors_list
return bigrams_vector
#return mixed_vector
#return bigrams_vector
return mixed_vector

# def create_word_embedding(partition):

Expand Down Expand Up @@ -183,10 +183,10 @@ def create_vectors_list(sents, conversion_dict):
bigrams_to_numbers = create_dict(bigrams_filtered_lexicon)

# Mixed dictionary
# with open('features.txt', 'r') as file:
# features_list = file.read()
# features_list = features_list.split('\n')
# mixed_to_numbers = create_dict(features_list)
with open('features.txt', 'r') as file:
features_list = file.read()
features_list = features_list.split('\n')
mixed_to_numbers = create_dict(features_list)

# WORD EMBEDDINGS FOR NN APPROACH
#ft = fasttext.load_model('cc.en.300.bin')
Expand All @@ -196,8 +196,8 @@ def create_vectors_list(sents, conversion_dict):
#dev_word_embedding_features = create_word_embedding(sents_dev)
#test_word_embedding_features = numpy.asarray(create_word_embedding(sents_test))
#print("Length of the dictionary of word representations:",len(words_to_numbers))
print("Length of the dictionary of word representations:",len(bigrams_to_numbers))
#print("Length of the dictionary of word representations:",len(mixed_to_numbers))
# print("Length of the dictionary of word representations:",len(bigrams_to_numbers))
print("Length of the dictionary of word representations:",len(mixed_to_numbers))

# FLAG - CHECK IF DICTIONARY IS BUILT CORRECTLY
# SHOULD PUNCTUATION BE UNKNOWN? BECAUSE RIGHT NOW IT IS -NOPE, FIXED
Expand All @@ -209,13 +209,13 @@ def create_vectors_list(sents, conversion_dict):
# dev_vectors_list = create_vectors_list(sents_dev, words_to_numbers)
# test_vectors_list = create_vectors_list(sents_test, words_to_numbers)

train_vectors_list = create_vectors_list(sents_train, bigrams_to_numbers)
dev_vectors_list = create_vectors_list(sents_dev, bigrams_to_numbers)
test_vectors_list = create_vectors_list(sents_test, bigrams_to_numbers)
# train_vectors_list = create_vectors_list(sents_train, bigrams_to_numbers)
# dev_vectors_list = create_vectors_list(sents_dev, bigrams_to_numbers)
# test_vectors_list = create_vectors_list(sents_test, bigrams_to_numbers)

#train_vectors_list = create_vectors_list(sents_train, mixed_to_numbers)
#dev_vectors_list = create_vectors_list(sents_dev, mixed_to_numbers)
#test_vectors_list = create_vectors_list(sents_test, mixed_to_numbers)
train_vectors_list = create_vectors_list(sents_train, mixed_to_numbers)
dev_vectors_list = create_vectors_list(sents_dev, mixed_to_numbers)
test_vectors_list = create_vectors_list(sents_test, mixed_to_numbers)

# COUNT STATISTICS - HOW MANY WORDS WERE CONSIDERED UNK, AND HOW MANY OF EACH WORD

Expand All @@ -225,13 +225,13 @@ def create_vectors_list(sents, conversion_dict):
# dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, words_to_numbers)
# test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, words_to_numbers)

train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, bigrams_to_numbers)
dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, bigrams_to_numbers)
test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, bigrams_to_numbers)
# train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, bigrams_to_numbers)
# dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, bigrams_to_numbers)
# test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, bigrams_to_numbers)

#train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, mixed_to_numbers)
#dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, mixed_to_numbers)
#test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, mixed_to_numbers)
train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, mixed_to_numbers)
dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, mixed_to_numbers)
test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, mixed_to_numbers)

# FLAG - CHECK IF SPARSE MATRIX REPRESENTATION WAS DONE CORRECTLY

Expand Down Expand Up @@ -284,12 +284,11 @@ def create_vectors_list(sents, conversion_dict):

features = {}
# for i,(token,value) in enumerate(zip(words_to_numbers, importances)):
#for i,(token,value) in enumerate(zip(mixed_to_numbers, importances)):
for i,(token,value) in enumerate(zip(bigrams_to_numbers, importances)): # IMPORTANT TO CHANGE TO ADEQUATE DICT
for i,(token,value) in enumerate(zip(mixed_to_numbers, importances)):
#for i,(token,value) in enumerate(zip(bigrams_to_numbers, importances)): # IMPORTANT TO CHANGE TO ADEQUATE DICT
if (value != 0):
features[token] = value
features = sorted([(value, key) for (key, value) in features.items()], reverse=True)
print(features)
for feature in features:
print('Feature:',feature[1],'Score:',feature[0])

Expand Down Expand Up @@ -325,7 +324,7 @@ def create_vectors_list(sents, conversion_dict):
os.makedirs(os.path.dirname(path), exist_ok=True)
with open(path, 'w') as file:
#print("Performance measures - Unigram Dictionary - MLP Word Embeddings\n", file=file)
print("Performance measures - Unigram Dictionary - Adaboost\n", file=file)
print("Performance measures - Mixed Dictionary - Adaboost\n", file=file)
#write_output_stats_file(path, "Dev", dev_labels_primary, predictions, labels)
write_output_stats_file(path, "Test", test_labels_primary, predictions, labels)

Expand Down
74 changes: 74 additions & 0 deletions Oldfeatures.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
privacy
people
system
safe
possible
should
experience
safer
right
secure
community
believe
express
we
can
use
cookies
you
data
or
content
with
information
they
learn
more
provide
device
understand
days
collect
who
the
your
is
to be
you should
help keep
provide you
should be
and protect
when people
to enable
and privacy
this right
public interest
what you
have a
you with
for people
use cookies
you can
20/09/2021 14
more about
help us
the information
information about
we may
and others
products including
not about
these terms
to build
with facebook
who you
advertising and
that help
we collect
share and
not at
services that
share it
it with
are not
76 changes: 16 additions & 60 deletions features.txt
Original file line number Diff line number Diff line change
@@ -1,74 +1,30 @@
combat
you
right
your
is
be
privacy
give
people
system
their
safe
secure
community
however
possible
should
experience
safer
right
secure
community
believe
express
we
can
use
cookies
you
data
or
content
with
information
they
learn
more
provide
device
understand
days
collect
who
the
your
is
to be
you should
to enable
this right
should be
not at
help keep
when people
provide you
should be
and protect
when people
to enable
and privacy
this right
public interest
what you
have a
you with
for people
use cookies
you can
20/09/2021 14
more about
help us
the information
information about
we may
and others
products including
not about
these terms
to build
with facebook
who you
advertising and
that help
we collect
share and
not at
services that
share it
it with
are not
Binary file modified output/AI Classifier/1Label_confusion_matrix_NonNorm.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified output/AI Classifier/1Label_confusion_matrix_NormTrue.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
18 changes: 9 additions & 9 deletions output/AI Classifier/1labelPredictionsStatsTest.txt
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
Performance measures - Unigram Dictionary - Adaboost
Performance measures - Mixed Dictionary - Adaboost

Test set:

Precision macro: 0.257
Precision Individually: [0.8 0. 0.484 0. 0. ]
Recall macro: 0.251
Recall Individually: [0.286 0. 0.968 0. 0. ]
F1 Score micro: 0.507
F1 Score macro: 0.213
F1 Score weighted: 0.386
F1 Score Individually: [0.421 0. 0.645 0. 0. ]
Precision macro: 0.246
Precision Individually: [0.364 0. 0.615 0. 0.25 ]
Recall macro: 0.289
Recall Individually: [0.571 0. 0.774 0. 0.1 ]
F1 Score micro: 0.493
F1 Score macro: 0.255
F1 Score weighted: 0.431
F1 Score Individually: [0.444 0. 0.686 0. 0.143]


0 comments on commit cab21a6

Please sign in to comment.