Skip to content

Commit

Permalink
ran unigrams with lr 0.5 and nest 100
Browse files Browse the repository at this point in the history
  • Loading branch information
gicraveiro committed Feb 5, 2022
1 parent 935655a commit c2ee829
Show file tree
Hide file tree
Showing 6 changed files with 91 additions and 29 deletions.
39 changes: 19 additions & 20 deletions AIclassifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,10 +122,9 @@ def create_vectors_list(sents, conversion_dict):
print(sent_mixed_vector)
mixed_vector.append(sent_mixed_vector)

#return vectors_list # unigrams
return vectors_list # unigrams
#return bigrams_vector
print(mixed_vector)
return mixed_vector
#return mixed_vector

# def create_word_embedding(partition):

Expand Down Expand Up @@ -198,43 +197,43 @@ def create_vectors_list(sents, conversion_dict):
#train_word_embedding_features = create_word_embedding(sents_train)
#dev_word_embedding_features = create_word_embedding(sents_dev)
#test_word_embedding_features = numpy.asarray(create_word_embedding(sents_test))
#print("Length of the dictionary of word representations:",len(words_to_numbers))
print("Length of the dictionary of word representations:",len(words_to_numbers))
#print("Length of the dictionary of word representations:",len(bigrams_to_numbers))
print("Length of the dictionary of word representations:",len(mixed_to_numbers))
#print("Length of the dictionary of word representations:",len(mixed_to_numbers))

# FLAG - CHECK IF DICTIONARY IS BUILT CORRECTLY
# SHOULD PUNCTUATION BE UNKNOWN? BECAUSE RIGHT NOW IT IS - NOPE, FIXED
# TO DO: count frequency again?
# count frequency before and after removing unknown words - ??? - ASK GABRIEL!!
# checked that it seems ok

# train_vectors_list = create_vectors_list(sents_train, words_to_numbers)
# dev_vectors_list = create_vectors_list(sents_dev, words_to_numbers)
# test_vectors_list = create_vectors_list(sents_test, words_to_numbers)
train_vectors_list = create_vectors_list(sents_train, words_to_numbers)
dev_vectors_list = create_vectors_list(sents_dev, words_to_numbers)
test_vectors_list = create_vectors_list(sents_test, words_to_numbers)

# train_vectors_list = create_vectors_list(sents_train, bigrams_to_numbers)
# dev_vectors_list = create_vectors_list(sents_dev, bigrams_to_numbers)
# test_vectors_list = create_vectors_list(sents_test, bigrams_to_numbers)

train_vectors_list = create_vectors_list(sents_train, mixed_to_numbers)
dev_vectors_list = create_vectors_list(sents_dev, mixed_to_numbers)
test_vectors_list = create_vectors_list(sents_test, mixed_to_numbers)
# train_vectors_list = create_vectors_list(sents_train, mixed_to_numbers)
# dev_vectors_list = create_vectors_list(sents_dev, mixed_to_numbers)
# test_vectors_list = create_vectors_list(sents_test, mixed_to_numbers)

# COUNT STATISTICS - HOW MANY WORDS WERE CONSIDERED UNK, AND HOW MANY OF EACH WORD

# FLAG - CHECK IF SENTENCE REPRESENTATIONS WERE DONE CORRECTLY

# train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, words_to_numbers)
# dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, words_to_numbers)
# test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, words_to_numbers)
train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, words_to_numbers)
dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, words_to_numbers)
test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, words_to_numbers)

# train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, bigrams_to_numbers)
# dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, bigrams_to_numbers)
# test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, bigrams_to_numbers)

train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, mixed_to_numbers)
dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, mixed_to_numbers)
test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, mixed_to_numbers)
# train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, mixed_to_numbers)
# dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, mixed_to_numbers)
# test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, mixed_to_numbers)

# FLAG - CHECK IF SPARSE MATRIX REPRESENTATION WAS DONE CORRECTLY

Expand Down Expand Up @@ -286,8 +285,8 @@ def create_vectors_list(sents, conversion_dict):
importances = model.feature_importances_

features = {}
#for i,(token,value) in enumerate(zip(words_to_numbers, importances)):
for i,(token,value) in enumerate(zip(mixed_to_numbers, importances)):
for i,(token,value) in enumerate(zip(words_to_numbers, importances)):
#for i,(token,value) in enumerate(zip(mixed_to_numbers, importances)):
#for i,(token,value) in enumerate(zip(bigrams_to_numbers, importances)): # IMPORTANT: CHANGE TO THE APPROPRIATE DICT
if (value != 0):
features[token] = value
Expand Down Expand Up @@ -327,7 +326,7 @@ def create_vectors_list(sents, conversion_dict):
os.makedirs(os.path.dirname(path), exist_ok=True)
with open(path, 'w') as file:
#print("Performance measures - Unigram Dictionary - MLP Word Embeddings\n", file=file)
print("Performance measures - Mixed Dictionary - Adaboost\n", file=file)
print("Performance measures - Unigram Dictionary - Adaboost\n", file=file)
#write_output_stats_file(path, "Dev", dev_labels_primary, predictions, labels)
write_output_stats_file(path, "Test", test_labels_primary, predictions, labels)

Expand Down
20 changes: 20 additions & 0 deletions featureslr0.5nEst100.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
privacy
secure
community
should
possible
express
believe
right
people
your
you
we
they
the
safe
or
is
information
data
cookies
43 changes: 43 additions & 0 deletions featureslr1nEst50.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
combat
you
right
your
is
be
privacy
give
people
their
safe
secure
community
however
possible
should
safer
believe
express
experience
can
no
connect
they
public
example
cookies
content
and
to be
you should
to enable
this right
should be
not at
help keep
when people
provide you
and protect
you can
have a
you with
with the
Binary file modified output/AI Classifier/1Label_confusion_matrix_NonNorm.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified output/AI Classifier/1Label_confusion_matrix_NormTrue.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
18 changes: 9 additions & 9 deletions output/AI Classifier/1labelPredictionsStatsTest.txt
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
Performance measures - Mixed Dictionary - Adaboost
Performance measures - Unigram Dictionary - Adaboost

Test set:

Precision macro: 0.333
Precision Individually: [0.5 0. 0.58 0.25 0.333]
Recall macro: 0.297
Recall Individually: [0.357 0. 0.935 0.091 0.1 ]
F1 Score micro: 0.537
F1 Score macro: 0.284
F1 Score weighted: 0.463
F1 Score Individually: [0.417 0. 0.716 0.133 0.154]
Precision macro: 0.247
Precision Individually: [0.333 0. 0.566 0. 0.333]
Recall macro: 0.256
Recall Individually: [0.214 0. 0.968 0. 0.1 ]
F1 Score micro: 0.507
F1 Score macro: 0.226
F1 Score weighted: 0.408
F1 Score Individually: [0.261 0. 0.714 0. 0.154]


0 comments on commit c2ee829

Please sign in to comment.