Skip to content

Commit

Permalink
rerun unigram adaboost to confirm results were aligned
Browse files Browse the repository at this point in the history
  • Loading branch information
gicraveiro committed Feb 5, 2022
1 parent cab21a6 commit dd6b378
Show file tree
Hide file tree
Showing 7 changed files with 28 additions and 70 deletions.
38 changes: 19 additions & 19 deletions AIclassifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,9 +121,9 @@ def create_vectors_list(sents, conversion_dict):
bigrams_vector.append(sent_bigrams_vector)
mixed_vector.append(sent_mixed_vector)

#return vectors_list
return vectors_list # unigrams
#return bigrams_vector
return mixed_vector
#return mixed_vector

# def create_word_embedding(partition):

Expand Down Expand Up @@ -195,43 +195,43 @@ def create_vectors_list(sents, conversion_dict):
#train_word_embedding_features = create_word_embedding(sents_train)
#dev_word_embedding_features = create_word_embedding(sents_dev)
#test_word_embedding_features = numpy.asarray(create_word_embedding(sents_test))
#print("Length of the dictionary of word representations:",len(words_to_numbers))
print("Length of the dictionary of word representations:",len(words_to_numbers))
# print("Length of the dictionary of word representations:",len(bigrams_to_numbers))
print("Length of the dictionary of word representations:",len(mixed_to_numbers))
#print("Length of the dictionary of word representations:",len(mixed_to_numbers))

# FLAG - CHECK IF DICTIONARY IS BUILT CORRECTLY
# SHOULD PUNCTUATION BE TREATED AS UNKNOWN? IT USED TO BE - NO LONGER, FIXED
# TO DO: count frequency again?
# count frequency before and after removing unknown words - ??? - ASK GABRIEL!!
# checked that it seems ok

# train_vectors_list = create_vectors_list(sents_train, words_to_numbers)
# dev_vectors_list = create_vectors_list(sents_dev, words_to_numbers)
# test_vectors_list = create_vectors_list(sents_test, words_to_numbers)
train_vectors_list = create_vectors_list(sents_train, words_to_numbers)
dev_vectors_list = create_vectors_list(sents_dev, words_to_numbers)
test_vectors_list = create_vectors_list(sents_test, words_to_numbers)

# train_vectors_list = create_vectors_list(sents_train, bigrams_to_numbers)
# dev_vectors_list = create_vectors_list(sents_dev, bigrams_to_numbers)
# test_vectors_list = create_vectors_list(sents_test, bigrams_to_numbers)

train_vectors_list = create_vectors_list(sents_train, mixed_to_numbers)
dev_vectors_list = create_vectors_list(sents_dev, mixed_to_numbers)
test_vectors_list = create_vectors_list(sents_test, mixed_to_numbers)
# train_vectors_list = create_vectors_list(sents_train, mixed_to_numbers)
# dev_vectors_list = create_vectors_list(sents_dev, mixed_to_numbers)
# test_vectors_list = create_vectors_list(sents_test, mixed_to_numbers)

# COUNT STATISTICS - HOW MANY WORDS WERE CONSIDERED UNK, AND HOW MANY OF EACH WORD

# FLAG - CHECK IF SENTENCE REPRESENTATIONS WERE DONE CORRECTLY

# train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, words_to_numbers)
# dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, words_to_numbers)
# test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, words_to_numbers)
train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, words_to_numbers)
dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, words_to_numbers)
test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, words_to_numbers)

# train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, bigrams_to_numbers)
# dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, bigrams_to_numbers)
# test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, bigrams_to_numbers)

train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, mixed_to_numbers)
dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, mixed_to_numbers)
test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, mixed_to_numbers)
# train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, mixed_to_numbers)
# dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, mixed_to_numbers)
# test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, mixed_to_numbers)

# FLAG - CHECK IF SPARSE MATRIX REPRESENTATION WAS DONE CORRECTLY

Expand Down Expand Up @@ -283,8 +283,8 @@ def create_vectors_list(sents, conversion_dict):
importances = model.feature_importances_

features = {}
# for i,(token,value) in enumerate(zip(words_to_numbers, importances)):
for i,(token,value) in enumerate(zip(mixed_to_numbers, importances)):
for i,(token,value) in enumerate(zip(words_to_numbers, importances)):
#for i,(token,value) in enumerate(zip(mixed_to_numbers, importances)):
#for i,(token,value) in enumerate(zip(bigrams_to_numbers, importances)): # IMPORTANT: CHANGE TO THE APPROPRIATE DICT
if (value != 0):
features[token] = value
Expand Down Expand Up @@ -324,7 +324,7 @@ def create_vectors_list(sents, conversion_dict):
os.makedirs(os.path.dirname(path), exist_ok=True)
with open(path, 'w') as file:
#print("Performance measures - Unigram Dictionary - MLP Word Embeddings\n", file=file)
print("Performance measures - Mixed Dictionary - Adaboost\n", file=file)
print("Performance measures - Unigram Dictionary - Adaboost\n", file=file)
#write_output_stats_file(path, "Dev", dev_labels_primary, predictions, labels)
write_output_stats_file(path, "Test", test_labels_primary, predictions, labels)

Expand Down
Binary file modified output/AI Classifier/1Label_confusion_matrix_NonNorm.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified output/AI Classifier/1Label_confusion_matrix_NormTrue.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
18 changes: 9 additions & 9 deletions output/AI Classifier/1labelPredictionsStatsTest.txt
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
Performance measures - Mixed Dictionary - Adaboost
Performance measures - Unigram Dictionary - Adaboost

Test set:

Precision macro: 0.246
Precision Individually: [0.364 0. 0.615 0. 0.25 ]
Recall macro: 0.289
Recall Individually: [0.571 0. 0.774 0. 0.1 ]
F1 Score micro: 0.493
F1 Score macro: 0.255
F1 Score weighted: 0.431
F1 Score Individually: [0.444 0. 0.686 0. 0.143]
Precision macro: 0.368
Precision Individually: [0.556 0. 0.551 0.4 0.333]
Recall macro: 0.302
Recall Individually: [0.357 0. 0.871 0.182 0.1 ]
F1 Score micro: 0.522
F1 Score macro: 0.303
F1 Score weighted: 0.467
F1 Score Individually: [0.435 0. 0.675 0.25 0.154]


14 changes: 0 additions & 14 deletions output/AI Classifier/1labelRidgePredictionsStatsDev.txt

This file was deleted.

14 changes: 0 additions & 14 deletions output/AI Classifier/1labelSGDPredictionsStatsDev.txt

This file was deleted.

14 changes: 0 additions & 14 deletions output/AI Classifier/1labelSGDPredictionsStatsTest.txt

This file was deleted.

0 comments on commit dd6b378

Please sign in to comment.