Commit

reran mixed dictionary but results are way worse than expected
gicraveiro committed Feb 5, 2022
1 parent db4dfa3 commit 7d4d294
Showing 4 changed files with 34 additions and 31 deletions.
47 changes: 25 additions & 22 deletions AIclassifier.py
@@ -57,7 +57,7 @@ def format_sentVector_to_SparseMatrix(vectors_list, dictionary):
counts = Counter(sent_vector)
for index, freq in counts.items():
if len(counts.items()) > 0:
- sparse_vector[index] = 1 #freq/len(sent_vector) # 1 DIFFERENT CONFIGURATION POSSIBILITIES # 1
+ sparse_vector[index] = 1 #freq/len(sent_vector) # 1 DIFFERENT CONFIGURATION POSSIBILITIES
if (i == 0): # TO DO: OPTIMIZE, NO NEED TO CHECK THIS EVERY TURN
matrix_array = [sparse_vector]
else:
@@ -105,9 +105,9 @@ def create_vectors_list(sents, conversion_dict):

for bigram in sent_bigram:
if bigram not in conversion_dict:
- sent_bigrams_list.append("unk")
+ #sent_bigrams_list.append("unk")
#unk_count += 1
- #pass
+ pass
else:
sent_bigrams_list.append(bigram)
mixed_tokens_list.append(bigram)
@@ -119,11 +119,13 @@ def create_vectors_list(sents, conversion_dict):
sent_mixed_vector = sent_mixed_vector.astype(int)
vectors_list.append(sent_vector)
bigrams_vector.append(sent_bigrams_vector)
+ print(sent_mixed_vector)
mixed_vector.append(sent_mixed_vector)

#return vectors_list # unigrams
- return bigrams_vector
- #return mixed_vector
+ #return bigrams_vector
+ print(mixed_vector)
+ return mixed_vector

# def create_word_embedding(partition):

@@ -187,6 +189,7 @@ def create_vectors_list(sents, conversion_dict):
features_list = file.read()
features_list = features_list.split('\n')
mixed_to_numbers = create_dict(features_list)
+ print(mixed_to_numbers)

# WORD EMBEDDINGS FOR NN APPROACH
#ft = fasttext.load_model('cc.en.300.bin')
@@ -196,8 +199,8 @@ def create_vectors_list(sents, conversion_dict):
#dev_word_embedding_features = create_word_embedding(sents_dev)
#test_word_embedding_features = numpy.asarray(create_word_embedding(sents_test))
#print("Length of the dictionary of word representations:",len(words_to_numbers))
print("Length of the dictionary of word representations:",len(bigrams_to_numbers))
#print("Length of the dictionary of word representations:",len(mixed_to_numbers))
#print("Length of the dictionary of word representations:",len(bigrams_to_numbers))
print("Length of the dictionary of word representations:",len(mixed_to_numbers))

# FLAG - CHECK IF DICTIONARY IS BUILT CORRECTLY
# SHOULD PUNCTUATION BE UNKNOWN? BECAUSE RIGHT NOW IT IS -NOPE, FIXED
@@ -209,13 +212,13 @@ def create_vectors_list(sents, conversion_dict):
# dev_vectors_list = create_vectors_list(sents_dev, words_to_numbers)
# test_vectors_list = create_vectors_list(sents_test, words_to_numbers)

- train_vectors_list = create_vectors_list(sents_train, bigrams_to_numbers)
- dev_vectors_list = create_vectors_list(sents_dev, bigrams_to_numbers)
- test_vectors_list = create_vectors_list(sents_test, bigrams_to_numbers)
+ # train_vectors_list = create_vectors_list(sents_train, bigrams_to_numbers)
+ # dev_vectors_list = create_vectors_list(sents_dev, bigrams_to_numbers)
+ # test_vectors_list = create_vectors_list(sents_test, bigrams_to_numbers)

- # train_vectors_list = create_vectors_list(sents_train, mixed_to_numbers)
- # dev_vectors_list = create_vectors_list(sents_dev, mixed_to_numbers)
- # test_vectors_list = create_vectors_list(sents_test, mixed_to_numbers)
+ train_vectors_list = create_vectors_list(sents_train, mixed_to_numbers)
+ dev_vectors_list = create_vectors_list(sents_dev, mixed_to_numbers)
+ test_vectors_list = create_vectors_list(sents_test, mixed_to_numbers)

# COUNT STATISTICS - HOW MANY WORDS WERE CONSIDERED UNK, AND HOW MANY OF EACH WORD

@@ -225,13 +228,13 @@ def create_vectors_list(sents, conversion_dict):
# dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, words_to_numbers)
# test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, words_to_numbers)

- train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, bigrams_to_numbers)
- dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, bigrams_to_numbers)
- test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, bigrams_to_numbers)
+ # train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, bigrams_to_numbers)
+ # dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, bigrams_to_numbers)
+ # test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, bigrams_to_numbers)

- # train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, mixed_to_numbers)
- # dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, mixed_to_numbers)
- # test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, mixed_to_numbers)
+ train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, mixed_to_numbers)
+ dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, mixed_to_numbers)
+ test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, mixed_to_numbers)

# FLAG - CHECK IF SPARSE MATRIX REPRESENTATION WAS DONE CORRECTLY

@@ -284,8 +287,8 @@ def create_vectors_list(sents, conversion_dict):

features = {}
#for i,(token,value) in enumerate(zip(words_to_numbers, importances)):
- #for i,(token,value) in enumerate(zip(mixed_to_numbers, importances)):
- for i,(token,value) in enumerate(zip(bigrams_to_numbers, importances)): # IMPORTANTO TO CHANGE TO ADEQUATE DICT
+ for i,(token,value) in enumerate(zip(mixed_to_numbers, importances)):
+ #for i,(token,value) in enumerate(zip(bigrams_to_numbers, importances)): # IMPORTANTO TO CHANGE TO ADEQUATE DICT
if (value != 0):
features[token] = value
features = sorted([(value, key) for (key, value) in features.items()], reverse=True)
@@ -324,7 +327,7 @@ def create_vectors_list(sents, conversion_dict):
os.makedirs(os.path.dirname(path), exist_ok=True)
with open(path, 'w') as file:
#print("Performance measures - Unigram Dictionary - MLP Word Embeddings\n", file=file)
print("Performance measures - Bigram Dictionary - Adaboost\n", file=file)
print("Performance measures - Mixed Dictionary - Adaboost\n", file=file)
#write_output_stats_file(path, "Dev", dev_labels_primary, predictions, labels)
write_output_stats_file(path, "Test", test_labels_primary, predictions, labels)

Binary file modified output/AI Classifier/1Label_confusion_matrix_NonNorm.png
Binary file modified output/AI Classifier/1Label_confusion_matrix_NormTrue.png
18 changes: 9 additions & 9 deletions output/AI Classifier/1labelPredictionsStatsTest.txt
@@ -1,14 +1,14 @@
- Performance measures - Bigram Dictionary - Adaboost
+ Performance measures - Mixed Dictionary - Adaboost

Test set:

- Precision macro: 0.257
- Precision Individually: [0.8 0. 0.484 0. 0. ]
- Recall macro: 0.251
- Recall Individually: [0.286 0. 0.968 0. 0. ]
- F1 Score micro: 0.507
- F1 Score macro: 0.213
- F1 Score weighted: 0.386
- F1 Score Individually: [0.421 0. 0.645 0. 0. ]
+ Precision macro: 0.246
+ Precision Individually: [0.364 0. 0.615 0. 0.25 ]
+ Recall macro: 0.289
+ Recall Individually: [0.571 0. 0.774 0. 0.1 ]
+ F1 Score micro: 0.493
+ F1 Score macro: 0.255
+ F1 Score weighted: 0.431
+ F1 Score Individually: [0.444 0. 0.686 0. 0.143]
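
The drop the commit message flags shows up above: micro F1 fell from 0.507 with the bigram dictionary to 0.493 with the mixed one, although macro F1 rose slightly (0.213 to 0.255). For reference, a hedged sketch of how per-class and averaged scores in this format can be produced with scikit-learn (an assumption for illustration, not necessarily what write_output_stats_file actually does):

from sklearn.metrics import precision_score, recall_score, f1_score

def report_scores(y_true, y_pred, labels):
    # per-class arrays ("Individually") and averaged scores, rounded to 3 decimals as in the file above
    print("Precision macro:", round(precision_score(y_true, y_pred, labels=labels, average='macro', zero_division=0), 3))
    print("Precision Individually:", precision_score(y_true, y_pred, labels=labels, average=None, zero_division=0).round(3))
    print("Recall macro:", round(recall_score(y_true, y_pred, labels=labels, average='macro', zero_division=0), 3))
    print("Recall Individually:", recall_score(y_true, y_pred, labels=labels, average=None, zero_division=0).round(3))
    print("F1 Score micro:", round(f1_score(y_true, y_pred, labels=labels, average='micro', zero_division=0), 3))
    print("F1 Score macro:", round(f1_score(y_true, y_pred, labels=labels, average='macro', zero_division=0), 3))
    print("F1 Score weighted:", round(f1_score(y_true, y_pred, labels=labels, average='weighted', zero_division=0), 3))
    print("F1 Score Individually:", f1_score(y_true, y_pred, labels=labels, average=None, zero_division=0).round(3))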

