Rerun using the mixed dictionary with the updated features, but the results were terrible

gicraveiro · Feb 5, 2022 · cab21a6 · cab21a6
1 parent 396f888
commit cab21a6
Show file tree

Hide file tree

Showing 6 changed files with 122 additions and 93 deletions.
diff --git a/AIclassifier.py b/AIclassifier.py
@@ -122,8 +122,8 @@ def create_vectors_list(sents, conversion_dict):
         mixed_vector.append(sent_mixed_vector)
 
     #return vectors_list
-    return bigrams_vector
-    #return mixed_vector
+    #return bigrams_vector
+    return mixed_vector
 
 # def create_word_embedding(partition):
 
@@ -183,10 +183,10 @@ def create_vectors_list(sents, conversion_dict):
 bigrams_to_numbers = create_dict(bigrams_filtered_lexicon)
 
 # Mixed dictionary
-# with open('features.txt', 'r') as file:
-#     features_list = file.read()
-# features_list = features_list.split('\n')
-# mixed_to_numbers = create_dict(features_list)
+with open('features.txt', 'r') as file:
+    features_list = file.read()
+features_list = features_list.split('\n')
+mixed_to_numbers = create_dict(features_list)
 
 # WORD EMBEDDINGS FOR NN APPROACH
 #ft = fasttext.load_model('cc.en.300.bin')
@@ -196,8 +196,8 @@ def create_vectors_list(sents, conversion_dict):
 #dev_word_embedding_features = create_word_embedding(sents_dev)
 #test_word_embedding_features = numpy.asarray(create_word_embedding(sents_test))
 #print("Length of the dictionary of word representations:",len(words_to_numbers))
-print("Length of the dictionary of word representations:",len(bigrams_to_numbers))
-#print("Length of the dictionary of word representations:",len(mixed_to_numbers))
+# print("Length of the dictionary of word representations:",len(bigrams_to_numbers))
+print("Length of the dictionary of word representations:",len(mixed_to_numbers))
 
 # FLAG - CHECK IF DICTIONARY IS BUILT CORRECTLY
 #               SHOULD PUNCTUATION BE UNKNOWN? BECAUSE RIGHT NOW IT IS -NOPE, FIXED
@@ -209,13 +209,13 @@ def create_vectors_list(sents, conversion_dict):
 # dev_vectors_list = create_vectors_list(sents_dev, words_to_numbers)
 # test_vectors_list = create_vectors_list(sents_test, words_to_numbers)
 
-train_vectors_list = create_vectors_list(sents_train, bigrams_to_numbers)
-dev_vectors_list = create_vectors_list(sents_dev, bigrams_to_numbers)
-test_vectors_list = create_vectors_list(sents_test, bigrams_to_numbers)
+# train_vectors_list = create_vectors_list(sents_train, bigrams_to_numbers)
+# dev_vectors_list = create_vectors_list(sents_dev, bigrams_to_numbers)
+# test_vectors_list = create_vectors_list(sents_test, bigrams_to_numbers)
 
-#train_vectors_list = create_vectors_list(sents_train, mixed_to_numbers)
-#dev_vectors_list = create_vectors_list(sents_dev, mixed_to_numbers)
-#test_vectors_list = create_vectors_list(sents_test, mixed_to_numbers)
+train_vectors_list = create_vectors_list(sents_train, mixed_to_numbers)
+dev_vectors_list = create_vectors_list(sents_dev, mixed_to_numbers)
+test_vectors_list = create_vectors_list(sents_test, mixed_to_numbers)
 
 # COUNT STATISTICS - HOW MANY WORDS WERE CONSIDERED UNK, AND HOW MANY OF EACH WORD
 
@@ -225,13 +225,13 @@ def create_vectors_list(sents, conversion_dict):
 # dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, words_to_numbers)
 # test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, words_to_numbers)
 
-train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, bigrams_to_numbers)
-dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, bigrams_to_numbers)
-test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, bigrams_to_numbers)
+# train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, bigrams_to_numbers)
+# dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, bigrams_to_numbers)
+# test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, bigrams_to_numbers)
 
-#train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, mixed_to_numbers)
-#dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, mixed_to_numbers)
-#test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, mixed_to_numbers)
+train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, mixed_to_numbers)
+dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, mixed_to_numbers)
+test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, mixed_to_numbers)
 
 # FLAG - CHECK IF SPARSE MATRIX REPRESENTATION WAS DONE CORRECTLY
 
@@ -284,12 +284,11 @@ def create_vectors_list(sents, conversion_dict):
 
 features = {}
 # for i,(token,value) in enumerate(zip(words_to_numbers, importances)):
-#for i,(token,value) in enumerate(zip(mixed_to_numbers, importances)):
-for i,(token,value) in enumerate(zip(bigrams_to_numbers, importances)): # IMPORTANTO TO CHANGE TO ADEQUATE DICT
+for i,(token,value) in enumerate(zip(mixed_to_numbers, importances)):
+#for i,(token,value) in enumerate(zip(bigrams_to_numbers, importances)): # IMPORTANT TO CHANGE TO ADEQUATE DICT
    if (value != 0):
        features[token] = value
 features = sorted([(value, key) for (key, value) in features.items()], reverse=True)
-print(features)
 for feature in features:
    print('Feature:',feature[1],'Score:',feature[0])
 
@@ -325,7 +324,7 @@ def create_vectors_list(sents, conversion_dict):
 os.makedirs(os.path.dirname(path), exist_ok=True)
 with open(path, 'w') as file:
     #print("Performance measures - Unigram Dictionary - MLP Word Embeddings\n", file=file)
-    print("Performance measures - Unigram Dictionary - Adaboost\n", file=file)
+    print("Performance measures - Mixed Dictionary - Adaboost\n", file=file)
 #write_output_stats_file(path, "Dev", dev_labels_primary, predictions, labels)
 write_output_stats_file(path, "Test", test_labels_primary, predictions, labels)
 

diff --git a/Oldfeatures.txt b/Oldfeatures.txt
@@ -0,0 +1,74 @@
+privacy
+people
+system
+safe
+possible
+should
+experience
+safer
+right
+secure
+community
+believe
+express
+we
+can
+use
+cookies
+you
+data
+or
+content
+with
+information
+they
+learn
+more
+provide
+device
+understand
+days
+collect
+who
+the
+your
+is
+to be
+you should
+help keep
+provide you
+should be
+and protect
+when people
+to enable
+and privacy
+this right
+public interest
+what you
+have a
+you with
+for people
+use cookies
+you can
+20/09/2021 14
+more about
+help us
+the information
+information about
+we may
+and others
+products including
+not about
+these terms
+to build
+with facebook
+who you
+advertising and
+that help
+we collect
+share and
+not at
+services that
+share it
+it with
+are not
diff --git a/features.txt b/features.txt
@@ -1,74 +1,30 @@
+combat
+you
+right
+your
+is
+be
 privacy
+give
 people
-system
+their
 safe
+secure
+community
+however
 possible
 should
-experience
 safer
-right
-secure
-community
 believe
 express
-we
-can
-use
-cookies
-you
-data
-or
-content
-with
-information
-they
-learn
-more
-provide
-device
-understand
-days
-collect
-who
-the
-your
-is
 to be
 you should
+to enable
+this right
+should be
+not at
 help keep
+when people
 provide you
-should be
 and protect
-when people
-to enable
-and privacy
-this right
-public interest
-what you
-have a
-you with
-for people
-use cookies
 you can
-20/09/2021 14
-more about
-help us
-the information
-information about
-we may
-and others
-products including
-not about
-these terms
-to build
-with facebook
-who you
-advertising and
-that help
-we collect
-share and
-not at
-services that
-share it
-it with
-are not
diff --git a/output/AI Classifier/1Label_confusion_matrix_NonNorm.png b/output/AI Classifier/1Label_confusion_matrix_NonNorm.png
diff --git a/output/AI Classifier/1Label_confusion_matrix_NormTrue.png b/output/AI Classifier/1Label_confusion_matrix_NormTrue.png
diff --git a/output/AI Classifier/1labelPredictionsStatsTest.txt b/output/AI Classifier/1labelPredictionsStatsTest.txt
@@ -1,14 +1,14 @@
-Performance measures - Unigram Dictionary - Adaboost
+Performance measures - Mixed Dictionary - Adaboost
 
 Test set:
 
-Precision macro: 0.257
-Precision Individually: [0.8   0.    0.484 0.    0.   ]
-Recall macro: 0.251
-Recall Individually: [0.286 0.    0.968 0.    0.   ]
-F1 Score micro: 0.507
-F1 Score macro: 0.213
-F1 Score weighted: 0.386
-F1 Score Individually: [0.421 0.    0.645 0.    0.   ]
+Precision macro: 0.246
+Precision Individually: [0.364 0.    0.615 0.    0.25 ]
+Recall macro: 0.289
+Recall Individually: [0.571 0.    0.774 0.    0.1  ]
+F1 Score micro: 0.493
+F1 Score macro: 0.255
+F1 Score weighted: 0.431
+F1 Score Individually: [0.444 0.    0.686 0.    0.143]