Ran unigrams with lr 0.5 and nEst 100 (AdaBoost learning rate 0.5, 100 estimators)

gicraveiro · Feb 5, 2022 · c2ee829 · c2ee829
1 parent 935655a
commit c2ee829
Show file tree

Hide file tree

Showing 6 changed files with 91 additions and 29 deletions.
diff --git a/AIclassifier.py b/AIclassifier.py
@@ -122,10 +122,9 @@ def create_vectors_list(sents, conversion_dict):
         print(sent_mixed_vector)
         mixed_vector.append(sent_mixed_vector)
 
-    #return vectors_list # unigrams
+    return vectors_list # unigrams
     #return bigrams_vector
-    print(mixed_vector)
-    return mixed_vector
+    #return mixed_vector
 
 # def create_word_embedding(partition):
 
@@ -198,43 +197,43 @@ def create_vectors_list(sents, conversion_dict):
 #train_word_embedding_features = create_word_embedding(sents_train)
 #dev_word_embedding_features = create_word_embedding(sents_dev)
 #test_word_embedding_features = numpy.asarray(create_word_embedding(sents_test))
-#print("Length of the dictionary of word representations:",len(words_to_numbers))
+print("Length of the dictionary of word representations:",len(words_to_numbers))
 #print("Length of the dictionary of word representations:",len(bigrams_to_numbers))
-print("Length of the dictionary of word representations:",len(mixed_to_numbers))
+#print("Length of the dictionary of word representations:",len(mixed_to_numbers))
 
 # FLAG - CHECK IF DICTIONARY IS BUILT CORRECTLY
 #               SHOULD PUNCTUATION BE UNKNOWN? BECAUSE RIGHT NOW IT IS -NOPE, FIXED
 # TO DO: count frequency again?
 # count frequency before and after removing unknown words - ??? - ASK GABRIEL!!
 # checked that it seems ok
 
-# train_vectors_list = create_vectors_list(sents_train, words_to_numbers)
-# dev_vectors_list = create_vectors_list(sents_dev, words_to_numbers)
-# test_vectors_list = create_vectors_list(sents_test, words_to_numbers)
+train_vectors_list = create_vectors_list(sents_train, words_to_numbers)
+dev_vectors_list = create_vectors_list(sents_dev, words_to_numbers)
+test_vectors_list = create_vectors_list(sents_test, words_to_numbers)
 
 # train_vectors_list = create_vectors_list(sents_train, bigrams_to_numbers)
 # dev_vectors_list = create_vectors_list(sents_dev, bigrams_to_numbers)
 # test_vectors_list = create_vectors_list(sents_test, bigrams_to_numbers)
 
-train_vectors_list = create_vectors_list(sents_train, mixed_to_numbers)
-dev_vectors_list = create_vectors_list(sents_dev, mixed_to_numbers)
-test_vectors_list = create_vectors_list(sents_test, mixed_to_numbers)
+# train_vectors_list = create_vectors_list(sents_train, mixed_to_numbers)
+# dev_vectors_list = create_vectors_list(sents_dev, mixed_to_numbers)
+# test_vectors_list = create_vectors_list(sents_test, mixed_to_numbers)
 
 # COUNT STATISTICS - HOW MANY WORDS WERE CONSIDERED UNK, AND HOW MANY OF EACH WORD
 
 # FLAG - CHECK IF SENTENCE REPRESENTATIONS WERE DONE CORRECTLY
 
-# train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, words_to_numbers)
-# dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, words_to_numbers)
-# test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, words_to_numbers)
+train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, words_to_numbers)
+dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, words_to_numbers)
+test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, words_to_numbers)
 
 # train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, bigrams_to_numbers)
 # dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, bigrams_to_numbers)
 # test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, bigrams_to_numbers)
 
-train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, mixed_to_numbers)
-dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, mixed_to_numbers)
-test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, mixed_to_numbers)
+# train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, mixed_to_numbers)
+# dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, mixed_to_numbers)
+# test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, mixed_to_numbers)
 
 # FLAG - CHECK IF SPARSE MATRIX REPRESENTATION WAS DONE CORRECTLY
 
@@ -286,8 +285,8 @@ def create_vectors_list(sents, conversion_dict):
 importances = model.feature_importances_
 
 features = {}
-#for i,(token,value) in enumerate(zip(words_to_numbers, importances)):
-for i,(token,value) in enumerate(zip(mixed_to_numbers, importances)):
+for i,(token,value) in enumerate(zip(words_to_numbers, importances)):
+#for i,(token,value) in enumerate(zip(mixed_to_numbers, importances)):
 #for i,(token,value) in enumerate(zip(bigrams_to_numbers, importances)): # IMPORTANT TO CHANGE TO ADEQUATE DICT
    if (value != 0):
        features[token] = value
@@ -327,7 +326,7 @@ def create_vectors_list(sents, conversion_dict):
 os.makedirs(os.path.dirname(path), exist_ok=True)
 with open(path, 'w') as file:
     #print("Performance measures - Unigram Dictionary - MLP Word Embeddings\n", file=file)
-    print("Performance measures - Mixed Dictionary - Adaboost\n", file=file)
+    print("Performance measures - Unigram Dictionary - Adaboost\n", file=file)
 #write_output_stats_file(path, "Dev", dev_labels_primary, predictions, labels)
 write_output_stats_file(path, "Test", test_labels_primary, predictions, labels)
 

diff --git a/featureslr0.5nEst100.txt b/featureslr0.5nEst100.txt
@@ -0,0 +1,20 @@
+privacy
+secure
+community
+should
+possible
+express
+believe
+right
+people
+your
+you
+we
+they
+the
+safe
+or
+is
+information
+data
+cookies
diff --git a/featureslr1nEst50.txt b/featureslr1nEst50.txt
@@ -0,0 +1,43 @@
+combat
+you
+right
+your
+is
+be
+privacy
+give
+people
+their
+safe
+secure
+community
+however
+possible
+should
+safer
+believe
+express
+experience
+can
+no
+connect
+they
+public
+example
+cookies
+content
+and
+to be
+you should
+to enable
+this right
+should be
+not at
+help keep
+when people
+provide you
+and protect
+you can
+have a
+you with
+with the
diff --git a/output/AI Classifier/1Label_confusion_matrix_NonNorm.png b/output/AI Classifier/1Label_confusion_matrix_NonNorm.png
diff --git a/output/AI Classifier/1Label_confusion_matrix_NormTrue.png b/output/AI Classifier/1Label_confusion_matrix_NormTrue.png
diff --git a/output/AI Classifier/1labelPredictionsStatsTest.txt b/output/AI Classifier/1labelPredictionsStatsTest.txt
@@ -1,14 +1,14 @@
-Performance measures - Mixed Dictionary - Adaboost
+Performance measures - Unigram Dictionary - Adaboost
 
 Test set:
 
-Precision macro: 0.333
-Precision Individually: [0.5   0.    0.58  0.25  0.333]
-Recall macro: 0.297
-Recall Individually: [0.357 0.    0.935 0.091 0.1  ]
-F1 Score micro: 0.537
-F1 Score macro: 0.284
-F1 Score weighted: 0.463
-F1 Score Individually: [0.417 0.    0.716 0.133 0.154]
+Precision macro: 0.247
+Precision Individually: [0.333 0.    0.566 0.    0.333]
+Recall macro: 0.256
+Recall Individually: [0.214 0.    0.968 0.    0.1  ]
+F1 Score micro: 0.507
+F1 Score macro: 0.226
+F1 Score weighted: 0.408
+F1 Score Individually: [0.261 0.    0.714 0.    0.154]