diff --git a/AIclassifier.py b/AIclassifier.py
index 2b36f4e..8ac6170 100644
--- a/AIclassifier.py
+++ b/AIclassifier.py
@@ -122,10 +122,9 @@ def create_vectors_list(sents, conversion_dict):
         print(sent_mixed_vector)
         mixed_vector.append(sent_mixed_vector)
 
-    #return vectors_list # unigrams
+    return vectors_list # unigrams
     #return bigrams_vector
-    print(mixed_vector)
-    return mixed_vector
+    #return mixed_vector
 
 # def create_word_embedding(partition):
@@ -198,9 +197,9 @@ def create_vectors_list(sents, conversion_dict):
 #train_word_embedding_features = create_word_embedding(sents_train)
 #dev_word_embedding_features = create_word_embedding(sents_dev)
 #test_word_embedding_features = numpy.asarray(create_word_embedding(sents_test))
-#print("Length of the dictionary of word representations:",len(words_to_numbers))
+print("Length of the dictionary of word representations:",len(words_to_numbers))
 #print("Length of the dictionary of word representations:",len(bigrams_to_numbers))
-print("Length of the dictionary of word representations:",len(mixed_to_numbers))
+#print("Length of the dictionary of word representations:",len(mixed_to_numbers))
 
 # FLAG - CHECK IF DICTIONARY IS BUILT CORRECTLY
 # SHOULD PUNCTUATION BE UNKNOWN? BECAUSE RIGHT NOW IT IS -NOPE, FIXED
@@ -208,33 +207,33 @@ def create_vectors_list(sents, conversion_dict):
 # count frequency before and after removing unknown words - ??? - ASK GABRIEL!!
 # checked that it seems ok
 
-# train_vectors_list = create_vectors_list(sents_train, words_to_numbers)
-# dev_vectors_list = create_vectors_list(sents_dev, words_to_numbers)
-# test_vectors_list = create_vectors_list(sents_test, words_to_numbers)
+train_vectors_list = create_vectors_list(sents_train, words_to_numbers)
+dev_vectors_list = create_vectors_list(sents_dev, words_to_numbers)
+test_vectors_list = create_vectors_list(sents_test, words_to_numbers)
 # train_vectors_list = create_vectors_list(sents_train, bigrams_to_numbers)
 # dev_vectors_list = create_vectors_list(sents_dev, bigrams_to_numbers)
 # test_vectors_list = create_vectors_list(sents_test, bigrams_to_numbers)
-train_vectors_list = create_vectors_list(sents_train, mixed_to_numbers)
-dev_vectors_list = create_vectors_list(sents_dev, mixed_to_numbers)
-test_vectors_list = create_vectors_list(sents_test, mixed_to_numbers)
+# train_vectors_list = create_vectors_list(sents_train, mixed_to_numbers)
+# dev_vectors_list = create_vectors_list(sents_dev, mixed_to_numbers)
+# test_vectors_list = create_vectors_list(sents_test, mixed_to_numbers)
 
 # COUNT STATISTICS - HOW MANY WORDS WERE CONSIDERED UNK, AND HOW MANY OF EACH WORD
 
 # FLAG - CHECK IF SENTENCE REPRESENTATIONS WERE DONE CORRECTLY
 
-# train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, words_to_numbers)
-# dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, words_to_numbers)
-# test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, words_to_numbers)
+train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, words_to_numbers)
+dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, words_to_numbers)
+test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, words_to_numbers)
 # train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, bigrams_to_numbers)
 # dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, bigrams_to_numbers)
 # test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, bigrams_to_numbers)
-train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, mixed_to_numbers)
-dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, mixed_to_numbers)
-test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, mixed_to_numbers)
+# train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, mixed_to_numbers)
+# dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, mixed_to_numbers)
+# test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, mixed_to_numbers)
 
 # FLAG - CHECK IF SPARSE MATRIX REPRESENTATION WAS DONE CORRECTLY
@@ -286,8 +285,8 @@ def create_vectors_list(sents, conversion_dict):
 importances = model.feature_importances_
 features = {}
-#for i,(token,value) in enumerate(zip(words_to_numbers, importances)):
-for i,(token,value) in enumerate(zip(mixed_to_numbers, importances)):
+for i,(token,value) in enumerate(zip(words_to_numbers, importances)):
+#for i,(token,value) in enumerate(zip(mixed_to_numbers, importances)):
 #for i,(token,value) in enumerate(zip(bigrams_to_numbers, importances)): # IMPORTANTO TO CHANGE TO ADEQUATE DICT
     if (value != 0):
         features[token] = value
@@ -327,7 +326,7 @@ def create_vectors_list(sents, conversion_dict):
 os.makedirs(os.path.dirname(path), exist_ok=True)
 with open(path, 'w') as file:
     #print("Performance measures - Unigram Dictionary - MLP Word Embeddings\n", file=file)
-    print("Performance measures - Mixed Dictionary - Adaboost\n", file=file)
+    print("Performance measures - Unigram Dictionary - Adaboost\n", file=file)
 
 #write_output_stats_file(path, "Dev", dev_labels_primary, predictions, labels)
 write_output_stats_file(path, "Test", test_labels_primary, predictions, labels)
diff --git a/featureslr0.5nEst100.txt b/featureslr0.5nEst100.txt
new file mode 100644
index 0000000..20c927b
--- /dev/null
+++ b/featureslr0.5nEst100.txt
@@ -0,0 +1,20 @@
+privacy
+secure
+community
+should
+possible
+express
+believe
+right
+people
+your
+you
+we
+they
+the
+safe
+or
+is
+information
+data
+cookies
\ No newline at end of file
diff --git a/featureslr1nEst50.txt b/featureslr1nEst50.txt
new file mode 100644
index 0000000..b5946b4
--- /dev/null
+++ b/featureslr1nEst50.txt
@@ -0,0 +1,43 @@
+combat
+you
+right
+your
+is
+be
+privacy
+give
+people
+their
+safe
+secure
+community
+however
+possible
+should
+safer
+believe
+express
+experience
+can
+no
+connect
+they
+public
+example
+cookies
+content
+and
+to be
+you should
+to enable
+this right
+should be
+not at
+help keep
+when people
+provide you
+and protect
+you can
+have a
+you with
+with the
\ No newline at end of file
diff --git a/output/AI Classifier/1Label_confusion_matrix_NonNorm.png b/output/AI Classifier/1Label_confusion_matrix_NonNorm.png
index fda3655..f7937fd 100644
Binary files a/output/AI Classifier/1Label_confusion_matrix_NonNorm.png and b/output/AI Classifier/1Label_confusion_matrix_NonNorm.png differ
diff --git a/output/AI Classifier/1Label_confusion_matrix_NormTrue.png b/output/AI Classifier/1Label_confusion_matrix_NormTrue.png
index 48c9434..8829d14 100644
Binary files a/output/AI Classifier/1Label_confusion_matrix_NormTrue.png and b/output/AI Classifier/1Label_confusion_matrix_NormTrue.png differ
diff --git a/output/AI Classifier/1labelPredictionsStatsTest.txt b/output/AI Classifier/1labelPredictionsStatsTest.txt
index a4e14ba..2e5a4aa 100644
--- a/output/AI Classifier/1labelPredictionsStatsTest.txt
+++ b/output/AI Classifier/1labelPredictionsStatsTest.txt
@@ -1,14 +1,14 @@
-Performance measures - Mixed Dictionary - Adaboost
+Performance measures - Unigram Dictionary - Adaboost
 
 Test set:
-Precision macro: 0.333
-Precision Individually: [0.5 0. 0.58 0.25 0.333]
-Recall macro: 0.297
-Recall Individually: [0.357 0. 0.935 0.091 0.1 ]
-F1 Score micro: 0.537
-F1 Score macro: 0.284
-F1 Score weighted: 0.463
-F1 Score Individually: [0.417 0. 0.716 0.133 0.154]
+Precision macro: 0.247
+Precision Individually: [0.333 0. 0.566 0. 0.333]
+Recall macro: 0.256
+Recall Individually: [0.214 0. 0.968 0. 0.1 ]
+F1 Score micro: 0.507
+F1 Score macro: 0.226
+F1 Score weighted: 0.408
+F1 Score Individually: [0.261 0. 0.714 0. 0.154]
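For reference, this change set switches the pipeline back to the unigram (words_to_numbers) path: build the dictionary, vectorize each split with create_vectors_list, convert the vectors with format_sentVector_to_SparseMatrix, train AdaBoost, and dump the tokens with non-zero feature importance (the new featureslr1nEst50.txt and featureslr0.5nEst100.txt files, whose names presumably encode the learning rate and number of estimators). The snippet below is a minimal, self-contained sketch of that flow, assuming scikit-learn's AdaBoostClassifier; the helper functions and example sentences are illustrative stand-ins, not the repository's actual implementations.

# Sketch of the unigram + AdaBoost flow (assumes scikit-learn); helpers only
# mirror the roles of create_vectors_list / format_sentVector_to_SparseMatrix.
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.ensemble import AdaBoostClassifier

def build_unigram_dict(sents):
    # words_to_numbers analogue: each known token gets a column index
    vocab = {}
    for sent in sents:
        for tok in sent.split():
            vocab.setdefault(tok.lower(), len(vocab))
    return vocab

def sents_to_matrix(sents, vocab):
    # Bag-of-words counts; tokens outside the dictionary are ignored (UNK)
    mat = np.zeros((len(sents), len(vocab)), dtype=float)
    for i, sent in enumerate(sents):
        for tok in sent.split():
            j = vocab.get(tok.lower())
            if j is not None:
                mat[i, j] += 1
    return csr_matrix(mat)

# Toy data standing in for sents_train / labels; not from the repository
sents_train = ["We protect your data", "You can express yourself safely"]
labels_train = ["privacy", "expression"]

words_to_numbers = build_unigram_dict(sents_train)
X_train = sents_to_matrix(sents_train, words_to_numbers)

# learning_rate / n_estimators pairs appear to correspond to the two new
# feature dumps (featureslr1nEst50.txt, featureslr0.5nEst100.txt)
model = AdaBoostClassifier(n_estimators=50, learning_rate=1.0, random_state=42)
model.fit(X_train, labels_train)

# Keep only tokens with non-zero importance, as the diff's loop does
features = {tok: val for tok, val in zip(words_to_numbers, model.feature_importances_)
            if val != 0}
for tok in sorted(features, key=features.get, reverse=True):
    print(tok)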