diff --git a/AIclassifier.py b/AIclassifier.py index be8ed71..7734d9b 100644 --- a/AIclassifier.py +++ b/AIclassifier.py @@ -123,8 +123,8 @@ def create_vectors_list(sents, conversion_dict): mixed_vector.append(sent_mixed_vector) #return vectors_list # unigrams - return bigrams_vector - #return mixed_vector + #return bigrams_vector + return mixed_vector # def create_word_embedding(partition): @@ -184,7 +184,8 @@ def create_vectors_list(sents, conversion_dict): bigrams_to_numbers = create_dict(bigrams_filtered_lexicon) # Mixed dictionary -with open('features.txt', 'r') as file: +with open('featureslr0.5nEst100.txt', 'r') as file: +#with open('features.txt', 'r') as file: features_list = file.read() features_list = features_list.split('\n') mixed_to_numbers = create_dict(features_list) @@ -198,8 +199,8 @@ def create_vectors_list(sents, conversion_dict): #dev_word_embedding_features = create_word_embedding(sents_dev) #test_word_embedding_features = numpy.asarray(create_word_embedding(sents_test)) #print("Length of the dictionary of word representations:",len(words_to_numbers)) -print("Length of the dictionary of word representations:",len(bigrams_to_numbers)) -#print("Length of the dictionary of word representations:",len(mixed_to_numbers)) +#print("Length of the dictionary of word representations:",len(bigrams_to_numbers)) +print("Length of the dictionary of word representations:",len(mixed_to_numbers)) # FLAG - CHECK IF DICTIONARY IS BUILT CORRECTLY # SHOULD PUNCTUATION BE UNKNOWN? BECAUSE RIGHT NOW IT IS -NOPE, FIXED @@ -211,13 +212,13 @@ def create_vectors_list(sents, conversion_dict): # dev_vectors_list = create_vectors_list(sents_dev, words_to_numbers) # test_vectors_list = create_vectors_list(sents_test, words_to_numbers) -train_vectors_list = create_vectors_list(sents_train, bigrams_to_numbers) -dev_vectors_list = create_vectors_list(sents_dev, bigrams_to_numbers) -test_vectors_list = create_vectors_list(sents_test, bigrams_to_numbers) +# train_vectors_list = create_vectors_list(sents_train, bigrams_to_numbers) +# dev_vectors_list = create_vectors_list(sents_dev, bigrams_to_numbers) +# test_vectors_list = create_vectors_list(sents_test, bigrams_to_numbers) -# train_vectors_list = create_vectors_list(sents_train, mixed_to_numbers) -# dev_vectors_list = create_vectors_list(sents_dev, mixed_to_numbers) -# test_vectors_list = create_vectors_list(sents_test, mixed_to_numbers) +train_vectors_list = create_vectors_list(sents_train, mixed_to_numbers) +dev_vectors_list = create_vectors_list(sents_dev, mixed_to_numbers) +test_vectors_list = create_vectors_list(sents_test, mixed_to_numbers) # COUNT STATISTICS - HOW MANY WORDS WERE CONSIDERED UNK, AND HOW MANY OF EACH WORD @@ -227,13 +228,13 @@ def create_vectors_list(sents, conversion_dict): # dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, words_to_numbers) # test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, words_to_numbers) -train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, bigrams_to_numbers) -dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, bigrams_to_numbers) -test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, bigrams_to_numbers) +# train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, bigrams_to_numbers) +# dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, bigrams_to_numbers) +# test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, bigrams_to_numbers) -# train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, mixed_to_numbers) -# dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, mixed_to_numbers) -# test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, mixed_to_numbers) +train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, mixed_to_numbers) +dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, mixed_to_numbers) +test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, mixed_to_numbers) # FLAG - CHECK IF SPARSE MATRIX REPRESENTATION WAS DONE CORRECTLY @@ -286,8 +287,8 @@ def create_vectors_list(sents, conversion_dict): features = {} #for i,(token,value) in enumerate(zip(words_to_numbers, importances)): -#for i,(token,value) in enumerate(zip(mixed_to_numbers, importances)): -for i,(token,value) in enumerate(zip(bigrams_to_numbers, importances)): # IMPORTANTO TO CHANGE TO ADEQUATE DICT +for i,(token,value) in enumerate(zip(mixed_to_numbers, importances)): +#for i,(token,value) in enumerate(zip(bigrams_to_numbers, importances)): # IMPORTANTO TO CHANGE TO ADEQUATE DICT if (value != 0): features[token] = value features = sorted([(value, key) for (key, value) in features.items()], reverse=True) @@ -326,7 +327,7 @@ def create_vectors_list(sents, conversion_dict): os.makedirs(os.path.dirname(path), exist_ok=True) with open(path, 'w') as file: #print("Performance measures - Unigram Dictionary - MLP Word Embeddings\n", file=file) - print("Performance measures - Bigram Dictionary - Adaboost\n", file=file) + print("Performance measures - Mixed Dictionary - Adaboost\n", file=file) #write_output_stats_file(path, "Dev", dev_labels_primary, predictions, labels) write_output_stats_file(path, "Test", test_labels_primary, predictions, labels) diff --git a/output/AI Classifier/1Label_confusion_matrix_NonNorm.png b/output/AI Classifier/1Label_confusion_matrix_NonNorm.png index 4d7a11e..2e94e14 100644 Binary files a/output/AI Classifier/1Label_confusion_matrix_NonNorm.png and b/output/AI Classifier/1Label_confusion_matrix_NonNorm.png differ diff --git a/output/AI Classifier/1Label_confusion_matrix_NormTrue.png b/output/AI Classifier/1Label_confusion_matrix_NormTrue.png index 75b2e46..75c4501 100644 Binary files a/output/AI Classifier/1Label_confusion_matrix_NormTrue.png and b/output/AI Classifier/1Label_confusion_matrix_NormTrue.png differ diff --git a/output/AI Classifier/1labelPredictionsStatsTest.txt b/output/AI Classifier/1labelPredictionsStatsTest.txt index 7f86527..c0b4e0c 100644 --- a/output/AI Classifier/1labelPredictionsStatsTest.txt +++ b/output/AI Classifier/1labelPredictionsStatsTest.txt @@ -1,14 +1,14 @@ -Performance measures - Bigram Dictionary - Adaboost +Performance measures - Mixed Dictionary - Adaboost Test set: -Precision macro: 0.275 -Precision Individually: [0.857 0. 0.517 0. 0. ] -Recall macro: 0.279 -Recall Individually: [0.429 0. 0.968 0. 0. ] -F1 Score micro: 0.537 -F1 Score macro: 0.249 -F1 Score weighted: 0.431 -F1 Score Individually: [0.571 0. 0.674 0. 0. ] +Precision macro: 0.425 +Precision Individually: [0.8 0. 0.574 0.25 0.5 ] +Recall macro: 0.315 +Recall Individually: [0.286 0. 1. 0.091 0.2 ] +F1 Score micro: 0.567 +F1 Score macro: 0.314 +F1 Score weighted: 0.49 +F1 Score Individually: [0.421 0. 0.729 0.133 0.286]