diff --git a/AIclassifier.py b/AIclassifier.py index 1d00ad5..cce93cf 100644 --- a/AIclassifier.py +++ b/AIclassifier.py @@ -121,8 +121,8 @@ def create_vectors_list(sents, conversion_dict): bigrams_vector.append(sent_bigrams_vector) mixed_vector.append(sent_mixed_vector) - return vectors_list # unigrams - #return bigrams_vector + #return vectors_list # unigrams + return bigrams_vector #return mixed_vector # def create_word_embedding(partition): @@ -195,8 +195,8 @@ def create_vectors_list(sents, conversion_dict): #train_word_embedding_features = create_word_embedding(sents_train) #dev_word_embedding_features = create_word_embedding(sents_dev) #test_word_embedding_features = numpy.asarray(create_word_embedding(sents_test)) -print("Length of the dictionary of word representations:",len(words_to_numbers)) -# print("Length of the dictionary of word representations:",len(bigrams_to_numbers)) +#print("Length of the dictionary of word representations:",len(words_to_numbers)) +print("Length of the dictionary of word representations:",len(bigrams_to_numbers)) #print("Length of the dictionary of word representations:",len(mixed_to_numbers)) # FLAG - CHECK IF DICTIONARY IS BUILT CORRECTLY @@ -205,13 +205,13 @@ def create_vectors_list(sents, conversion_dict): # count frequency before and after removing unknown words - ??? - ASK GABRIEL!! # checked that it seems ok -train_vectors_list = create_vectors_list(sents_train, words_to_numbers) -dev_vectors_list = create_vectors_list(sents_dev, words_to_numbers) -test_vectors_list = create_vectors_list(sents_test, words_to_numbers) +# train_vectors_list = create_vectors_list(sents_train, words_to_numbers) +# dev_vectors_list = create_vectors_list(sents_dev, words_to_numbers) +# test_vectors_list = create_vectors_list(sents_test, words_to_numbers) -# train_vectors_list = create_vectors_list(sents_train, bigrams_to_numbers) -# dev_vectors_list = create_vectors_list(sents_dev, bigrams_to_numbers) -# test_vectors_list = create_vectors_list(sents_test, bigrams_to_numbers) +train_vectors_list = create_vectors_list(sents_train, bigrams_to_numbers) +dev_vectors_list = create_vectors_list(sents_dev, bigrams_to_numbers) +test_vectors_list = create_vectors_list(sents_test, bigrams_to_numbers) # train_vectors_list = create_vectors_list(sents_train, mixed_to_numbers) # dev_vectors_list = create_vectors_list(sents_dev, mixed_to_numbers) @@ -221,13 +221,13 @@ def create_vectors_list(sents, conversion_dict): # FLAG - CHECK IF SENTENCE REPRESENTATIONS WERE DONE CORRECTLY -train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, words_to_numbers) -dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, words_to_numbers) -test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, words_to_numbers) +# train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, words_to_numbers) +# dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, words_to_numbers) +# test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, words_to_numbers) -# train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, bigrams_to_numbers) -# dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, bigrams_to_numbers) -# test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, bigrams_to_numbers) +train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, bigrams_to_numbers) +dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, bigrams_to_numbers) +test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, bigrams_to_numbers) # train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, mixed_to_numbers) # dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, mixed_to_numbers) @@ -283,9 +283,9 @@ def create_vectors_list(sents, conversion_dict): importances = model.feature_importances_ features = {} -for i,(token,value) in enumerate(zip(words_to_numbers, importances)): +#for i,(token,value) in enumerate(zip(words_to_numbers, importances)): #for i,(token,value) in enumerate(zip(mixed_to_numbers, importances)): -#for i,(token,value) in enumerate(zip(bigrams_to_numbers, importances)): # IMPORTANTO TO CHANGE TO ADEQUATE DICT +for i,(token,value) in enumerate(zip(bigrams_to_numbers, importances)): # IMPORTANTO TO CHANGE TO ADEQUATE DICT if (value != 0): features[token] = value features = sorted([(value, key) for (key, value) in features.items()], reverse=True) diff --git a/output/AI Classifier/1Label_confusion_matrix_NonNorm.png b/output/AI Classifier/1Label_confusion_matrix_NonNorm.png index 6136d03..337cc7e 100644 Binary files a/output/AI Classifier/1Label_confusion_matrix_NonNorm.png and b/output/AI Classifier/1Label_confusion_matrix_NonNorm.png differ diff --git a/output/AI Classifier/1Label_confusion_matrix_NormTrue.png b/output/AI Classifier/1Label_confusion_matrix_NormTrue.png index a0305bb..dfe3cbb 100644 Binary files a/output/AI Classifier/1Label_confusion_matrix_NormTrue.png and b/output/AI Classifier/1Label_confusion_matrix_NormTrue.png differ diff --git a/output/AI Classifier/1labelPredictionsStatsTest.txt b/output/AI Classifier/1labelPredictionsStatsTest.txt index 605787c..4c8e323 100644 --- a/output/AI Classifier/1labelPredictionsStatsTest.txt +++ b/output/AI Classifier/1labelPredictionsStatsTest.txt @@ -2,13 +2,13 @@ Performance measures - Unigram Dictionary - Adaboost Test set: -Precision macro: 0.254 -Precision Individually: [0.357 0. 0.59 0.125 0.2 ] -Recall macro: 0.258 -Recall Individually: [0.357 0. 0.742 0.091 0.1 ] -F1 Score micro: 0.448 -F1 Score macro: 0.251 -F1 Score weighted: 0.416 -F1 Score Individually: [0.357 0. 0.657 0.105 0.133] +Precision macro: 0.257 +Precision Individually: [0.8 0. 0.484 0. 0. ] +Recall macro: 0.251 +Recall Individually: [0.286 0. 0.968 0. 0. ] +F1 Score micro: 0.507 +F1 Score macro: 0.213 +F1 Score weighted: 0.386 +F1 Score Individually: [0.421 0. 0.645 0. 0. ]