From 86487e0ea6d1b0bb6cec0218875ce2b63efac41c Mon Sep 17 00:00:00 2001 From: gicraveiro Date: Tue, 8 Feb 2022 15:54:29 +0100 Subject: [PATCH] changed configurations to run mlp classifier --- AIclassifier.py | 111 ++++++++++++++++++++++++------------------------ 1 file changed, 55 insertions(+), 56 deletions(-) diff --git a/AIclassifier.py b/AIclassifier.py index 842e41f..3496d24 100644 --- a/AIclassifier.py +++ b/AIclassifier.py @@ -137,27 +137,27 @@ def create_vectors_list(sents, conversion_dict): #return mixed_vector # TO RUN WITH UNIGRAMS + BIGRAMS, UNCOMMENT THIS LINE AND COMMENT THE OTHER TWO RETURNS # TO USE MLP CLASSIFIER WITH WORD EMBEDDINGS APPROACH, UNCOMMENT THIS FUNCION -# def create_word_embedding(partition): -# word_embedding_features = [] -# for sent in partition: -# sent_doc = clean_corpus(sent) -# sent_doc = nlp(sent_doc) -# sent_doc = reconstruct_hyphenated_words(sent_doc) -# sent_doc = [token.text for token in sent_doc if not token.is_space if not token.is_punct] -# sentence_embedding = [] -# for token in sent_doc: -# token_word_embedding = ft.get_word_vector(token) -# sentence_embedding.append(token_word_embedding) -# we_mean = numpy.asarray(sentence_embedding).mean(axis=0) -# #if isinstance(we_mean, float): -# # we_mean = numpy.zeros(300, dtype=float) -# word_embedding_features.append(we_mean) -# #word_embedding_features = numpy.asarray(word_embedding_features) -# #word_embedding_features = numpy.append(word_embedding_features, we_mean) -# #tokens_list_of_lists.append(sent_doc) -# #word_embedding_features = numpy.asarray(word_embedding_features) -# word_embedding_features = word_embedding_features -# return word_embedding_features +def create_word_embedding(partition): + word_embedding_features = [] + for sent in partition: + sent_doc = clean_corpus(sent) + sent_doc = nlp(sent_doc) + sent_doc = reconstruct_hyphenated_words(sent_doc) + sent_doc = [token.text for token in sent_doc if not token.is_space if not token.is_punct] + sentence_embedding = [] + for token in sent_doc: + token_word_embedding = ft.get_word_vector(token) + sentence_embedding.append(token_word_embedding) + we_mean = numpy.asarray(sentence_embedding).mean(axis=0) + #if isinstance(we_mean, float): + # we_mean = numpy.zeros(300, dtype=float) + word_embedding_features.append(we_mean) + #word_embedding_features = numpy.asarray(word_embedding_features) + #word_embedding_features = numpy.append(word_embedding_features, we_mean) + #tokens_list_of_lists.append(sent_doc) + #word_embedding_features = numpy.asarray(word_embedding_features) + word_embedding_features = word_embedding_features + return word_embedding_features #### # MAIN @@ -217,11 +217,11 @@ def create_vectors_list(sents, conversion_dict): # WORD EMBEDDINGS # TO USE MLP CLASSIFIER WITH WORD EMBEDDINGS APPROACH, UNCOMMENT THIS LINE -#ft = fasttext.load_model('cc.en.300.bin') +ft = fasttext.load_model('cc.en.300.bin') -#train_word_embedding_features = create_word_embedding(sents_train) -#dev_word_embedding_features = create_word_embedding(sents_dev) -#test_word_embedding_features = numpy.asarray(create_word_embedding(sents_test)) +train_word_embedding_features = create_word_embedding(sents_train) +dev_word_embedding_features = create_word_embedding(sents_dev) +test_word_embedding_features = numpy.asarray(create_word_embedding(sents_test)) # SIMPLE NUMERICAL REPRESENTATIONS OF THE SENTENCES @@ -293,11 +293,11 @@ def create_vectors_list(sents, conversion_dict): # TO USE UNIGRAMS + BIGRAMS, PARAMETERS WERE BETTER WITH n_estimators=100, learning_rate=0.5 # adaclassifier = AdaBoostClassifier(n_estimators=50, learning_rate=1) # TO USE SVC CLASSIFIER WITH ONE VS REST SCHEME, UNCOMMENT THIS LINE AND COMMENT OTHER MODELS -svc_classifier = make_pipeline(StandardScaler(), OneVsRestClassifier(LinearSVC(dual=False,random_state=None, tol=1e-5, C=0.05))) +# svc_classifier = make_pipeline(StandardScaler(), OneVsRestClassifier(LinearSVC(dual=False,random_state=None, tol=1e-5, C=0.05))) # TO USE SVC CLASSIFIER WITH ONE VS ONE SCHEME, UNCOMMENT THIS LINE AND COMMENT OTHER MODELS # svc_classifier = make_pipeline(StandardScaler(), OneVsOneClassifier(LinearSVC(dual=False,random_state=None, tol=1e-5, C=0.05))) # TO USE MLP CLASSIFIER WITH WORD EMBEDDINGS, UNCOMMENT THIS LINE AND COMMENT OTHER MODELS -#mlp_classifier = MLPClassifier(random_state=1111111, early_stopping=True, batch_size=32, hidden_layer_sizes=(200,250,200), learning_rate='adaptive', learning_rate_init=0.001, max_iter=1000) +mlp_classifier = MLPClassifier(random_state=1111111, early_stopping=True, batch_size=32, hidden_layer_sizes=(200,250,200), learning_rate='adaptive', learning_rate_init=0.001, max_iter=1000) # TO TEST HYPERPARAMETERS FOR MLP CLASSIFIER, UNCOMMENT THIS LINE AND COMMENT OTHER MODELS #opt_mlp = GridSearchCV(mlp_classifier, parameter_space, n_jobs=-1, cv=10) @@ -307,12 +307,12 @@ def create_vectors_list(sents, conversion_dict): # model = adaclassifier.fit(train_matrix_array, train_labels_primary) #print(adaclassifier.best_params_) # prints best parameters if you enabled GridSearchCV # TO USE SVC CLASSIFIER, UNCOMMENT THIS LINE AND COMMENT OTHER MODELS -model = svc_classifier.fit(train_matrix_array, train_labels_primary) +# model = svc_classifier.fit(train_matrix_array, train_labels_primary) # TO USE MLP CLASSIFIER, UNCOMMENT THESE 3 LINES AND COMMENT OTHER MODELS -#new_train_features = numpy.asarray(train_word_embedding_features + dev_word_embedding_features) -#new_train_labels = numpy.asarray(train_labels_primary + dev_labels_primary) -#model = mlp_classifier.fit(new_train_features, new_train_labels) +new_train_features = numpy.asarray(train_word_embedding_features + dev_word_embedding_features) +new_train_labels = numpy.asarray(train_labels_primary + dev_labels_primary) +model = mlp_classifier.fit(new_train_features, new_train_labels) # TO TEST PARAMETERS FOR MLP CLASSIFIER UNCOMMENT THESE 2 LINES AND COMMENT OTHER MODELS #model = opt_mlp.fit(new_train_features, new_train_labels) @@ -334,12 +334,11 @@ def create_vectors_list(sents, conversion_dict): # Predicting # TO USE ADABOOST OR SVC CLASSIFIERS -predictions = model.predict(dev_matrix_array) # DEV +#predictions = model.predict(dev_matrix_array) # DEV #predictions = model.predict(test_matrix_array) # TEST # TO USE MLP CLASSIFIER WITH WORD EMBEDDINGS -#predictions = model.predict(dev_word_embedding_features) # DEV -#predictions = model.predict(test_word_embedding_features) # TEST +predictions = model.predict(test_word_embedding_features) # TEST # Output of results @@ -353,46 +352,46 @@ def create_vectors_list(sents, conversion_dict): # path = 'output/AI Classifier/Adaboost/Unigrams+Bigrams/' # path = 'output/AI Classifier/SVC/One vs Rest/C=1/' # path = 'output/AI Classifier/SVC/One vs One/C=1/' -path = 'output/AI Classifier/SVC/One vs Rest/C=0.05/' +# path = 'output/AI Classifier/SVC/One vs Rest/C=0.05/' # path = 'output/AI Classifier/SVC/One vs One/C=0.05/' # path = 'output/AI Classifier/SVC/C=0.05/' -# path = 'output/AI Classifier/MLP/' +path = 'output/AI Classifier/MLP/' -aux_path= path + 'confusion_matrix_Normalized_Dev.png' -#aux_path= path + 'confusion_matrix_Normalized_Test.png' +# aux_path= path + 'confusion_matrix_Normalized_Dev.png' +aux_path= path + 'confusion_matrix_Normalized_Test.png' display_labels=['Commit to privacy', 'Declare opinion about privacy','Not applicable','Related to privacy','Violate privacy'] # NORMALIZED CONFUSION MATRIX -create_confusion_matrix(dev_list, pred_list, "true", aux_path, labels, display_labels) # DEV -#create_confusion_matrix(test_list, pred_list, "true", aux_path, labels, display_labels) # TEST +#create_confusion_matrix(dev_list, pred_list, "true", aux_path, labels, display_labels) # DEV +create_confusion_matrix(test_list, pred_list, "true", aux_path, labels, display_labels) # TEST # NON NORMALIZED CONFUSION MATRIX -aux_path= path + 'confusion_matrix_NonNormalized_Dev.png' -#aux_path= path + 'confusion_matrix_NonNormalized_Test.png' -create_confusion_matrix(dev_list, pred_list, None, aux_path, labels, display_labels) # DEV -#create_confusion_matrix(test_list, pred_list, None, aux_path, labels, display_labels) # TEST +#aux_path= path + 'confusion_matrix_NonNormalized_Dev.png' +aux_path= path + 'confusion_matrix_NonNormalized_Test.png' +#create_confusion_matrix(dev_list, pred_list, None, aux_path, labels, display_labels) # DEV +create_confusion_matrix(test_list, pred_list, None, aux_path, labels, display_labels) # TEST # File for performance on predictions -aux_path = path + 'PredictionsStatsDev.txt' # DEV -#aux_path = path + 'PredictionsStatsTest.txt' # TEST +#aux_path = path + 'PredictionsStatsDev.txt' # DEV +aux_path = path + 'PredictionsStatsTest.txt' # TEST with open(aux_path, 'w') as file: - #print("Performance measures - MLP Word Embeddings\n", file=file) # CHANGE TITLE ACCORDING TO CONTEXT - print("Performance measures - SVC\n0ne vs Rest, C = 0.05\n", file=file) # CHANGE TITLE ACCORDING TO CONTEXT -write_output_stats_file(aux_path, "Dev", dev_labels_primary, predictions, labels) # DEV -#write_output_stats_file(aux_path, "Test", test_labels_primary, predictions, labels) # TEST + print("Performance measures - MLP Word Embeddings\n", file=file) # CHANGE TITLE ACCORDING TO CONTEXT + #print("Performance measures - SVC\n0ne vs Rest, C = 0.05\n", file=file) # CHANGE TITLE ACCORDING TO CONTEXT +#write_output_stats_file(aux_path, "Dev", dev_labels_primary, predictions, labels) # DEV +write_output_stats_file(aux_path, "Test", test_labels_primary, predictions, labels) # TEST pred_list = converts_to_text(pred_list) -dev_pred_dict = create_sent_label_dict(sents_dev, pred_list) # DEV -#test_pred_dict = create_sent_label_dict(sents_test, pred_list) # TEST +#dev_pred_dict = create_sent_label_dict(sents_dev, pred_list) # DEV +test_pred_dict = create_sent_label_dict(sents_test, pred_list) # TEST # Predictions json file -aux_path = path + 'Predictions_Dev.json' -#aux_path = path + 'Predictions_Test.json' +#aux_path = path + 'Predictions_Dev.json' +aux_path = path + 'Predictions_Test.json' -write_predictions_file(dev_pred_dict, aux_path) # DEV -#write_predictions_file(test_pred_dict, aux_path) # TEST +#write_predictions_file(dev_pred_dict, aux_path) # DEV +write_predictions_file(test_pred_dict, aux_path) # TEST # References that helped me understand how to implement all this # https://newbedev.com/valueerror-could-not-broadcast-input-array-from-shape-2242243-into-shape-224224