diff --git a/AIclassifier.py b/AIclassifier.py index b4f0734..4d174b3 100644 --- a/AIclassifier.py +++ b/AIclassifier.py @@ -58,7 +58,8 @@ def format_sentVector_to_SparseMatrix(vectors_list, dictionary): # Create sentences representation in numeric format, according to dictionary def create_vectors_list(sents, conversion_dict): - unk_count = 0 + unk_unigrams_count = 0 + unk_bigrams_count = 0 unigrams_vector = [] bigrams_vector = [] mixed_vector = [] @@ -81,7 +82,7 @@ def create_vectors_list(sents, conversion_dict): if token.lower() not in conversion_dict: #sent_tokens_list.append("unk") # TO CONSIDER UNK TOKENS, UNCOMMENT THESE LINES #mixed_tokens_list.append("unk") - #unk_count += 1 + unk_unigrams_count += 1 pass else: sent_tokens_list.append(token.lower()) @@ -96,7 +97,7 @@ def create_vectors_list(sents, conversion_dict): for bigram in sent_bigram: if bigram not in conversion_dict: #sent_bigrams_list.append("unk") TO CONSIDER UNK TOKENS, UNCOMMENT THESE LINES - #unk_count += 1 + unk_bigrams_count += 1 pass else: sent_bigrams_list.append(bigram) @@ -111,9 +112,11 @@ def create_vectors_list(sents, conversion_dict): bigrams_vector.append(sent_bigrams_vector) mixed_vector.append(sent_mixed_vector) - return unigrams_vector # TO RUN WITH UNIGRAMS, UNCOMMENT THIS LINE AND COMMENT THE OTHER TWO RETURNS + print("Unigrams unknown count including repetitions:", unk_unigrams_count) + print("Bigrams unknown count including repetitions:", unk_bigrams_count, "\n") + #return unigrams_vector # TO RUN WITH UNIGRAMS, UNCOMMENT THIS LINE AND COMMENT THE OTHER TWO RETURNS #return bigrams_vector # TO RUN WITH BIGRAMS, UNCOMMENT THIS LINE AND COMMENT THE OTHER TWO RETURNS - #return mixed_vector # TO RUN WITH UNIGRAMS + BIGRAMS, UNCOMMENT THIS LINE AND COMMENT THE OTHER TWO RETURNS + return mixed_vector # TO RUN WITH UNIGRAMS + BIGRAMS, UNCOMMENT THIS LINE AND COMMENT THE OTHER TWO RETURNS # TO USE MLP CLASSIFIER WITH WORD EMBEDDINGS APPROACH, UNCOMMENT THIS FUNCION # def create_word_embedding(partition): @@ -149,43 +152,45 @@ def create_vectors_list(sents, conversion_dict): corpus = clean_corpus(corpus) train_doc = nlp(corpus) train_doc = reconstruct_hyphenated_words(train_doc) -tokens = [token.text for token in train_doc if not token.is_space if not token.is_punct] # if not token.text in stopwords.words()] +corpus_in_unigrams = [token.text for token in train_doc if not token.is_space if not token.is_punct] # if not token.text in stopwords.words()] # OBS: MAYBE ENHANCING PREPROCESSING BY REMOVING LITTLE SQUARES COULD BE AN OPTION corpus_in_bigrams = [] -for i in range(0,len(tokens)-1): - corpus_in_bigrams.append(tokens[i]+" "+tokens[i+1]) +for i in range(0,len(corpus_in_unigrams)-1): + corpus_in_bigrams.append(corpus_in_unigrams[i]+" "+corpus_in_unigrams[i+1]) -token_freq = Counter(tokens) +unigram_freq = Counter(corpus_in_unigrams) bigram_freq = Counter(corpus_in_bigrams) -print("Unigrams frequency before removing unknown words:", token_freq) -print("Bigrams frequency before removing unknown words:", bigram_freq) +# print("Unigrams frequency before removing unknown words:", unigram_freq) +# print("Bigrams frequency before removing unknown words:", bigram_freq) -# Removing words less frequent than 2 -corpus_without_unk = [token[0] for token in token_freq.items() if int(token[1]) > 2] -bigrams_filtered_lexicon = [bigram[0] for bigram in bigram_freq.items() if int(bigram[1]) > 1] +# Removing less frequent than 2 +unigrams_filtered_lexicon = [unigram[0] for unigram in unigram_freq.items() if int(unigram[1]) > 2] +bigrams_filtered_lexicon = [bigram[0] for bigram in bigram_freq.items() if int(bigram[1]) > 1] +# print("Unigrams frequency after removing unknown words:", [unigram for unigram in unigram_freq.items() if int(unigram[1]) > 2]) +# print("Bigrams frequency after removing unknown words:", [bigram for bigram in bigram_freq.items() if int(bigram[1]) > 1] ) -token_freq = Counter(corpus_without_unk) -bigram_freq = Counter(bigrams_filtered_lexicon) -print("Unigrams frequency after removing unknown words:", token_freq) -print("Bigrams frequency after removing unknown words:", bigram_freq) +# Counting unknown tokens +unknown_unigrams = [unigram[0] for unigram in unigram_freq.items() if int(unigram[1]) <= 2] +unknown_bigrams = [bigram[0] for bigram in bigram_freq.items() if int(bigram[1]) <= 1] +print("\n","Unknown unigrams count without repetitions:", len(unknown_unigrams)) +print("Unknown bigrams count without repetitions:", len(unknown_bigrams), "\n") # Unigram dictionary -unigrams_to_numbers = create_dict(corpus_without_unk) +unigrams_to_numbers = create_dict(unigrams_filtered_lexicon) # Bigram dictionary bigrams_to_numbers = create_dict(bigrams_filtered_lexicon) # Mixed dictionary -with open('featureslr0.5nEst100.txt', 'r') as file: -#with open('features.txt', 'r') as file: +with open('features.txt', 'r') as file: features_list = file.read() features_list = features_list.split('\n') mixed_to_numbers = create_dict(features_list) -print("Length of the dictionary of unigrams:",len(unigrams_to_numbers)) -print("Length of the dictionary of bigrams:",len(bigrams_to_numbers)) -print("Length of the dictionary of unigrams and bigrams:",len(mixed_to_numbers)) +print("Length of the dictionary of unigrams(lexicon):",len(unigrams_to_numbers)) +print("Length of the dictionary of bigrams(lexicon):",len(bigrams_to_numbers)) +print("Length of the dictionary of unigrams and bigrams(lexicon):",len(mixed_to_numbers), "\n") # CREATE SENTENCE REPRESENTATIONS # can either be by word embeddings or with a simple representation according to the presence of a unigram or bigram in the sentence @@ -202,9 +207,9 @@ def create_vectors_list(sents, conversion_dict): # SIMPLE NUMERICAL REPRESENTATIONS OF THE SENTENCES # TO RUN WITH UNIGRAMS, UNCOMMENT THIS 3 LINES AND COMMENT THE OTHER TWO TRIPLETS -train_vectors_list = create_vectors_list(sents_train, unigrams_to_numbers) -dev_vectors_list = create_vectors_list(sents_dev, unigrams_to_numbers) -test_vectors_list = create_vectors_list(sents_test, unigrams_to_numbers) +# train_vectors_list = create_vectors_list(sents_train, unigrams_to_numbers) +# dev_vectors_list = create_vectors_list(sents_dev, unigrams_to_numbers) +# test_vectors_list = create_vectors_list(sents_test, unigrams_to_numbers) # TO RUN WITH BIGRAMS, UNCOMMENT THIS 3 LINES AND COMMENT THE OTHER TWO TRIPLETS # train_vectors_list = create_vectors_list(sents_train, bigrams_to_numbers) @@ -212,16 +217,16 @@ def create_vectors_list(sents, conversion_dict): # test_vectors_list = create_vectors_list(sents_test, bigrams_to_numbers) # TO RUN WITH UNIGRAMS + BIGRAMS, UNCOMMENT THIS 3 LINES AND COMMENT THE OTHER TWO TRIPLETS -# train_vectors_list = create_vectors_list(sents_train, mixed_to_numbers) -# dev_vectors_list = create_vectors_list(sents_dev, mixed_to_numbers) -# test_vectors_list = create_vectors_list(sents_test, mixed_to_numbers) +train_vectors_list = create_vectors_list(sents_train, mixed_to_numbers) +dev_vectors_list = create_vectors_list(sents_dev, mixed_to_numbers) +test_vectors_list = create_vectors_list(sents_test, mixed_to_numbers) # FORMATTING SIMPLE SENTENCE REPRESENTATIONS - MUST BE IN SPARSE MATRIX FORMAT TO FEED THE CLASSIFIERS # TO RUN WITH UNIGRAMS, UNCOMMENT THIS 3 LINES AND COMMENT THE OTHER TWO TRIPLETS -train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, unigrams_to_numbers) -dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, unigrams_to_numbers) -test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, unigrams_to_numbers) +# train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, unigrams_to_numbers) +# dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, unigrams_to_numbers) +# test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, unigrams_to_numbers) # TO RUN WITH BIGRAMS, UNCOMMENT THIS 3 LINES AND COMMENT THE OTHER TWO TRIPLETS # train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, bigrams_to_numbers) @@ -229,9 +234,9 @@ def create_vectors_list(sents, conversion_dict): # test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, bigrams_to_numbers) # TO RUN WITH UNIGRAMS + BIGRAMS, UNCOMMENT THIS 3 LINES AND COMMENT THE OTHER TWO TRIPLETS -# train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, mixed_to_numbers) -# dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, mixed_to_numbers) -# test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, mixed_to_numbers) +train_matrix_array = format_sentVector_to_SparseMatrix(train_vectors_list, mixed_to_numbers) +dev_matrix_array = format_sentVector_to_SparseMatrix(dev_vectors_list, mixed_to_numbers) +test_matrix_array = format_sentVector_to_SparseMatrix(test_vectors_list, mixed_to_numbers) # CREATE LABELS REPRESENTATIONS @@ -264,8 +269,10 @@ def create_vectors_list(sents, conversion_dict): # Classifier models -# TO USE ADABOOST CLASSIFIER, UNCOMMENT THIS LINE AND COMMENT OTHER MODELS -adaclassifier = AdaBoostClassifier(n_estimators=100, learning_rate=0.5) +# TO USE ADABOOST CLASSIFIER, UNCOMMENT adaclassifier AND COMMENT OTHER MODELS +# TO USE UNIGRAMS, PARAMETERS WERE BETTER WITH n_estimators=50, learning_rate=1 +# TO USE UNIGRAMS + BIGRAMS, PARAMETERS WERE BETTER WITH n_estimators=100, learning_rate=0.5 +adaclassifier = AdaBoostClassifier(n_estimators=100, learning_rate=0.5) # TO USE SVC CLASSIFIER WITH ONE VS REST SCHEME, UNCOMMENT THIS LINE AND COMMENT OTHER MODELS #svc_classifier = make_pipeline(StandardScaler(), OneVsRestClassifier(LinearSVC(dual=False,random_state=None, tol=1e-5, C=1))) # TO USE SVC CLASSIFIER WITH ONE VS ONE SCHEME, UNCOMMENT THIS LINE AND COMMENT OTHER MODELS @@ -293,18 +300,17 @@ def create_vectors_list(sents, conversion_dict): #print(model.best_params_) # TO SEE WHICH FEATURES ADABOOST CHOSE, UNCOMMENT THIS SECTION -importances = model.feature_importances_ -features = {} - +# importances = model.feature_importances_ +# features = {} # UNCOMMENT THE LINE YOU NEED FROM THESE 3 AND COMMENT THE OTHER 2 -#for i,(token,value) in enumerate(zip(unigrams_to_numbers, importances)): -for i,(token,value) in enumerate(zip(mixed_to_numbers, importances)): -#for i,(token,value) in enumerate(zip(bigrams_to_numbers, importances)): - if (value != 0): - features[token] = value -features = sorted([(value, key) for (key, value) in features.items()], reverse=True) -for feature in features: - print('Feature:',feature[1],'Score:',feature[0]) +# for i,(token,value) in enumerate(zip(unigrams_to_numbers, importances)): +# # for i,(token,value) in enumerate(zip(mixed_to_numbers, importances)): +# #for i,(token,value) in enumerate(zip(bigrams_to_numbers, importances)): +# if (value != 0): +# features[token] = value +# features = sorted([(value, key) for (key, value) in features.items()], reverse=True) +# for feature in features: +# print('Feature:',feature[1],'Score:',feature[0]) # Predicting diff --git a/Oldfeatures.txt b/Oldfeatures.txt deleted file mode 100644 index b8c93a6..0000000 --- a/Oldfeatures.txt +++ /dev/null @@ -1,74 +0,0 @@ -privacy -people -system -safe -possible -should -experience -safer -right -secure -community -believe -express -we -can -use -cookies -you -data -or -content -with -information -they -learn -more -provide -device -understand -days -collect -who -the -your -is -to be -you should -help keep -provide you -should be -and protect -when people -to enable -and privacy -this right -public interest -what you -have a -you with -for people -use cookies -you can -20/09/2021 14 -more about -help us -the information -information about -we may -and others -products including -not about -these terms -to build -with facebook -who you -advertising and -that help -we collect -share and -not at -services that -share it -it with -are not \ No newline at end of file diff --git a/aux.txt b/aux.txt deleted file mode 100644 index 0e92d07..0000000 --- a/aux.txt +++ /dev/null @@ -1,129 +0,0 @@ -we -cookies -you -the -right -your -data -or -is -information -privacy -they -learn -people -safe -secure -community -possible -should -believe -express -use cookies -things like -for example -cookies to -these terms -service providers -public interest -of your -learn more -law enforcement -how we -can use -by using -▪ internet -िह � -your data -while participating -websites apps -web browser -video calling -united states -tool allows -to prevent -this includes -they even -the same -that a -technical limitations -similar technologies -sign up -share content -safety integrity -prior permission -pixel tags -pay us -pages videos -p r -other 1 -operating system -no longer -n g -messages restricted -local fundraisers -lite watch -like give -legitimate interests -italiano română -ireland ltd -ios 13 -https //about -help center -have about -good-faith belief -globally both -face recognition -express themselves -e l -different devices -covid-19 support -consistent experience -conducting surveys -competent court -camera so -brand resources -best practices -automatically process -assets changes -as described -are performing -apply when -ai ethics -advertisers who -active status -a b -23 june -20/09/2021 14 -//opensource fb -'re visiting - - - -Feature: we Score: 0.01 -Feature: use Score: 0.01 -Feature: cookies Score: 0.01 -Feature: you Score: 0.01 -Feature: data Score: 0.01 -Feature: or Score: 0.02 -Feature: content Score: 0.01 -Feature: with Score: 0.01 -Feature: information Score: 0.01 -Feature: privacy Score: 0.13 -Feature: they Score: 0.01 -Feature: learn Score: 0.01 -Feature: more Score: 0.01 -Feature: provide Score: 0.01 -Feature: people Score: 0.14 -Feature: device Score: 0.01 -Feature: system Score: 0.1 -Feature: safe Score: 0.11 -Feature: secure Score: 0.01 -Feature: understand Score: 0.01 -Feature: who Score: 0.02 -Feature: possible Score: 0.05 -Feature: days Score: 0.01 -Feature: should Score: 0.12 -Feature: experience Score: 0.05 -Feature: collect Score: 0.01 -Feature: safer Score: 0.08 \ No newline at end of file diff --git a/features.txt b/features.txt index 80f5f1c..8c09f14 100644 --- a/features.txt +++ b/features.txt @@ -1,37 +1,25 @@ -combat -you -right -your -is -be privacy -give -people -their -safe secure community -however -possible should -safer -believe +possible express -experience -can -no -connect -they -public -example -cookies -content -and +believe +right +people +your +you we +they the +safe or +is information data +cookies +no +experience use protect with @@ -42,24 +30,23 @@ if device days companies +can automatically also about you should -this right -should be -not at -help keep -and protect -have a to be when people +and protect public interest for people +have a you share to enable +this right provide you it with +help keep are not advertising and you with @@ -78,10 +65,12 @@ share it share and products including post or +not at more about information about help us cookies to and privacy and others -ads to \ No newline at end of file +ads to +should be \ No newline at end of file diff --git a/featureslr0.5nEst100.txt b/featureslr0.5nEst100.txt deleted file mode 100644 index 8c09f14..0000000 --- a/featureslr0.5nEst100.txt +++ /dev/null @@ -1,76 +0,0 @@ -privacy -secure -community -should -possible -express -believe -right -people -your -you -we -they -the -safe -or -is -information -data -cookies -no -experience -use -protect -with -providers -limited -learn -if -device -days -companies -can -automatically -also -about -you should -to be -when people -and protect -public interest -for people -have a -you share -to enable -this right -provide you -it with -help keep -are not -advertising and -you with -with the -you use -you can -with facebook -we may -we collect -we also -the information -the facebook -that help -such as -share it -share and -products including -post or -not at -more about -information about -help us -cookies to -and privacy -and others -ads to -should be \ No newline at end of file diff --git a/featureslr1nEst50.txt b/featureslr1nEst50.txt deleted file mode 100644 index b5946b4..0000000 --- a/featureslr1nEst50.txt +++ /dev/null @@ -1,43 +0,0 @@ -combat -you -right -your -is -be -privacy -give -people -their -safe -secure -community -however -possible -should -safer -believe -express -experience -can -no -connect -they -public -example -cookies -content -and -to be -you should -to enable -this right -should be -not at -help keep -when people -provide you -and protect -you can -have a -you with -with the \ No newline at end of file diff --git a/output/AI Classifier/1Label_confusion_matrix_NonNorm.png b/output/AI Classifier/1Label_confusion_matrix_NonNorm.png index e1b9bba..2e94e14 100644 Binary files a/output/AI Classifier/1Label_confusion_matrix_NonNorm.png and b/output/AI Classifier/1Label_confusion_matrix_NonNorm.png differ diff --git a/output/AI Classifier/1Label_confusion_matrix_NormTrue.png b/output/AI Classifier/1Label_confusion_matrix_NormTrue.png index 4cac149..75c4501 100644 Binary files a/output/AI Classifier/1Label_confusion_matrix_NormTrue.png and b/output/AI Classifier/1Label_confusion_matrix_NormTrue.png differ