Skip to content

Commit

Permalink
The initial version of the Topic Modeling modules (v0.1.0), contributed by Hatma Suryotrisongko, consists of:
Browse files Browse the repository at this point in the history

- maryam/core/util/iris/topic.py
- maryam/modules/iris/topicmodeling.py
  • Loading branch information
[email protected] authored and [email protected] committed Jul 8, 2022
1 parent 5f2779b commit 314a2fb
Show file tree
Hide file tree
Showing 2 changed files with 210 additions and 0 deletions.
173 changes: 173 additions & 0 deletions maryam/core/util/iris/topic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
# core/util/iris/topic.py
# Based on Hatma Suryotrisongko's prototype = https://github.com/keamanansiber/Maryam/blob/master/notebooks/Prototype_4_TopicModeling_0_1_0_CsvFile_Options_StopwordsRemoval_27062022.ipynb

import pandas as pd
import numpy as np
import json
import csv
from dask import dataframe as dd

from sklearn.cluster import KMeans
import scipy
import matplotlib.pyplot as plt
import umap

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS


class main:
    """Topic-modeling engine.

    Loads a text corpus from a CSV or JSON file (lower-cased, with gensim
    stopwords removed) and offers two analyses over sentence-transformer
    embeddings: sklearn KMeans clustering and BERTopic topic modeling.

    Based on Hatma Suryotrisongko's prototype notebook.
    """

    def __init__(self, inputfile, filetype, showcharts, verbose):
        """Read *inputfile* and build ``self.corpus`` (numpy array of str).

        inputfile  -- path to the dataset
        filetype   -- 'csv' (semicolon-separated, text in column 0) or
                      'json' (expects {'results': [{'t': ..., 'd': ...}, ...]})
        showcharts -- plot a histogram of per-document word counts
        verbose    -- print progress/debug information

        Raises ValueError for any other filetype.
        """
        if verbose:
            print("\n\n DATASET = reading file : " + inputfile)

        if filetype == "csv":
            # dask reads the csv out-of-core, then we materialize into pandas
            frame = dd.read_csv(inputfile, sep=';', header=None)
            frame = pd.DataFrame(frame.to_dask_array(lengths=True).compute())

            if verbose:
                print("\n\n csv file (before preprocessing) = ")
                print(frame)

            # column 0 holds the raw text
            self.corpus = frame[0].str.lower().apply(remove_stopwords).to_numpy()

        elif filetype == "json":
            with open(inputfile) as json_file:
                records = json.load(json_file)['results']
            frame = pd.DataFrame(records)

            if verbose:
                print("\n\n json file (before preprocessing) = ")
                print(frame)

            # one document per record: title ('t') + description ('d')
            frame['td'] = frame['t'] + ' ' + frame['d']
            self.corpus = frame['td'].str.lower().apply(remove_stopwords).to_numpy()

        else:
            # The original printed the error and fell through, leaving
            # self.corpus unset and crashing later with AttributeError;
            # fail fast instead.
            print("ERROR, only accept csv or json file!")
            raise ValueError("filetype must be 'csv' or 'json', got %r" % filetype)

        if verbose:
            print("\n\n number of corpus = ")
            print(len(self.corpus))
            print("\n\n self.corpus[0] = ")
            print(self.corpus[0])
            print("\n\n all self.corpus = ")
            print(self.corpus)

        if showcharts:
            print("\n\n histogram of the number of words in each corpus")
            pd.Series([len(doc.split()) for doc in self.corpus]).hist()
            plt.show()

    def run_sklearn_cluster_kmeans(self, selected_pretrained_model, showcharts, verbose):
        """Embed the corpus and cluster it with KMeans.

        selected_pretrained_model -- sentence-transformers model name
            (see https://www.sbert.net/docs/pretrained_models.html)
        showcharts -- plot a 2-D UMAP projection of the clustered points
        verbose    -- print intermediate artifacts

        Prints the labeled corpus and, for each cluster, the document
        closest to the cluster centroid.
        """
        pretrained_model = selected_pretrained_model
        if verbose:
            print("\n\n Model selection")
            print(pretrained_model)

        model = SentenceTransformer(pretrained_model)
        if verbose:
            print(model)

        corpus_embeddings = model.encode(self.corpus)
        if verbose:
            print("\n\n CORPUS EMBEDDING")
            print(corpus_embeddings.shape)
            print(corpus_embeddings)

        # Number of clusters. The original defined K but hard-coded 5 in
        # the KMeans call; use the constant so there is one source of truth.
        K = 5
        kmeans = KMeans(n_clusters=K, random_state=0).fit(corpus_embeddings)
        if verbose:
            print("\n\n Show Cluster using SkLearn KMeans")
            print(kmeans)

        corpus_labeled = pd.DataFrame({'ClusterLabel': kmeans.labels_, 'Sentence': self.corpus})
        print("\n\n corpus_labeled = ")
        print(corpus_labeled)

        cls_dist = pd.Series(kmeans.labels_).value_counts()
        if verbose:
            print("\n\n frequency of cluster label = ")
            print(cls_dist)

        # distance matrix: rows = centroids, columns = documents
        distances = scipy.spatial.distance.cdist(kmeans.cluster_centers_, corpus_embeddings)
        if verbose:
            print("\n\n calculate distance of cluster's center point = ")
            print(distances)

        print("\n\n Cluster's center example = ")

        centers = {}
        print("Cluster", "Size", "Center-idx", "Center-Example", sep="\t\t")
        for i, d in enumerate(distances):
            # index of the document nearest to centroid i
            # (argmin replaces the original's argsort(...)[0])
            ind = int(np.argmin(d))
            centers[i] = ind
            print(i, cls_dist[i], ind, self.corpus[ind], sep="\t\t")

        if showcharts:
            print("\n\n Visualization of the cluster points")

            X = umap.UMAP(n_components=2, min_dist=0.0).fit_transform(corpus_embeddings)
            labels = kmeans.labels_

            plt.subplots(figsize=(12, 8))
            plt.scatter(X[:, 0], X[:, 1], c=labels, s=1, cmap='Paired')
            for c in centers:
                plt.text(X[centers[c], 0], X[centers[c], 1], "CLS-" + str(c), fontsize=24)
            plt.colorbar()
            plt.show()

    def run_topic_modeling_bertopic(self, selected_pretrained_model, verbose):
        """Embed the corpus and fit a BERTopic model.

        selected_pretrained_model -- sentence-transformers model name
        verbose -- print intermediate artifacts

        Prints the first topics from get_topic_info(), the labeled corpus,
        and the keyword list for every discovered topic.
        """
        pretrained_model = selected_pretrained_model
        if verbose:
            print("\n\n Model selection")
            print(pretrained_model)

        model = SentenceTransformer(pretrained_model)
        if verbose:
            print(model)

        corpus_embeddings = model.encode(self.corpus)
        if verbose:
            print("\n\n CORPUS EMBEDDING")
            print(corpus_embeddings.shape)
            print(corpus_embeddings)

        print("\n\n Topic Modeling with BERTopic")

        # Reuse the already-loaded model; the original instantiated a second,
        # identical SentenceTransformer here (slow and memory-hungry).
        if verbose:
            print(model)

        topic_model = BERTopic(embedding_model=model)
        if verbose:
            print(topic_model)

        topics, _ = topic_model.fit_transform(self.corpus)
        print(topic_model.get_topic_info()[:6])

        corpus_labeled = pd.DataFrame({'ClusterLabel': topics, 'Sentence': self.corpus})
        if verbose:
            print("\n\n corpus_labeled = ")
            print(corpus_labeled)

        print("\n\n topics for each cluster = ")

        for i in range(len(topic_model.get_topic_info())):
            print("Cluster #" + str(i) + " = ")
            print(topic_model.get_topic(i))

37 changes: 37 additions & 0 deletions maryam/modules/iris/topicmodeling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# TESTED USING =
# topicmodeling -i mixed.json -t json -m all-distilroberta-v1
# topicmodeling -i mixed.json -t json -s -v -m all-distilroberta-v1
# topicmodeling -i mixed.json -t json -s -m all-distilroberta-v1
# topicmodeling -i mixed.json -t json -v -m all-distilroberta-v1
# topicmodeling -i testdataset.csv -t csv -m all-mpnet-base-v2
# topicmodeling -i testdataset.csv -t csv -s -v -m all-mpnet-base-v2
# topicmodeling -i testdataset.csv -t csv -s -m all-mpnet-base-v2
# topicmodeling -i testdataset.csv -t csv -v -m all-mpnet-base-v2
# Note: download the dataset for testing from https://github.com/keamanansiber/Maryam/tree/master/notebooks


# Module metadata consumed by the Maryam framework.
meta = {
    'name': 'Topic Modeling',
    'author': 'Hatma Suryotrisongko',
    'version': '0.1.0',
    'description': 'Topic Modeling Algorithms.',
    # Option tuples: (name, default, required, description, flag, action, type)
    'options': (
        ('inputfile', None, True, 'Input file that contains the data', '-i', 'store', str),
        ('filetype', None, True, 'File type: csv/json', '-t', 'store', str),
        ('showcharts', None, False, 'Show charts?', '-s', 'store_true', bool),
        ('verbose', None, False, 'Verbose output?', '-v', 'store_true', bool),
        ('pretrained_model', None, True, 'model for embedding', '-m', 'store', str),
    ),
    # '-s' and '-v' are store_true flags and take no argument. The original
    # example passed '-s True -v False', which a store_true action rejects;
    # this matches the tested invocations in the header comment.
    'examples': ('topicmodeling -i mixed.json -t json -s -v -m all-mpnet-base-v2')
}


def module_api(self):
    """Build the topic-modeling engine from the module options and run
    both analyses (KMeans clustering, then BERTopic) over the input file."""
    opts = self.options
    engine = self.topic(opts['inputfile'], opts['filetype'],
                        opts['showcharts'], opts['verbose'])
    engine.run_sklearn_cluster_kmeans(opts['pretrained_model'],
                                      opts['showcharts'], opts['verbose'])
    engine.run_topic_modeling_bertopic(opts['pretrained_model'], opts['verbose'])


def module_run(self):
    """Framework entry point for interactive runs — delegates to module_api."""
    module_api(self)

0 comments on commit 314a2fb

Please sign in to comment.