From 8601b77f61446ff7a92d4d33e39b8ccf6e9a51f3 Mon Sep 17 00:00:00 2001
From: AFThielmann
Date: Fri, 16 Aug 2024 13:46:16 +0200
Subject: [PATCH 1/6] adding nmf version

---
 stream_topic/models/nmf.py | 325 +++++++++++++++++++++++++++++++++++++
 1 file changed, 325 insertions(+)
 create mode 100644 stream_topic/models/nmf.py

diff --git a/stream_topic/models/nmf.py b/stream_topic/models/nmf.py
new file mode 100644
index 0000000000..078d567298
--- /dev/null
+++ b/stream_topic/models/nmf.py
@@ -0,0 +1,325 @@
+import numpy as np
+import pandas as pd
+from datetime import datetime
+from sklearn.decomposition import NMF
+from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
+from scipy.sparse import csr_matrix
+from loguru import logger
+from ..preprocessor import c_tf_idf, extract_tfidf_topics
+from ..utils.dataset import TMDataset
+from .abstract_helper_models.base import BaseModel, TrainingStatus
+
+MODEL_NAME = "NMFTM"
+time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+
+
+class NMFTM(BaseModel):
+    """
+    A topic modeling class that uses Non-negative Matrix Factorization (NMF) to cluster text data into topics.
+
+    This class inherits from the BaseModel class and utilizes TF-IDF or Bag-of-Words for vectorization and NMF for dimensionality reduction and clustering.
+
+    Parameters
+    ----------
+    max_features : int
+        Maximum number of features used for vectorization.
+    nmf_args : dict
+        Arguments for NMF clustering.
+    use_tfidf : bool
+        If True, use TF-IDF vectorization; if False, use Bag-of-Words.
+    tfidf_args : dict
+        Arguments for TF-IDF vectorization.
+    random_state : int, optional
+        Random state for reproducibility.
+    """
+
+    def __init__(
+        self,
+        max_features: int = 5000,
+        nmf_args: dict = None,
+        use_tfidf: bool = True,
+        tfidf_args: dict = None,
+        random_state: int = None,
+        **kwargs,
+    ):
+        """
+        Initialize the NMF model.
+
+        Parameters
+        ----------
+        max_features : int, optional
+            Maximum number of features used for vectorization, by default 5000.
+        nmf_args : dict, optional
+            Arguments for NMF clustering, by default None.
+        use_tfidf : bool, optional
+            If True, use TF-IDF; otherwise, use Bag-of-Words, by default True.
+        tfidf_args : dict, optional
+            Arguments for TF-IDF vectorization, by default None.
+        random_state : int, optional
+            Random state for reproducibility, by default None.
+        **kwargs
+            Additional keyword arguments passed to the superclass.
+        """
+        super().__init__(use_pretrained_embeddings=False, **kwargs)
+        self.save_hyperparameters(ignore=["random_state"])
+
+        self.hparams = {
+            "max_features": max_features,
+            "nmf_args": nmf_args or {},
+            "tfidf_args": tfidf_args
+            or {
+                "max_df": 0.95,
+                "min_df": 2,
+                "max_features": max_features,
+            },
+        }
+
+        if random_state is not None:
+            self.hparams["nmf_args"]["random_state"] = random_state
+
+        # Choose vectorizer based on the user's preference
+        if use_tfidf:
+            self.vectorizer = TfidfVectorizer(**self.hparams["tfidf_args"])
+        else:
+            self.vectorizer = CountVectorizer(max_features=max_features)
+
+        self._status = TrainingStatus.NOT_STARTED
+        self.nmf_model = None
+        self.use_tfidf = use_tfidf
+
+    def get_info(self):
+        """
+        Get information about the model.
+
+        Returns
+        -------
+        dict
+            Dictionary containing model information, including the model name,
+            vectorizer type, vectorization and clustering arguments, and
+            training status.
+        """
+        info = {
+            "model_name": MODEL_NAME,
+            "nmf_args": self.hparams["nmf_args"],
+            "vectorizer": "TF-IDF" if self.use_tfidf else "Bag-of-Words",
+            "tfidf_args": self.hparams["tfidf_args"],
+            "trained_status": self._status.name,
+        }
+        return info
+
+    def _clustering(self, matrix):
+        """
+        Applies NMF clustering to the matrix.
+
+        Parameters
+        ----------
+        matrix : sparse matrix
+            The matrix to apply NMF to.
+
+        Raises
+        ------
+        RuntimeError
+            If an error occurs during clustering.
+        """
+        try:
+            logger.info("--- Applying NMF clustering ---")
+            self.nmf_model = NMF(
+                n_components=self.n_topics,
+                **self.hparams["nmf_args"],
+            )
+
+            W = self.nmf_model.fit_transform(matrix)  # Document-topic matrix (Theta)
+            H = self.nmf_model.components_  # Topic-term matrix (Beta)
+
+            # Assigning attributes
+            self.labels = np.argmax(W, axis=1)
+            self.theta = W
+            self.beta = H
+
+        except Exception as e:
+            raise RuntimeError(f"Error in clustering: {e}") from e
+
+    def fit(self, dataset: TMDataset, n_topics: int = 20):
+        """
+        Trains the NMF topic model on the provided dataset.
+
+        Parameters
+        ----------
+        dataset : TMDataset
+            The dataset to train the model on.
+        n_topics : int, optional
+            Number of topics to extract, by default 20.
+
+        Raises
+        ------
+        RuntimeError
+            If the training fails due to an error.
+        """
+        assert isinstance(
+            dataset, TMDataset
+        ), "The dataset must be an instance of TMDataset."
+
+        self.n_topics = n_topics
+        self.dataset = dataset
+
+        self._status = TrainingStatus.RUNNING
+        try:
+            logger.info(f"--- Training {MODEL_NAME} topic model ---")
+            matrix = self.vectorizer.fit_transform(self.dataset.texts)
+            self._clustering(matrix)
+
+            # Prepare data for visualization
+            topic_data = pd.DataFrame(columns=["predictions", "text"])
+            for i in range(self.nmf_model.n_components_):
+                topic_texts = [
+                    self.dataset.texts[j]
+                    for j, z in enumerate(self.theta[:, i])
+                    if z > 0.1
+                ]
+                if not topic_texts:
+                    continue
+                aggregated_texts = " ".join(topic_texts)
+                new_row = pd.DataFrame({"predictions": [i], "text": [aggregated_texts]})
+                topic_data = pd.concat([topic_data, new_row], ignore_index=True)
+
+            if topic_data.empty:
+                raise RuntimeError("No topics were extracted, model training failed.")
+
+            tfidf, count = c_tf_idf(
+                topic_data["text"].tolist(), len(self.dataset.texts)
+            )
+            self.topic_dict = extract_tfidf_topics(tfidf, count, topic_data)
+
+        except Exception as e:
+            logger.error(f"Error in training: {e}")
+            self._status = TrainingStatus.FAILED
+            raise
+        except KeyboardInterrupt:
+            logger.error("Training interrupted.")
+            self._status = TrainingStatus.INTERRUPTED
+            raise
+
+        logger.info("--- Training completed successfully. ---")
+        self._status = TrainingStatus.SUCCEEDED
+
+    def predict(self, texts):
+        """
+        Predict topics for new documents based on their text.
+
+        Parameters
+        ----------
+        texts : list of str
+            List of texts to predict topics for.
+
+        Returns
+        -------
+        ndarray of int
+            Array of predicted topic labels, one per input text.
+        """
+        if self._status != TrainingStatus.SUCCEEDED:
+            raise RuntimeError("Model has not been trained yet or failed.")
+        matrix = self.vectorizer.transform(texts)
+        W = self.nmf_model.transform(matrix)
+        return np.argmax(W, axis=1)
+
+    def optimize_and_fit(
+        self,
+        dataset,
+        min_topics=2,
+        max_topics=20,
+        criterion="recon",
+        n_trials=100,
+        custom_metric=None,
+    ):
+        """
+        Optimize the number of topics and fit the model by delegating to the
+        parent class's optimize_hyperparameters method.
+
+        Parameters
+        ----------
+        dataset : TMDataset
+            The dataset to train the model on.
+        min_topics : int, optional
+            Minimum number of topics to evaluate, by default 2.
+        max_topics : int, optional
+            Maximum number of topics to evaluate, by default 20.
+        criterion : str, optional
+            Criterion to use for optimization ('aic', 'bic', 'recon', or
+            'custom'), by default 'recon'.
+        n_trials : int, optional
+            Number of trials for optimization, by default 100.
+        custom_metric : object, optional
+            Custom metric object with a `score` method for evaluation, by default None.
+
+        Returns
+        -------
+        dict
+            Dictionary containing the best parameters and the optimal number of topics.
+        """
+        best_params = super().optimize_hyperparameters(
+            dataset=dataset,
+            min_topics=min_topics,
+            max_topics=max_topics,
+            criterion=criterion,
+            n_trials=n_trials,
+            custom_metric=custom_metric,
+        )
+
+        return best_params
+
+    def reconstruction_loss(self):
+        """
+        Calculate the reconstruction loss (Frobenius norm) for the NMF model.
+
+        Returns
+        -------
+        float
+            Reconstruction loss (Frobenius norm) of the NMF model.
+        """
+        # Ensure the NMF model has been trained
+        if self.nmf_model is None:
+            raise ValueError("NMF model has not been trained yet.")
+
+        # Original document-term matrix, densified so the difference below is
+        # a plain ndarray rather than a sparse/np.matrix hybrid
+        original_matrix = self.vectorizer.transform(self.dataset.texts).toarray()
+
+        # Reconstruct the matrix using W and H
+        reconstructed_matrix = np.dot(self.theta, self.beta)
+
+        # Calculate the Frobenius norm of the difference
+        reconstruction_loss = np.linalg.norm(
+            original_matrix - reconstructed_matrix, "fro"
+        )
+
+        return reconstruction_loss
+
+    def suggest_hyperparameters(self, trial):
+        """
+        Suggest hyperparameters for the NMF model.
+
+        Parameters
+        ----------
+        trial : optuna.trial.Trial
+            A single trial of the optuna optimization process.
+
+        Returns
+        -------
+        None
+        """
+        # Suggest NMF parameters
+        # self.hparams["nmf_args"]["alpha_W"] = trial.suggest_float("alpha_W", 0.0, 0.1)
+        # self.hparams["nmf_args"]["alpha_H"] = trial.suggest_float("alpha_H", 0.0, 0.1)
+        self.hparams["nmf_args"]["l1_ratio"] = trial.suggest_float("l1_ratio", 0.0, 1.0)
+        self.hparams["nmf_args"]["init"] = trial.suggest_categorical(
+            "init", ["random", "nndsvd", "nndsvda", "nndsvdar"]
+        )
+        self.hparams["nmf_args"]["max_iter"] = trial.suggest_int("max_iter", 200, 1000)
+        # self.hparams["nmf_args"]["beta_loss"] = trial.suggest_categorical(
+        #     "beta_loss", ["frobenius", "kullback-leibler", "itakura-saito"]
+        # )
+        self.hparams["nmf_args"]["solver"] = trial.suggest_categorical(
+            "solver", ["cd", "mu"]
+        )
+
+        # if self.hparams["nmf_args"]["beta_loss"] in [
+        #     "kullback-leibler",
+        #     "itakura-saito",
+        # ]:
+        #     self.hparams["nmf_args"]["solver"] = "mu"
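
Reviewer note: a minimal usage sketch for the NMFTM class added above. It is
illustrative only and not part of the diff; the TMDataset setup and the
"20NewsGroup" fetch are assumptions based on the existing stream_topic
utilities, not something this patch introduces.

    from stream_topic.models.nmf import NMFTM
    from stream_topic.utils.dataset import TMDataset

    # Assumed helper: load an example corpus into a TMDataset.
    dataset = TMDataset()
    dataset.fetch_dataset("20NewsGroup")

    # TF-IDF vectorization (default); fixed seed for reproducible NMF init.
    model = NMFTM(max_features=5000, use_tfidf=True, random_state=42)
    model.fit(dataset, n_topics=20)

    # Hard topic assignments for unseen documents via argmax over theta.
    labels = model.predict(["the shuttle launch was delayed by weather"])
    print(model.get_info())
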
From f631940214ebe06dae095e9e1b168f2d77dc1c0a Mon Sep 17 00:00:00 2001
From: AFThielmann
Date: Fri, 16 Aug 2024 13:46:27 +0200
Subject: [PATCH 2/6] adding nmf to __init__

---
 stream_topic/models/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/stream_topic/models/__init__.py b/stream_topic/models/__init__.py
index c6b1bc7ef2..cc3f27e581 100644
--- a/stream_topic/models/__init__.py
+++ b/stream_topic/models/__init__.py
@@ -12,6 +12,7 @@
 from .neurallda import NeuralLDA
 from .ctmneg import CTMNeg
 from .tntm import TNTM
+from .nmf import NMFTM
 
 __all__ = [
     "BERTopicTM",
@@ -28,4 +29,5 @@
     "NeuralLDA",
     "CTMNeg",
     "TNTM",
+    "NMFTM",
 ]
From 562e0548a7159fcaa2334fb94a4c1eb7ee85b9c4 Mon Sep 17 00:00:00 2001
From: AFThielmann
Date: Fri, 16 Aug 2024 13:46:37 +0200
Subject: [PATCH 3/6] adjusting base hpo for reconstruction loss

---
 stream_topic/models/abstract_helper_models/base.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/stream_topic/models/abstract_helper_models/base.py b/stream_topic/models/abstract_helper_models/base.py
index 65ebacc92e..09f492abca 100644
--- a/stream_topic/models/abstract_helper_models/base.py
+++ b/stream_topic/models/abstract_helper_models/base.py
@@ -343,8 +343,9 @@ def optimize_hyperparameters(
         assert criterion in [
             "aic",
             "bic",
+            "recon",
             "custom",
-        ], "Criterion must be either 'aic', 'bic', or 'custom'."
+        ], "Criterion must be either 'aic', 'bic', 'recon' or 'custom'."
         if criterion == "custom":
             assert (
                 custom_metric is not None
@@ -363,12 +364,14 @@ def objective(trial):
 
             self.fit(dataset)
 
             # Calculate the score based on the criterion
-            if criterion in ["aic", "bic"]:
+            if criterion in ["aic", "bic", "recon"]:
                 if criterion == "aic":
                     score = self.calculate_aic(n_topics=self.hparams["n_topics"])
-                else:
+                elif criterion == "bic":
                     score = self.calculate_bic(n_topics=self.hparams["n_topics"])
+                elif criterion == "recon":
+                    score = self.reconstruction_loss()
             else:
                 # Compute the custom metric score
                 topics = self.get_topics()
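
Reviewer note: a hedged sketch of how the new "recon" criterion is meant to
be driven from NMFTM.optimize_and_fit, continuing the model and dataset from
the sketch after PATCH 1/6. Illustrative only; the call path is the one
wired up by the two patches above.

    # Each Optuna trial suggests NMF hyperparameters via
    # NMFTM.suggest_hyperparameters, fits the model, and scores it with
    # NMFTM.reconstruction_loss(), i.e. the Frobenius norm ||X - W H||_F.
    best_params = model.optimize_and_fit(
        dataset,
        min_topics=5,
        max_topics=30,
        criterion="recon",
        n_trials=20,
    )
    print(best_params)
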
From 98f6215916108c07d1e1d869e2e9d511dbf21701 Mon Sep 17 00:00:00 2001
From: AFThielmann
Date: Fri, 16 Aug 2024 14:01:25 +0200
Subject: [PATCH 4/6] adjusting docs

---
 docs/api/models/models.rst | 14 +++++++++-----
 docs/landingpage.md        |  4 ++++
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/docs/api/models/models.rst b/docs/api/models/models.rst
index b805adb7d2..530848ce37 100644
--- a/docs/api/models/models.rst
+++ b/docs/api/models/models.rst
@@ -2,11 +2,6 @@
 Models
 ========
 
-.. autoclass:: stream_topic.models.BERTopicTM
-   :members:
-
-.. autoclass:: stream_topic.models.CBC
-   :members:
 
 .. autoclass:: stream_topic.models.CEDC
    :members:
@@ -40,3 +35,12 @@ Models
 
 .. autoclass:: stream_topic.models.CTMNeg
    :members:
+
+.. autoclass:: stream_topic.models.NMFTM
+   :members:
+
+.. autoclass:: stream_topic.models.BERTopicTM
+   :members:
+
+.. autoclass:: stream_topic.models.CBC
+   :members:
diff --git a/docs/landingpage.md b/docs/landingpage.md
index 2eed225e2c..27c8689b11 100644
--- a/docs/landingpage.md
+++ b/docs/landingpage.md
@@ -51,6 +51,10 @@ STREAM offers a variety of neural as well as non-neural topic models and we are
         <td>LDA</td>
         <td>Latent Dirichlet Allocation</td>
     </tr>
+    <tr>
+        <td>NMF</td>
+        <td>Non-negative Matrix Factorization</td>
+    </tr>
     <tr>
         <td>WordCluTM</td>
         <td>Tired of topic models?</td>

From df192219ecbde6383ab9bc477a3030d96ccf0b1b Mon Sep 17 00:00:00 2001
From: AFThielmann
Date: Fri, 16 Aug 2024 14:01:43 +0200
Subject: [PATCH 5/6] including NMF in readme

---
 README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README.md b/README.md
index 1540d88751..bbac07c099 100644
--- a/README.md
+++ b/README.md
@@ -102,6 +102,10 @@ STREAM offers a variety of neural as well as non-neural topic models and we are
         <td>LDA</td>
         <td>Latent Dirichlet Allocation</td>
     </tr>
+    <tr>
+        <td>NMF</td>
+        <td>Non-negative Matrix Factorization</td>
+    </tr>
     <tr>
         <td>WordCluTM</td>
         <td>Tired of topic models?</td>

From fe943341f6de6cd7c6527dcf65059043b5d35fbd Mon Sep 17 00:00:00 2001
From: AFThielmann
Date: Fri, 16 Aug 2024 14:02:48 +0200
Subject: [PATCH 6/6] updating version number for new release

---
 stream_topic/__version__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stream_topic/__version__.py b/stream_topic/__version__.py
index a99a72e1f6..7408bb09b6 100644
--- a/stream_topic/__version__.py
+++ b/stream_topic/__version__.py
@@ -1,4 +1,4 @@
 """Version information."""
 
 # The following line *must* be the last in the module, exactly as formatted:
-__version__ = "0.1.5"
+__version__ = "0.1.6"
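
Reviewer note: a self-contained toy check, independent of stream_topic, of
the quantity the "recon" criterion from PATCH 3/6 scores. It mirrors what
NMFTM.reconstruction_loss computes using scikit-learn's NMF directly; the
corpus and hyperparameters here are arbitrary placeholders.

    import numpy as np
    from sklearn.decomposition import NMF
    from sklearn.feature_extraction.text import TfidfVectorizer

    docs = [
        "apples and oranges are fruit",
        "oranges and bananas are sweet",
        "cars and trucks need fuel",
        "trucks and engines burn fuel",
    ]
    X = TfidfVectorizer().fit_transform(docs)

    nmf = NMF(n_components=2, init="nndsvd", max_iter=500)
    W = nmf.fit_transform(X)   # document-topic weights (theta)
    H = nmf.components_        # topic-term weights (beta)

    # Frobenius norm of the reconstruction error, ||X - W H||_F,
    # as in NMFTM.reconstruction_loss.
    loss = np.linalg.norm(X.toarray() - W @ H, "fro")
    print(f"reconstruction loss: {loss:.4f}")
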