diff --git a/.gitignore b/.gitignore
index ded9b01736..0ba0568bed 100644
--- a/.gitignore
+++ b/.gitignore
@@ -183,4 +183,11 @@ post-checkout
 post-commit
 post-merge
 pre-push
-docs/notebooks/lightning_logs/*
\ No newline at end of file
+docs/notebooks/lightning_logs/*
+docs/notebooks/lightning_logs
+docs/notebooks/data
+docs/notebooks/data/*
+docs/notebooks/embeddings
+docs/notebooks/embeddings/*
+docs/notebooks/checkpoints
+docs/notebooks/checkpoints/*
\ No newline at end of file
diff --git a/docs/notebooks/datasets.ipynb b/docs/notebooks/datasets.ipynb
index b4af4de829..764e080c66 100644
--- a/docs/notebooks/datasets.ipynb
+++ b/docs/notebooks/datasets.ipynb
@@ -4,8 +4,8 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AnFreTh/STREAM/blob/develop/docs/notebooks/datasets.ipynb)\n",
-    "[![Open On GitHub](https://img.shields.io/badge/Open-on%20GitHub-blue?logo=GitHub)](https://github.com/AnFreTh/STREAM/blob/develop/docs/notebooks/datasets.ipynb)\n",
+    "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AnFreTh/STREAM/blob/main/docs/notebooks/datasets.ipynb)\n",
+    "[![Open On GitHub](https://img.shields.io/badge/Open-on%20GitHub-blue?logo=GitHub)](https://github.com/AnFreTh/STREAM/blob/main/docs/notebooks/datasets.ipynb)\n",
     "\n",
     "# Datasets"
    ]
@@ -33,13 +33,25 @@
    "cell_type": "code",
    "execution_count": 1,
    "metadata": {},
+   "outputs": [],
+   "source": [
+    "# uncomment the below line if running in Colab\n",
+    "# package needs to be installed for the notebook to run\n",
+    "\n",
+    "# ! pip install -U stream_topic"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/opt/homebrew/Caskroom/miniforge/base/envs/db/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
-      "  warnings.warn(\n"
+      "/opt/homebrew/Caskroom/miniforge/base/envs/topicm/lib/python3.10/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n",
+      "  from tqdm.autonotebook import tqdm, trange\n"
      ]
     }
    ],
@@ -60,37 +72,6 @@
     "- these datasets are included in the package and can be loaded using the `TMDataset` module"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['Stocktwits_GME_large',\n",
-       " 'BBC_News',\n",
-       " 'Stocktwits_GME',\n",
-       " 'Reddit_GME',\n",
-       " 'Reuters',\n",
-       " 'Spotify',\n",
-       " '20NewsGroups',\n",
-       " 'DummyDataset',\n",
-       " 'Spotify_most_popular',\n",
-       " 'Poliblogs',\n",
-       " 'Spotify_least_popular']"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "dataset = TMDataset()\n",
-    "dataset.get_dataset_list()"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 3,
@@ -100,12 +81,16 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "\u001b[32m2024-08-07 10:31:30.489\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m159\u001b[0m - \u001b[1mFetching dataset: Reuters\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:31:31.978\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m165\u001b[0m - \u001b[1mDataset loaded successfully from /opt/homebrew/Caskroom/miniforge/base/envs/db/lib/python3.10/site-packages/stream_topic/preprocessed_datasets/Reuters\u001b[0m\n"
+      "\u001b[32m2024-08-09 12:13:26.847\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m118\u001b[0m - \u001b[1mFetching dataset: Reuters\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:26.914\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m331\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:27.147\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:27.313\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:27.456\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m363\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/\u001b[0m\n"
      ]
     }
    ],
    "source": [
+    "dataset = TMDataset()\n",
     "dataset.fetch_dataset(name=\"Reuters\")"
    ]
   },
@@ -181,10 +166,13 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "\u001b[32m2024-08-07 10:31:33.085\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m151\u001b[0m - \u001b[1mDataset name already provided while instantiating the class: Reuters\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:31:33.086\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1mOverwriting the dataset name with the provided name in fetch_dataset: Spotify\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:31:33.086\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m156\u001b[0m - \u001b[1mFetching dataset: Spotify\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:31:33.190\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m165\u001b[0m - \u001b[1mDataset loaded successfully from /opt/homebrew/Caskroom/miniforge/base/envs/db/lib/python3.10/site-packages/stream_topic/preprocessed_datasets/Spotify\u001b[0m\n"
+      "\u001b[32m2024-08-09 12:13:28.464\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m108\u001b[0m - \u001b[1mDataset name already provided while instantiating the class: Reuters\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:28.464\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m111\u001b[0m - \u001b[1mOverwriting the dataset name with the name provided in fetch_dataset: Spotify\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:28.465\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m115\u001b[0m - \u001b[1mFetching dataset: Spotify\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:28.539\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m331\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:28.749\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:28.923\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:29.058\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m363\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/\u001b[0m\n"
      ]
     }
    ],
@@ -451,7 +439,7 @@
     {
      "data": {
       "text/plain": [
-       "[75, 58, 37, 45, 41]"
+       "[75, 58]"
       ]
      },
      "execution_count": 11,
@@ -460,7 +448,7 @@
     }
    ],
    "source": [
-    "dataset.labels[:5]"
+    "dataset.labels[:2]"
    ]
   },
   {
@@ -475,18 +463,6 @@
    "execution_count": 12,
    "metadata": {},
    "outputs": [],
-   "source": [
-    "from stream_topic.utils import TMDataset\n",
-    "\n",
-    "import warnings\n",
-    "warnings.filterwarnings(\"ignore\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "metadata": {},
-   "outputs": [],
    "source": [
     "import pandas as pd\n",
     "import numpy as np\n",
@@ -508,19 +484,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Preprocessing documents: 100%|██████████| 1000/1000 [00:03<00:00, 267.71it/s]\n",
-      "\u001b[32m2024-08-07 10:31:37.027\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m409\u001b[0m - \u001b[1mDataset save directory does not exist: data/\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:31:37.027\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m410\u001b[0m - \u001b[1mCreating directory: data/\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:31:37.031\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m415\u001b[0m - \u001b[1mDataset saved to data/sample_data.parquet\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:31:37.032\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m430\u001b[0m - \u001b[1mDataset info saved to data/sample_data_info.pkl\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:31:37.032\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m433\u001b[0m - \u001b[1mDataset name appended to avaliable datasets list: ['Stocktwits_GME_large', 'BBC_News', 'Stocktwits_GME', 'Reddit_GME', 'Reuters', 'Spotify', '20NewsGroups', 'DummyDataset', 'Spotify_most_popular', 'Poliblogs', 'Spotify_least_popular', 'sample_data']\u001b[0m\n"
+      "Preprocessing documents: 100%|██████████| 1000/1000 [00:03<00:00, 263.32it/s]\n",
+      "\u001b[32m2024-08-09 12:13:32.967\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m237\u001b[0m - \u001b[1mDataset saved to data/sample_data.parquet\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:32.968\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m252\u001b[0m - \u001b[1mDataset info saved to data/sample_data_info.pkl\u001b[0m\n"
      ]
     }
    ],
@@ -537,27 +510,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "\u001b[32m2024-08-07 10:31:37.036\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m159\u001b[0m - \u001b[1mFetching dataset: sample_data\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:31:37.045\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m165\u001b[0m - \u001b[1mDataset loaded successfully from data/\u001b[0m\n"
+      "\u001b[32m2024-08-09 12:13:32.972\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m118\u001b[0m - \u001b[1mFetching dataset: sample_data\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:32.973\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m128\u001b[0m - \u001b[1mFetching dataset from local path\u001b[0m\n"
      ]
     }
    ],
    "source": [
     "# the new data is saved in the data folder unlike the default datasets which are saved in package directory under preprocessed_data folder.\n",
     "# therefore, you need to provide the path to the data folder to fetch the dataset\n",
-    "dataset.fetch_dataset(name=\"sample_data\", dataset_path=\"data/\")"
+    "dataset.fetch_dataset(name=\"sample_data\", dataset_path=\"data/\", source=\"local\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [
     {
@@ -630,7 +603,7 @@
        "4  BGHXO      3  [BGHXO]"
       ]
      },
-     "execution_count": 16,
+     "execution_count": 15,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -639,6 +612,13 @@
     "dataset.dataframe.head()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -663,7 +643,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.0"
+   "version": "3.10.14"
   }
  },
  "nbformat": 4,
diff --git a/docs/notebooks/datasets.md b/docs/notebooks/datasets.md
deleted file mode 100644
index 32eb43deae..0000000000
--- a/docs/notebooks/datasets.md
+++ /dev/null
@@ -1,139 +0,0 @@
----
-jupytext:
-  text_representation:
-    extension: .md
-    format_name: myst
-    format_version: 0.13
-    jupytext_version: 1.16.4
-kernelspec:
-  display_name: topicm
-  language: python
-  name: python3
----
-
-[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AnFreTh/STREAM/blob/develop/docs/notebooks/datasets.ipynb)
-[![Open On GitHub](https://img.shields.io/badge/Open-on%20GitHub-blue?logo=GitHub)](https://github.com/AnFreTh/STREAM/blob/develop/docs/notebooks/datasets.ipynb)
-
-# Datasets
-
-+++
-
-The dataset module provides and easy way to load and preprocess the datasets. The package comes with a few datasets that are commonly used in topic modleing research. The datasets are:
-
-    - 20NewsGroup
-    - BBC_News
-    - Stocktwits_GME
-    - Reddit_GME'
-    - Reuters'
-    - Spotify
-    - Spotify_most_popular
-    - Poliblogs
-    - Spotify_least_popular
-
-Please see the functionalities availabe in the `TMDataset` module.
-
-```{code-cell} ipython3
-from stream_topic.utils import TMDataset
-
-import warnings
-warnings.filterwarnings("ignore")
-```
-
-## Using default datasets
-
-- these datasets are already preprocessed and ready to be used for topic modeling
-- these datasets are included in the package and can be loaded using the `TMDataset` module
-
-```{code-cell} ipython3
-dataset = TMDataset()
-dataset.get_dataset_list()
-```
-
-```{code-cell} ipython3
-dataset.fetch_dataset(name="Reuters")
-```
-
-```{code-cell} ipython3
-dataset.get_bow()
-```
-
-```{code-cell} ipython3
-dataset.get_tfidf()
-```
-
-```{code-cell} ipython3
-# dataset.get_word_embeddings()
-```
-
-```{code-cell} ipython3
-dataset.fetch_dataset('Spotify')
-```
-
-```{code-cell} ipython3
-dataset.dataframe.head()
-```
-
-```{code-cell} ipython3
-dataset.texts[:2]
-```
-
-```{code-cell} ipython3
-dataset.tokens
-```
-
-```{code-cell} ipython3
-dataset.labels[:5]
-```
-
-## Loading own dataset
-
-```{code-cell} ipython3
-from stream_topic.utils import TMDataset
-
-import warnings
-warnings.filterwarnings("ignore")
-```
-
-```{code-cell} ipython3
-import pandas as pd
-import numpy as np
-
-
-# Simulating some example data
-np.random.seed(0)
-
-# Generate 1000 random strings of lengths between 1 and 5, containing letters 'A' to 'Z'
-random_documents = [''.join(np.random.choice(list('ABCDEFGHIJKLMNOPQRSTUVWXYZ'), 
-                                             np.random.randint(1, 6))) for _ in range(1000)]
-
-# Generate 1000 random labels from 1 to 4 as strings
-random_labels = np.random.choice(['1', '2', '3', '4'], 1000)
-
-# Create DataFrame
-my_data = pd.DataFrame({"Documents": random_documents, "Labels": random_labels})
-```
-
-```{code-cell} ipython3
-dataset = TMDataset()
-dataset.create_load_save_dataset(
-    data=my_data, 
-    dataset_name="sample_data",
-    save_dir="data/",
-    doc_column="Documents",
-    label_column="Labels"
-    )
-```
-
-```{code-cell} ipython3
-# the new data is saved in the data folder unlike the default datasets which are saved in package directory under preprocessed_data folder.
-# therefore, you need to provide the path to the data folder to fetch the dataset
-dataset.fetch_dataset(name="sample_data", dataset_path="data/")
-```
-
-```{code-cell} ipython3
-dataset.dataframe.head()
-```
-
-```{code-cell} ipython3
-
-```
diff --git a/docs/notebooks/examples.ipynb b/docs/notebooks/examples.ipynb
index 04486e068d..254270423b 100644
--- a/docs/notebooks/examples.ipynb
+++ b/docs/notebooks/examples.ipynb
@@ -4,17 +4,38 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AnFreTh/STREAM/blob/develop/docs/notebooks/examples.ipynb)\n",
-    "[![Open On GitHub](https://img.shields.io/badge/Open-on%20GitHub-blue?logo=GitHub)](https://github.com/AnFreTh/STREAM/blob/develop/docs/notebooks/examples.ipynb)\n",
+    "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AnFreTh/STREAM/blob/main/docs/notebooks/examples.ipynb)\n",
+    "[![Open On GitHub](https://img.shields.io/badge/Open-on%20GitHub-blue?logo=GitHub)](https://github.com/AnFreTh/STREAM/blob/main/docs/notebooks/examples.ipynb)\n",
     "\n",
     "# Examples"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
+   "source": [
+    "# uncomment the below line if running in Colab\n",
+    "# package needs to be installed for the notebook to run\n",
+    "\n",
+    "# ! pip install -U stream_topic"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/opt/homebrew/Caskroom/miniforge/base/envs/topicm/lib/python3.10/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n",
+      "  from tqdm.autonotebook import tqdm, trange\n"
+     ]
+    }
+   ],
    "source": [
     "from stream_topic.models import KmeansTM\n",
     "from stream_topic.utils import TMDataset"
@@ -29,16 +50,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "\u001b[32m2024-08-07 10:42:28.407\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m162\u001b[0m - \u001b[1mFetching dataset: BBC_News\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:42:28.494\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m168\u001b[0m - \u001b[1mDataset loaded successfully from c:\\users\\anton\\desktop\\projects\\stream\\stream_topic\\preprocessed_datasets\\BBC_News\u001b[0m\n",
-      "Preprocessing documents: 100%|██████████| 2225/2225 [00:29<00:00, 75.02it/s] \n"
+      "\u001b[32m2024-08-09 12:13:58.725\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m118\u001b[0m - \u001b[1mFetching dataset: BBC_News\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:58.815\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m331\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:59.016\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:59.135\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:59.267\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m363\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/\u001b[0m\n",
+      "Preprocessing documents: 100%|██████████| 2225/2225 [00:11<00:00, 198.75it/s]\n"
      ]
     }
    ],
@@ -50,80 +74,103 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "[I 2024-08-07 10:42:58,290] A new study created in memory with name: no-name-c7f401f5-97c5-4cdb-857b-6f50683772d8\n",
-      "\u001b[32m2024-08-07 10:42:58.290\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:42:58.290\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:42:58.290\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:43:14.207\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:43:15.111\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n",
-      "[I 2024-08-07 10:43:15,111] Trial 0 finished with value: -2908.3807001524565 and parameters: {'n_topics': 5, 'n_neighbors': 25, 'n_components': 45, 'metric': 'cosine', 'init': 'k-means++', 'n_init': 12, 'max_iter': 521}. Best is trial 0 with value: -2908.3807001524565.\n",
-      "\u001b[32m2024-08-07 10:43:15.111\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:43:15.111\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:43:15.111\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:43:24.245\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:43:25.067\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n",
-      "[I 2024-08-07 10:43:25,069] Trial 1 finished with value: -3369.708145534729 and parameters: {'n_topics': 8, 'n_neighbors': 44, 'n_components': 15, 'metric': 'euclidean', 'init': 'random', 'n_init': 18, 'max_iter': 719}. Best is trial 1 with value: -3369.708145534729.\n",
-      "\u001b[32m2024-08-07 10:43:25.072\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:43:25.072\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:43:25.073\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:43:31.847\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:43:32.734\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n",
-      "[I 2024-08-07 10:43:32,734] Trial 2 finished with value: -2352.3949029947194 and parameters: {'n_topics': 13, 'n_neighbors': 10, 'n_components': 11, 'metric': 'euclidean', 'init': 'k-means++', 'n_init': 11, 'max_iter': 747}. Best is trial 1 with value: -3369.708145534729.\n",
-      "\u001b[32m2024-08-07 10:43:32.743\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:43:32.743\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:43:32.743\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:43:43.007\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:43:44.275\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n",
-      "[I 2024-08-07 10:43:44,275] Trial 3 finished with value: -3050.2628007635785 and parameters: {'n_topics': 8, 'n_neighbors': 30, 'n_components': 31, 'metric': 'cosine', 'init': 'k-means++', 'n_init': 30, 'max_iter': 399}. Best is trial 1 with value: -3369.708145534729.\n",
-      "\u001b[32m2024-08-07 10:43:44.275\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:43:44.275\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:43:44.275\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:43:53.013\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:43:53.935\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n",
-      "[I 2024-08-07 10:43:53,936] Trial 4 finished with value: -3077.4218904505196 and parameters: {'n_topics': 4, 'n_neighbors': 31, 'n_components': 32, 'metric': 'cosine', 'init': 'k-means++', 'n_init': 21, 'max_iter': 228}. Best is trial 1 with value: -3369.708145534729.\n",
-      "\u001b[32m2024-08-07 10:43:53.936\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:43:53.936\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:43:53.936\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:44:04.137\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:44:05.323\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n",
-      "[I 2024-08-07 10:44:05,325] Trial 5 finished with value: -3316.4025635845546 and parameters: {'n_topics': 8, 'n_neighbors': 47, 'n_components': 39, 'metric': 'cosine', 'init': 'k-means++', 'n_init': 22, 'max_iter': 664}. Best is trial 1 with value: -3369.708145534729.\n",
-      "\u001b[32m2024-08-07 10:44:05.325\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:44:05.325\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:44:05.325\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:44:14.288\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:44:15.314\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n",
-      "[I 2024-08-07 10:44:15,314] Trial 6 finished with value: -2375.983772695624 and parameters: {'n_topics': 3, 'n_neighbors': 14, 'n_components': 35, 'metric': 'euclidean', 'init': 'random', 'n_init': 20, 'max_iter': 889}. Best is trial 1 with value: -3369.708145534729.\n",
-      "\u001b[32m2024-08-07 10:44:15.314\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:44:15.314\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:44:15.314\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:44:23.368\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:44:24.310\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n",
-      "[I 2024-08-07 10:44:24,310] Trial 7 finished with value: -2731.111202206049 and parameters: {'n_topics': 15, 'n_neighbors': 19, 'n_components': 10, 'metric': 'cosine', 'init': 'k-means++', 'n_init': 14, 'max_iter': 492}. Best is trial 1 with value: -3369.708145534729.\n",
-      "\u001b[32m2024-08-07 10:44:24.310\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:44:24.310\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:44:24.310\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:44:31.870\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:44:32.529\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n",
-      "[I 2024-08-07 10:44:32,529] Trial 8 finished with value: -1749.2967260199991 and parameters: {'n_topics': 13, 'n_neighbors': 10, 'n_components': 18, 'metric': 'cosine', 'init': 'random', 'n_init': 11, 'max_iter': 412}. Best is trial 1 with value: -3369.708145534729.\n",
-      "\u001b[32m2024-08-07 10:44:32.529\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:44:32.529\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:44:32.529\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:44:41.535\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:44:42.270\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n",
-      "[I 2024-08-07 10:44:42,270] Trial 9 finished with value: -2846.653885685785 and parameters: {'n_topics': 15, 'n_neighbors': 29, 'n_components': 29, 'metric': 'cosine', 'init': 'random', 'n_init': 21, 'max_iter': 428}. Best is trial 1 with value: -3369.708145534729.\n",
-      "\u001b[32m2024-08-07 10:44:42.270\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36moptimize_hyperparameters\u001b[0m:\u001b[36m389\u001b[0m - \u001b[1mOptimal parameters: {'n_neighbors': 44, 'n_components': 15, 'metric': 'euclidean', 'init': 'random', 'n_init': 18, 'max_iter': 719} with 8 topics based on AIC.\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:44:42.270\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:44:42.270\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:44:42.270\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:44:51.236\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:44:51.937\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n"
+      "[I 2024-08-09 12:14:10,511] A new study created in memory with name: no-name-5aa7ebf6-4f37-4290-86e6-cebeec5398df\n",
+      "\u001b[32m2024-08-09 12:14:10.513\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:10.577\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:10.648\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:10.866\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings  downloaded successfully at ~/stream_topic_data/\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:10.875\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n",
+      "OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.\n",
+      "\u001b[32m2024-08-09 12:14:16.104\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:16.705\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n",
+      "[I 2024-08-09 12:14:16,706] Trial 0 finished with value: -2747.044881484823 and parameters: {'n_topics': 4, 'n_neighbors': 24, 'n_components': 30, 'metric': 'cosine', 'init': 'random', 'n_init': 16, 'max_iter': 499}. Best is trial 0 with value: -2747.044881484823.\n",
+      "\u001b[32m2024-08-09 12:14:16.708\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:16.778\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:16.847\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:17.117\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings  downloaded successfully at ~/stream_topic_data/\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:17.118\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:19.615\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:20.994\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n",
+      "[I 2024-08-09 12:14:20,995] Trial 1 finished with value: -2453.3441802457037 and parameters: {'n_topics': 13, 'n_neighbors': 11, 'n_components': 25, 'metric': 'euclidean', 'init': 'k-means++', 'n_init': 18, 'max_iter': 191}. Best is trial 0 with value: -2747.044881484823.\n",
+      "\u001b[32m2024-08-09 12:14:20.997\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:21.075\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:21.154\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:21.376\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings  downloaded successfully at ~/stream_topic_data/\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:21.377\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:24.749\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:25.399\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n",
+      "[I 2024-08-09 12:14:25,400] Trial 2 finished with value: -3128.164941984999 and parameters: {'n_topics': 12, 'n_neighbors': 38, 'n_components': 43, 'metric': 'cosine', 'init': 'k-means++', 'n_init': 10, 'max_iter': 829}. Best is trial 2 with value: -3128.164941984999.\n",
+      "\u001b[32m2024-08-09 12:14:25.401\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:25.474\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:25.570\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:25.790\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings  downloaded successfully at ~/stream_topic_data/\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:25.791\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:28.291\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:28.612\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n",
+      "[I 2024-08-09 12:14:28,613] Trial 3 finished with value: -3134.0457564239455 and parameters: {'n_topics': 7, 'n_neighbors': 29, 'n_components': 32, 'metric': 'euclidean', 'init': 'random', 'n_init': 15, 'max_iter': 318}. Best is trial 3 with value: -3134.0457564239455.\n",
+      "\u001b[32m2024-08-09 12:14:28.614\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:28.685\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:28.753\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:28.952\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings  downloaded successfully at ~/stream_topic_data/\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:28.953\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:32.261\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:33.410\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n",
+      "[I 2024-08-09 12:14:33,411] Trial 4 finished with value: -3016.9434231651817 and parameters: {'n_topics': 15, 'n_neighbors': 31, 'n_components': 45, 'metric': 'cosine', 'init': 'k-means++', 'n_init': 19, 'max_iter': 481}. Best is trial 3 with value: -3134.0457564239455.\n",
+      "\u001b[32m2024-08-09 12:14:33.413\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:33.485\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:33.564\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:33.776\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings  downloaded successfully at ~/stream_topic_data/\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:33.777\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:36.292\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:36.600\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n",
+      "[I 2024-08-09 12:14:36,601] Trial 5 finished with value: -3055.0335652501776 and parameters: {'n_topics': 18, 'n_neighbors': 29, 'n_components': 8, 'metric': 'euclidean', 'init': 'random', 'n_init': 12, 'max_iter': 496}. Best is trial 3 with value: -3134.0457564239455.\n",
+      "\u001b[32m2024-08-09 12:14:36.602\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:36.678\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:36.759\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:36.979\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings  downloaded successfully at ~/stream_topic_data/\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:36.980\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:39.683\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:40.085\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n",
+      "[I 2024-08-09 12:14:40,086] Trial 6 finished with value: -3156.918997163559 and parameters: {'n_topics': 17, 'n_neighbors': 31, 'n_components': 30, 'metric': 'euclidean', 'init': 'random', 'n_init': 28, 'max_iter': 164}. Best is trial 6 with value: -3156.918997163559.\n",
+      "\u001b[32m2024-08-09 12:14:40.088\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:40.158\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:40.233\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:40.464\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings  downloaded successfully at ~/stream_topic_data/\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:40.465\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:43.529\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:45.069\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n",
+      "[I 2024-08-09 12:14:45,070] Trial 7 finished with value: -2489.982411751191 and parameters: {'n_topics': 9, 'n_neighbors': 15, 'n_components': 35, 'metric': 'cosine', 'init': 'k-means++', 'n_init': 25, 'max_iter': 457}. Best is trial 6 with value: -3156.918997163559.\n",
+      "\u001b[32m2024-08-09 12:14:45.072\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:45.145\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:45.224\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:45.458\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings  downloaded successfully at ~/stream_topic_data/\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:45.460\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:48.272\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:48.598\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n",
+      "[I 2024-08-09 12:14:48,599] Trial 8 finished with value: -1863.7949187027652 and parameters: {'n_topics': 8, 'n_neighbors': 10, 'n_components': 17, 'metric': 'cosine', 'init': 'random', 'n_init': 20, 'max_iter': 354}. Best is trial 6 with value: -3156.918997163559.\n",
+      "\u001b[32m2024-08-09 12:14:48.600\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:48.668\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:48.750\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:48.978\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings  downloaded successfully at ~/stream_topic_data/\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:48.979\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:52.247\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:52.643\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n",
+      "[I 2024-08-09 12:14:52,644] Trial 9 finished with value: -3223.5689310849125 and parameters: {'n_topics': 11, 'n_neighbors': 42, 'n_components': 8, 'metric': 'cosine', 'init': 'k-means++', 'n_init': 25, 'max_iter': 568}. Best is trial 9 with value: -3223.5689310849125.\n",
+      "\u001b[32m2024-08-09 12:14:52.644\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36moptimize_hyperparameters\u001b[0m:\u001b[36m389\u001b[0m - \u001b[1mOptimal parameters: {'n_neighbors': 42, 'n_components': 8, 'metric': 'cosine', 'init': 'k-means++', 'n_init': 25, 'max_iter': 568} with 11 topics based on AIC.\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:52.645\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:52.939\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:53.010\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:53.214\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings  downloaded successfully at ~/stream_topic_data/\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:53.215\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:56.152\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:14:56.525\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n"
      ]
     }
    ],
@@ -134,14 +181,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "8\n"
+      "11\n"
      ]
     }
    ],
@@ -159,16 +206,24 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/opt/homebrew/Caskroom/miniforge/base/envs/topicm/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
+      "  warnings.warn(\n"
+     ]
+    },
     {
      "data": {
       "text/plain": [
-       "0.15247"
+       "0.23371"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -182,16 +237,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "0.20205374993383884"
+       "0.2090254547921094"
       ]
      },
-     "execution_count": 6,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -200,13 +255,20 @@
     "isim_metric = ISIM()\n",
     "isim_metric.score(topics)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python (stream_topic_venv)",
+   "display_name": "db",
    "language": "python",
-   "name": "stream_topic_venv"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
diff --git a/docs/notebooks/quickstart.ipynb b/docs/notebooks/quickstart.ipynb
index 8eabe8e700..8286df3f5f 100644
--- a/docs/notebooks/quickstart.ipynb
+++ b/docs/notebooks/quickstart.ipynb
@@ -4,12 +4,24 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AnFreTh/STREAM/blob/develop/docs/notebooks/quickstart.ipynb)\n",
-    "[![Open On GitHub](https://img.shields.io/badge/Open-on%20GitHub-blue?logo=GitHub)](https://github.com/AnFreTh/STREAM/blob/develop/docs/notebooks/quickstart.ipynb)\n",
+    "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AnFreTh/STREAM/blob/main/docs/notebooks/quickstart.ipynb)\n",
+    "[![Open On GitHub](https://img.shields.io/badge/Open-on%20GitHub-blue?logo=GitHub)](https://github.com/AnFreTh/STREAM/blob/main/docs/notebooks/quickstart.ipynb)\n",
     "\n",
     "# Quickstart"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# uncomment the below line if running in Colab\n",
+    "# package needs to be installed for the notebook to run\n",
+    "\n",
+    "# ! pip install -U stream_topic"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 8,
@@ -36,6 +48,12 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
+      "\u001b[32m2024-08-09 12:34:06.391\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m118\u001b[0m - \u001b[1mFetching dataset: BBC_News\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:34:06.592\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m331\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:34:06.796\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:34:07.111\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:34:07.250\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m363\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/\u001b[0m\n",
+      "Preprocessing documents: 100%|██████████| 2225/2225 [00:11<00:00, 200.32it/s]\n"
      ]
     }
    ],
@@ -54,13 +72,18 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "\u001b[32m2024-08-07 10:53:54.527\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.CEDC\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m241\u001b[0m - \u001b[1m--- Training CEDC topic model ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:53:54.527\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:53:54.537\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:54:03.304\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.CEDC\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m175\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:54:05.490\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.CEDC\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m259\u001b[0m - \u001b[1m--- Extract topics ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:54:21.731\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.CEDC\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m284\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n"
-
+      "\u001b[32m2024-08-09 12:34:18.398\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.CEDC\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m241\u001b[0m - \u001b[1m--- Training CEDC topic model ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:34:18.695\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:34:18.781\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:34:18.993\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings  downloaded successfully at ~/stream_topic_data/\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:34:19.000\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:34:22.191\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.CEDC\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m175\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n",
+      "/opt/homebrew/Caskroom/miniforge/base/envs/topicm/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning:\n",
+      "\n",
+      "`resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
+      "\n",
+      "\u001b[32m2024-08-09 12:34:27.876\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.CEDC\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m259\u001b[0m - \u001b[1m--- Extract topics ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:34:32.056\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.CEDC\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m284\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n"
      ]
     }
    ],
@@ -72,9 +95,18 @@
   {
    "cell_type": "code",
    "execution_count": 11,
-
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/opt/homebrew/Caskroom/miniforge/base/envs/topicm/lib/python3.10/site-packages/umap/umap_.py:2437: UserWarning:\n",
+      "\n",
+      "n_neighbors is larger than the dataset size; truncating to X.shape[0] - 1\n",
+      "\n"
+     ]
+    },
     {
      "data": {
       "text/html": [
@@ -90,8 +122,7 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.IFrame at 0x281e4ed71c0>"
-
+       "<IPython.lib.display.IFrame at 0x4468dca60>"
       ]
      },
      "metadata": {},
@@ -118,24 +149,32 @@
   {
    "cell_type": "code",
    "execution_count": 12,
-
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "\u001b[32m2024-08-07 10:54:31.049\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m162\u001b[0m - \u001b[1mFetching dataset: BBC_News\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:54:31.138\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m168\u001b[0m - \u001b[1mDataset loaded successfully from c:\\users\\anton\\desktop\\projects\\stream\\stream_topic\\preprocessed_datasets\\BBC_News\u001b[0m\n",
-      "Preprocessing documents: 100%|██████████| 2225/2225 [00:24<00:00, 91.41it/s] \n",
-      "\u001b[32m2024-08-07 10:54:55.545\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:54:55.545\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.ctmneg\u001b[0m:\u001b[36m_initialize_datamodule\u001b[0m:\u001b[36m314\u001b[0m - \u001b[1m--- Initializing Datamodule for CTMNeg ---\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:54:56.023\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.ctmneg\u001b[0m:\u001b[36m_initialize_trainer\u001b[0m:\u001b[36m273\u001b[0m - \u001b[1m--- Initializing Trainer for CTMNeg ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:34:35.593\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m118\u001b[0m - \u001b[1mFetching dataset: BBC_News\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:34:35.670\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m331\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:34:35.869\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:34:35.975\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:34:36.099\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m363\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/\u001b[0m\n",
+      "Preprocessing documents: 100%|██████████| 2225/2225 [00:11<00:00, 198.65it/s]\n",
+      "\u001b[32m2024-08-09 12:34:47.410\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:34:47.483\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:34:47.709\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings  downloaded successfully at ~/stream_topic_data/\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:34:47.717\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.ctmneg\u001b[0m:\u001b[36m_initialize_datamodule\u001b[0m:\u001b[36m314\u001b[0m - \u001b[1m--- Initializing Datamodule for CTMNeg ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:34:48.183\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.ctmneg\u001b[0m:\u001b[36m_initialize_trainer\u001b[0m:\u001b[36m273\u001b[0m - \u001b[1m--- Initializing Trainer for CTMNeg ---\u001b[0m\n",
       "Trainer already configured with model summary callbacks: [<class 'lightning.pytorch.callbacks.model_summary.ModelSummary'>]. Skipping setting a default `ModelSummary` callback.\n",
-      "GPU available: False, used: False\n",
+      "GPU available: True (mps), used: True\n",
       "TPU available: False, using: 0 TPU cores\n",
       "HPU available: False, using: 0 HPUs\n",
-      "\u001b[32m2024-08-07 10:54:56.101\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.ctmneg\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m457\u001b[0m - \u001b[1m--- Training CTMNeg topic model ---\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:34:48.201\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.ctmneg\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m457\u001b[0m - \u001b[1m--- Training CTMNeg topic model ---\u001b[0m\n",
+      "/opt/homebrew/Caskroom/miniforge/base/envs/topicm/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:652: UserWarning:\n",
+      "\n",
+      "Checkpoint directory /Volumes/Research/Repositories/STREAM/docs/notebooks/checkpoints exists and is not empty.\n",
+      "\n",
       "\n",
       "  | Name                    | Type              | Params | Mode \n",
       "----------------------------------------------------------------------\n",
@@ -154,24 +193,66 @@
      ]
     },
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Epoch 0: 100%|██████████| 56/56 [00:05<00:00, 10.92it/s, v_num=1, train_loss_step=nan.0, val_loss_step=nan.0, val_loss_epoch=nan.0, train_loss_epoch=nan.0]"
-     ]
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "637189591d0e4df98eece6259647b46b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Sanity Checking: |          | 0/? [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
     },
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "\u001b[32m2024-08-07 10:55:01.300\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.ctmneg\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m473\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n"
+      "/opt/homebrew/Caskroom/miniforge/base/envs/topicm/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: PossibleUserWarning:\n",
+      "\n",
+      "The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=13` in the `DataLoader` to improve performance.\n",
+      "\n",
+      "/opt/homebrew/Caskroom/miniforge/base/envs/topicm/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: PossibleUserWarning:\n",
+      "\n",
+      "The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=13` in the `DataLoader` to improve performance.\n",
+      "\n"
      ]
     },
     {
-     "name": "stdout",
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a4081cb63c984267a05eb10c8cbd97bc",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Training: |          | 0/? [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "4cd65f03f3af415699e3a9f81fef9338",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Validation: |          | 0/? [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
      "output_type": "stream",
      "text": [
-      "\n"
+      "\u001b[32m2024-08-09 12:34:49.580\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.models.ctmneg\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m473\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n"
      ]
     }
    ],
@@ -187,9 +268,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python (stream_topic_venv)",
+   "display_name": "db",
    "language": "python",
-   "name": "stream_topic_venv"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -201,7 +282,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.0"
+   "version": "3.10.14"
   }
  },
  "nbformat": 4,
diff --git a/docs/notebooks/quickstart.md b/docs/notebooks/quickstart.md
deleted file mode 100644
index 5d3cdd73fb..0000000000
--- a/docs/notebooks/quickstart.md
+++ /dev/null
@@ -1,68 +0,0 @@
----
-jupytext:
-  text_representation:
-    extension: .md
-    format_name: myst
-    format_version: 0.13
-    jupytext_version: 1.16.4
-kernelspec:
-  display_name: topicm
-  language: python
-  name: python3
----
-
-[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AnFreTh/STREAM/blob/develop/docs/notebooks/quickstart.ipynb)
-[![Open On GitHub](https://img.shields.io/badge/Open-on%20GitHub-blue?logo=GitHub)](https://github.com/AnFreTh/STREAM/blob/develop/docs/notebooks/quickstart.ipynb)
-
-# Quickstart
-
-```{code-cell} ipython3
-from stream_topic.models import CEDC, DCTE
-from stream_topic.utils import TMDataset
-
-
-import warnings
-warnings.filterwarnings("ignore")
-```
-
-## CEDC model
-
-```{code-cell} ipython3
-dataset = TMDataset()
-dataset.fetch_dataset("DummyDataset")
-```
-
-```{code-cell} ipython3
-model = CEDC(num_topics=10)
-output = model.fit(dataset)
-```
-
-```{code-cell} ipython3
-from stream_topic.visuals import visualize_topic_model, visualize_topics
-
-visualize_topic_model(
-    model, 
-    reduce_first=True, 
-    port=8052,
-    )
-```
-
-## KMeansTM model
-
-```{code-cell} ipython3
-from stream_topic.models import KmeansTM
-model = KmeansTM(num_topics=10)
-output = model.fit(dataset)
-```
-
-```{code-cell} ipython3
-visualize_topic_model(
-    model, 
-    reduce_first=True, 
-    port=8053,
-    )
-```
-
-```{code-cell} ipython3
-
-```
diff --git a/stream/models/ctmneg_utils/__init__.py b/stream/models/ctmneg_utils/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/stream_topic/__init__.py b/stream_topic/__init__.py
index 3e792a3fdc..5ad1766380 100644
--- a/stream_topic/__init__.py
+++ b/stream_topic/__init__.py
@@ -1,4 +1,4 @@
-from . import NAM, metrics, models, preprocessor, utils, visuals
+# from . import NAM, metrics, models, preprocessor, utils, visuals
 from .__version__ import __version__  # noqa: F401
 
-__all__ = ["NAM", "metrics", "models", "preprocessor", "utils", "visuals"]
+__all__ = ["__version__"]
diff --git a/stream_topic/__version__.py b/stream_topic/__version__.py
index 0fcbc6fceb..d82b41ac48 100644
--- a/stream_topic/__version__.py
+++ b/stream_topic/__version__.py
@@ -1,4 +1,4 @@
 """Version information."""
 
 # The following line *must* be the last in the module, exactly as formatted:
-__version__ = "0.1.3"
+__version__ = "0.1.4"
diff --git a/stream_topic/metrics/constants.py b/stream_topic/metrics/constants.py
index baa5319c0d..b5ccb2d398 100644
--- a/stream_topic/metrics/constants.py
+++ b/stream_topic/metrics/constants.py
@@ -1,4 +1,4 @@
 PARAPHRASE_TRANSFORMER_MODEL = "paraphrase-MiniLM-L3-v2"
 SENTENCE_TRANSFORMER_MODEL = "all-MiniLM-L6-v2"
-EMBEDDING_PATH = "/embeddings"
+EMBEDDING_PATH = "embeddings"
 NLTK_STOPWORD_LANGUAGE = "english"
diff --git a/stream_topic/pre_embedded_datasets/BBC_News/BBC_News_embeddings_paraphrase-MiniLM-L3-v2.pkl b/stream_topic/pre_embedded_datasets/BBC_News/BBC_News_embeddings_paraphrase-MiniLM-L3-v2.pkl
deleted file mode 100644
index b53278f036..0000000000
Binary files a/stream_topic/pre_embedded_datasets/BBC_News/BBC_News_embeddings_paraphrase-MiniLM-L3-v2.pkl and /dev/null differ
diff --git a/stream_topic/pre_embedded_datasets/Poliblogs/Poliblogs_embeddings_paraphrase-MiniLM-L3-v2.pkl b/stream_topic/pre_embedded_datasets/Poliblogs/Poliblogs_embeddings_paraphrase-MiniLM-L3-v2.pkl
deleted file mode 100644
index eb8a7fbdf2..0000000000
Binary files a/stream_topic/pre_embedded_datasets/Poliblogs/Poliblogs_embeddings_paraphrase-MiniLM-L3-v2.pkl and /dev/null differ
diff --git a/stream_topic/preprocessed_datasets/20NewsGroups/20NewsGroups.parquet b/stream_topic/preprocessed_datasets/20NewsGroups/20NewsGroups.parquet
deleted file mode 100644
index 5cdb39cb9d..0000000000
--- a/stream_topic/preprocessed_datasets/20NewsGroups/20NewsGroups.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:284f5d36ac8246675b1bc822d367df181dd23486df2dcb1b38843bdf9e71cc96
-size 9325427
diff --git a/stream_topic/preprocessed_datasets/20NewsGroups/20NewsGroups_info.pkl b/stream_topic/preprocessed_datasets/20NewsGroups/20NewsGroups_info.pkl
deleted file mode 100644
index 2387fe7e61..0000000000
Binary files a/stream_topic/preprocessed_datasets/20NewsGroups/20NewsGroups_info.pkl and /dev/null differ
diff --git a/stream_topic/preprocessed_datasets/BBC_News/BBC_News.parquet b/stream_topic/preprocessed_datasets/BBC_News/BBC_News.parquet
deleted file mode 100644
index 671926f5b1..0000000000
--- a/stream_topic/preprocessed_datasets/BBC_News/BBC_News.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:14790b24c1481a191afc34276e2eaba594bec54015b0b6ac583f8c1dc5959cde
-size 2212598
diff --git a/stream_topic/preprocessed_datasets/BBC_News/BBC_News_info.pkl b/stream_topic/preprocessed_datasets/BBC_News/BBC_News_info.pkl
deleted file mode 100644
index 82c47b3f11..0000000000
Binary files a/stream_topic/preprocessed_datasets/BBC_News/BBC_News_info.pkl and /dev/null differ
diff --git a/stream_topic/preprocessed_datasets/DummyDataset/DummyDataset.parquet b/stream_topic/preprocessed_datasets/DummyDataset/DummyDataset.parquet
deleted file mode 100644
index c05bf3af8e..0000000000
--- a/stream_topic/preprocessed_datasets/DummyDataset/DummyDataset.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ce12ce97ae80a5653563e9edccac9bad692a9dab6d5eabf01c264a1ccbed9ec1
-size 2749
diff --git a/stream_topic/preprocessed_datasets/DummyDataset/DummyDataset_info.pkl b/stream_topic/preprocessed_datasets/DummyDataset/DummyDataset_info.pkl
deleted file mode 100644
index 2673f7213a..0000000000
Binary files a/stream_topic/preprocessed_datasets/DummyDataset/DummyDataset_info.pkl and /dev/null differ
diff --git a/stream_topic/preprocessed_datasets/Poliblogs/Poliblogs.parquet b/stream_topic/preprocessed_datasets/Poliblogs/Poliblogs.parquet
deleted file mode 100644
index ac7ec819a8..0000000000
--- a/stream_topic/preprocessed_datasets/Poliblogs/Poliblogs.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:5d12c33ae74715ef48451946718f0bea8c57778bdff5a46c53d02f04ee842d38
-size 17288439
diff --git a/stream_topic/preprocessed_datasets/Poliblogs/Poliblogs_info.pkl b/stream_topic/preprocessed_datasets/Poliblogs/Poliblogs_info.pkl
deleted file mode 100644
index 17d325aa98..0000000000
Binary files a/stream_topic/preprocessed_datasets/Poliblogs/Poliblogs_info.pkl and /dev/null differ
diff --git a/stream_topic/preprocessed_datasets/Reddit_GME/Reddit_GME.parquet b/stream_topic/preprocessed_datasets/Reddit_GME/Reddit_GME.parquet
deleted file mode 100644
index c6affce2d1..0000000000
--- a/stream_topic/preprocessed_datasets/Reddit_GME/Reddit_GME.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:0ee27de1da46238b9461e7b5957e2add06beb3f25bf1cf21185e93685e25e42a
-size 1611503
diff --git a/stream_topic/preprocessed_datasets/Reddit_GME/Reddit_GME_info.pkl b/stream_topic/preprocessed_datasets/Reddit_GME/Reddit_GME_info.pkl
deleted file mode 100644
index c0d2a762ba..0000000000
Binary files a/stream_topic/preprocessed_datasets/Reddit_GME/Reddit_GME_info.pkl and /dev/null differ
diff --git a/stream_topic/preprocessed_datasets/Reuters/Reuters.parquet b/stream_topic/preprocessed_datasets/Reuters/Reuters.parquet
deleted file mode 100644
index e9f74b446b..0000000000
--- a/stream_topic/preprocessed_datasets/Reuters/Reuters.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:15bf3c8ec1f9d92b995bdd99b07c63aa9bd797a7621886bdcb211379d3b6d1f6
-size 3497440
diff --git a/stream_topic/preprocessed_datasets/Reuters/Reuters_info.pkl b/stream_topic/preprocessed_datasets/Reuters/Reuters_info.pkl
deleted file mode 100644
index dd06cc1a5b..0000000000
Binary files a/stream_topic/preprocessed_datasets/Reuters/Reuters_info.pkl and /dev/null differ
diff --git a/stream_topic/preprocessed_datasets/Spotify/Spotify.parquet b/stream_topic/preprocessed_datasets/Spotify/Spotify.parquet
deleted file mode 100644
index 911ad5f194..0000000000
--- a/stream_topic/preprocessed_datasets/Spotify/Spotify.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b7d92e0fd70253d94733c5e8c642901993172fa7f12b0d6f0e6b1bf3087d9d98
-size 5386411
diff --git a/stream_topic/preprocessed_datasets/Spotify/Spotify_info.pkl b/stream_topic/preprocessed_datasets/Spotify/Spotify_info.pkl
deleted file mode 100644
index d8620ed958..0000000000
Binary files a/stream_topic/preprocessed_datasets/Spotify/Spotify_info.pkl and /dev/null differ
diff --git a/stream_topic/preprocessed_datasets/Spotify_least_popular/Spotify_least_popular.parquet b/stream_topic/preprocessed_datasets/Spotify_least_popular/Spotify_least_popular.parquet
deleted file mode 100644
index 7d0e3e1026..0000000000
--- a/stream_topic/preprocessed_datasets/Spotify_least_popular/Spotify_least_popular.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e15a8199d6a145ef5fa6937278296a64a82f55b5532e774776afbb8a6e100845
-size 2783922
diff --git a/stream_topic/preprocessed_datasets/Spotify_least_popular/Spotify_least_popular_info.pkl b/stream_topic/preprocessed_datasets/Spotify_least_popular/Spotify_least_popular_info.pkl
deleted file mode 100644
index 15c0989337..0000000000
Binary files a/stream_topic/preprocessed_datasets/Spotify_least_popular/Spotify_least_popular_info.pkl and /dev/null differ
diff --git a/stream_topic/preprocessed_datasets/Spotify_most_popular/Spotify_most_popular.parquet b/stream_topic/preprocessed_datasets/Spotify_most_popular/Spotify_most_popular.parquet
deleted file mode 100644
index 0284190838..0000000000
--- a/stream_topic/preprocessed_datasets/Spotify_most_popular/Spotify_most_popular.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f1ab92c211a7eace312a78e460fa436592af99ffa6379ee553457f03ba0e2091
-size 3026031
diff --git a/stream_topic/preprocessed_datasets/Spotify_most_popular/Spotify_most_popular_info.pkl b/stream_topic/preprocessed_datasets/Spotify_most_popular/Spotify_most_popular_info.pkl
deleted file mode 100644
index 2de999c5dc..0000000000
Binary files a/stream_topic/preprocessed_datasets/Spotify_most_popular/Spotify_most_popular_info.pkl and /dev/null differ
diff --git a/stream_topic/preprocessed_datasets/Stocktwits_GME/Stocktwits_GME.parquet b/stream_topic/preprocessed_datasets/Stocktwits_GME/Stocktwits_GME.parquet
deleted file mode 100644
index 3b1ba90507..0000000000
--- a/stream_topic/preprocessed_datasets/Stocktwits_GME/Stocktwits_GME.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:0e6b29a2d808d63b018bd9ace56e37d9f2f66681e5497e38200755549679683d
-size 1436738
diff --git a/stream_topic/preprocessed_datasets/Stocktwits_GME/Stocktwits_GME_info.pkl b/stream_topic/preprocessed_datasets/Stocktwits_GME/Stocktwits_GME_info.pkl
deleted file mode 100644
index 15b51c4837..0000000000
Binary files a/stream_topic/preprocessed_datasets/Stocktwits_GME/Stocktwits_GME_info.pkl and /dev/null differ
diff --git a/stream_topic/preprocessed_datasets/Stocktwits_GME_large/Stocktwits_GME_large.parquet b/stream_topic/preprocessed_datasets/Stocktwits_GME_large/Stocktwits_GME_large.parquet
deleted file mode 100644
index 31b28a5296..0000000000
--- a/stream_topic/preprocessed_datasets/Stocktwits_GME_large/Stocktwits_GME_large.parquet
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c7bee4045df8c64638a2e1b3e82de350dacb2292f5d23cf08537f92a86a086a7
-size 30154844
diff --git a/stream_topic/preprocessed_datasets/Stocktwits_GME_large/Stocktwits_GME_large_info.pkl b/stream_topic/preprocessed_datasets/Stocktwits_GME_large/Stocktwits_GME_large_info.pkl
deleted file mode 100644
index 13af01f292..0000000000
Binary files a/stream_topic/preprocessed_datasets/Stocktwits_GME_large/Stocktwits_GME_large_info.pkl and /dev/null differ
diff --git a/stream_topic/utils/data_downloader.py b/stream_topic/utils/data_downloader.py
new file mode 100644
index 0000000000..537ee8e0a1
--- /dev/null
+++ b/stream_topic/utils/data_downloader.py
@@ -0,0 +1,477 @@
+import importlib.util
+import os
+import pickle
+from urllib.parse import urljoin
+
+import pandas as pd
+import requests
+from loguru import logger
+
+PACKAGE_NAME = "stream_topic"
+
+
+class DataDownloader:
+
+    def __init__(self, name=None, language="en"):
+        """
+        Store the dataset name and language; no data is loaded here.
+        """
+        self.name = name
+        # Every data holder starts out as None.
+        self.dataframe = self.embeddings = None
+        self.bow = self.tfidf = None
+        self.tokens = self.texts = self.labels = None
+        self.language = language
+        # Start from the default preprocessing switches.
+        self.preprocessing_steps = self.default_preprocessing_steps()
+
+    def default_preprocessing_steps(self):
+        """Return a new dict holding the default preprocessing switches."""
+        return dict(
+            remove_stopwords=False,
+            lowercase=True,
+            remove_punctuation=False,
+            remove_numbers=False,
+            lemmatize=False,
+            stem=False,
+            expand_contractions=True,
+            remove_html_tags=True,
+            remove_special_chars=True,
+            remove_accents=False,
+            custom_stopwords=set(), detokenize=False,
+        )
+
+    def get_package_dataset_path(self, name):
+        """
+        Get the path to the package dataset.
+
+        Parameters
+        ----------
+        name : str
+            Name of the dataset.
+
+        Returns
+        -------
+        str
+            ``<package dir>/stream_topic_data/preprocessed_datasets/<name>``.
+
+        Raises
+        ------
+        ImportError
+            If the installed package cannot be located.
+        """
+        # find_spec returns None for a missing package; check before using it.
+        spec = importlib.util.find_spec(PACKAGE_NAME)
+        if spec is None or spec.origin is None:
+            raise ImportError(f"Cannot find the package '{PACKAGE_NAME}'")
+        package_root_dir = os.path.dirname(spec.origin)
+        return os.path.join(
+            package_root_dir, "stream_topic_data", "preprocessed_datasets", name)
+
+    def has_embeddings(
+        self, embedding_model_name, path=None, file_name=None, source="github"
+    ):
+        """
+        Check if embeddings are available for the dataset.
+
+        Parameters
+        ----------
+        embedding_model_name : str
+            Name of the embedding model used.
+        path : str, optional
+            Local directory to look in; when given, no remote check is made.
+        file_name : str, optional
+            File name for the embeddings (local lookups only).
+        source : str, optional
+            With "github" (default) and no ``path``, probe the data repository;
+            otherwise look on disk, falling back to the package embeddings path.
+
+        Returns
+        -------
+        bool
+            True if embeddings are available, False otherwise.
+        """
+        # Default file name, shared by the remote and the local lookup.
+        default_name = f"{self.name}_embeddings_{embedding_model_name}.pkl"
+        if source == "github" and path is None:
+            BASE_URL = "https://raw.githubusercontent.com/mkumar73/stream_topic_data/main/datasets/pre_embedded_datasets/"
+            git_pkl_path = urljoin(
+                BASE_URL, f"{self.name}/{default_name}"
+            )
+            return url_exists(git_pkl_path)
+
+        # Local lookup. This used to run only when ``path`` was None, so an
+        # explicit ``path`` fell through and returned None (reported as missing).
+        if path is None:
+            path = self.get_package_embeddings_path(self.name)
+
+        # Plain filesystem join; urljoin would drop the last segment of ``path``.
+        embeddings_file = os.path.join(path, file_name or default_name)
+        return os.path.exists(embeddings_file)
+
+    def save_embeddings(
+        self, embeddings, embedding_model_name, path=None, file_name=None
+    ):
+        """
+        Pickle embeddings to disk; any failure is logged and swallowed, not raised.
+
+        Parameters
+        ----------
+        embeddings : np.ndarray
+            Embeddings to save.
+        embedding_model_name : str
+            Used to build the default name ``{name}_embeddings_{model}.pkl``.
+        path : str, optional
+            Target directory (created if missing); defaults to the package path.
+        file_name : str, optional
+            File name overriding the default one.
+        """
+        try:
+            if path is None:
+                path = self.get_package_embeddings_path(self.name)
+
+            logger.info(f"Saving embeddings to path: {path}")
+
+            if not os.path.exists(path):
+                os.makedirs(path)
+                logger.info(f"Created directory: {path}")
+
+            embeddings_file = (
+                os.path.join(path, file_name)
+                if file_name
+                else os.path.join(
+                    path, f"{self.name}_embeddings_{embedding_model_name}.pkl"
+                )
+            )
+
+            logger.info(f"Embeddings file path: {embeddings_file}")
+
+            with open(embeddings_file, "wb") as file:
+                pickle.dump(embeddings, file)
+
+            logger.info("Embeddings saved successfully.")
+
+        except PermissionError as e:
+            logger.error(f"PermissionError: {e}")
+        except Exception as e:
+            logger.error(f"An error occurred: {e}")
+
+    def get_embeddings(
+        self, embedding_model_name, path=None, file_name=None, source="github"
+    ):
+        """
+        Get embeddings for the dataset.
+
+        Parameters
+        ----------
+        embedding_model_name : str
+            Name of the embedding model to use.
+        path : str, optional
+            Directory holding the embeddings; when given, nothing is downloaded.
+        file_name : str, optional
+            File name for the embeddings.
+        source : str, optional
+            With "github" (default) and no ``path``, download the file first.
+
+        Returns
+        -------
+        np.ndarray
+            Embeddings for the dataset (cached on ``self.embeddings``).
+
+        Raises
+        ------
+        ValueError
+            If a local lookup finds no embeddings file.
+        """
+        default_name = f"{self.name}_embeddings_{embedding_model_name}.pkl"
+        from_github = source == "github" and path is None
+        if from_github:
+            self.load_custom_dataset_from_url(
+                self.name, embeddings=True, embedding_model_name=embedding_model_name)
+            # Load from the download directory (not the package path if STREAM_TOPIC_DATA is set).
+            path = os.path.join(get_data_home(), "pre_embedded_datasets", self.name)
+        elif path is None:
+            path = self.get_package_embeddings_path(self.name)
+        embeddings_file = os.path.join(path, file_name or default_name)
+        # Local sources must already have the file; fail early with a clear message.
+        if not from_github and not os.path.exists(embeddings_file):
+            raise ValueError(
+                "Embeddings are not available. Run the encoding process first or load embeddings."
+            )
+        if self.embeddings is None:
+            # NOTE: pickle.load executes code from the file; only load trusted embeddings.
+            with open(embeddings_file, "rb") as file:
+                self.embeddings = pickle.load(file)
+        return self.embeddings
+
+    def get_package_embeddings_path(self, name):
+        """
+        Get the path to the package embeddings.
+
+        Parameters
+        ----------
+        name : str
+            Name of the dataset.
+
+        Returns
+        -------
+        str
+            ``<package dir>/stream_topic_data/pre_embedded_datasets/<name>``.
+
+        Raises
+        ------
+        ImportError
+            If the installed package cannot be located.
+        """
+        # find_spec returns None for a missing package; check before using it.
+        spec = importlib.util.find_spec(PACKAGE_NAME)
+        if spec is None or spec.origin is None:
+            raise ImportError(f"Cannot find the package '{PACKAGE_NAME}'")
+        package_root_dir = os.path.dirname(spec.origin)
+        return os.path.join(
+            package_root_dir, "stream_topic_data", "pre_embedded_datasets", name)
+
+    def load_custom_dataset_from_folder(self, dataset_path):
+        """
+        Populate the dataset from files stored in a local folder.
+
+        A ``{name}.parquet`` file is preferred; without one, ``corpus.txt`` and
+        ``labels.txt`` are read line by line and paired up.
+
+        Parameters
+        ----------
+        dataset_path : str
+            Path to the dataset folder.
+        """
+        parquet_file = os.path.join(dataset_path, f"{self.name}.parquet")
+        if os.path.exists(parquet_file):
+            self.load_dataset_from_parquet(parquet_file)
+            return
+
+        # Plain-text fallback: both files are read whole, then stripped per line.
+        with open(os.path.join(dataset_path, "corpus.txt"), encoding="utf-8") as corpus:
+            raw_docs = corpus.readlines()
+        with open(os.path.join(dataset_path, "labels.txt"), encoding="utf-8") as label_file:
+            raw_labels = label_file.readlines()
+
+        frame = self.dataframe = pd.DataFrame(
+            {
+                "text": [line.strip() for line in raw_docs],
+                "labels": [line.strip() for line in raw_labels],
+            }
+        )
+        frame["tokens"] = frame["text"].apply(lambda text: text.split())
+
+        self.texts = frame["text"].tolist()
+        self.labels = frame["labels"].tolist()
+
+    def load_custom_dataset_from_url(
+        self, dataset_path=None, embeddings=False, embedding_model_name=None
+    ):
+        """
+        Download a dataset, or its embeddings when ``embeddings=True``, from GitHub.
+
+        Parameters
+        ----------
+        dataset_path : str
+            Folder with ``corpus.txt``/``labels.txt``, read only if no parquet URL exists.
+        """
+        if embeddings:
+            if not embedding_model_name:
+                raise ValueError(
+                    "Please provide the embedding model name to load embeddings."
+                )
+            BASE_URL = "https://raw.githubusercontent.com/mkumar73/stream_topic_data/main/datasets/pre_embedded_datasets/"
+            git_pkl_path = urljoin(
+                BASE_URL,
+                os.path.join(
+                    self.name, f"{self.name}_embeddings_{embedding_model_name}.pkl"
+                ).replace(os.sep, "/"),
+            )
+            data_home = get_data_home()
+            save_dir = os.path.join(
+                data_home, "pre_embedded_datasets", self.name)
+
+            if not os.path.exists(save_dir):
+                os.makedirs(save_dir)
+            local_pkl_path = os.path.join(
+                save_dir, f"{self.name}_embeddings_{embedding_model_name}.pkl"
+            )
+
+            if url_exists(git_pkl_path):
+                logger.info(f"Downloading embeddings from github")
+                download_file_from_github(git_pkl_path, local_pkl_path)
+                logger.info(
+                    f"Embeddings  downloaded successfully at ~/stream_topic_data/"
+                )
+
+        else:
+            BASE_URL = "https://raw.githubusercontent.com/mkumar73/stream_topic_data/main/datasets/preprocessed_datasets/"
+            git_parquet_path = urljoin(
+                BASE_URL,
+                os.path.join(self.name, f"{self.name}.parquet").replace(
+                    os.sep, "/"),
+            )
+            git_pkl_path = urljoin(
+                BASE_URL,
+                os.path.join(self.name, f"{self.name}_info.pkl").replace(
+                    os.sep, "/"),
+            )
+
+            data_home = get_data_home()
+            save_dir = os.path.join(
+                data_home, "preprocessed_datasets", self.name)
+
+            if not os.path.exists(save_dir):
+                os.makedirs(save_dir)
+            local_parquet_path = os.path.join(save_dir, f"{self.name}.parquet")
+            local_pkl_path = os.path.join(save_dir, f"{self.name}_info.pkl")
+
+            if url_exists(git_parquet_path):
+                logger.info(f"Downloading dataset from github")
+                download_file_from_github(git_parquet_path, local_parquet_path)
+                logger.info(
+                    f"Dataset downloaded successfully at ~/stream_topic_data/")
+                self.load_dataset_from_parquet(local_parquet_path)
+            else:
+                # TODO: needs to be refactored to include the GitHub URL for corpus and labels
+                documents_path = os.path.join(dataset_path, "corpus.txt")
+                labels_path = os.path.join(dataset_path, "labels.txt")
+
+                with open(documents_path, encoding="utf-8") as f:
+                    documents = f.readlines()
+
+                with open(labels_path, encoding="utf-8") as f:
+                    labels = f.readlines()
+
+                self.dataframe = pd.DataFrame(
+                    {
+                        "text": [doc.strip() for doc in documents],
+                        "labels": [label.strip() for label in labels],
+                    }
+                )
+
+                self.dataframe["tokens"] = self.dataframe["text"].apply(
+                    lambda x: x.split()
+                )
+                self.texts = self.dataframe["text"].tolist()
+                self.labels = self.dataframe["labels"].tolist()
+
+            if url_exists(git_pkl_path):
+                logger.info(f"Downloading dataset info from github")
+                download_file_from_github(git_pkl_path, local_pkl_path)
+                logger.info(
+                    f"Dataset info downloaded successfully at ~/stream_topic_data/"
+                )
+
+    def save_word_embeddings(
+        self, word_embeddings, model_name, path=None, file_name=None
+    ):
+        """
+        Persist word embeddings by delegating to ``save_embeddings``.
+
+        Parameters
+        ----------
+        word_embeddings : dict
+            Word embeddings to save.
+        model_name : str
+            Name of the pre-trained model.
+        path : str, optional
+            Directory to write to, forwarded unchanged.
+        file_name : str, optional
+            File name to use, forwarded unchanged.
+        """
+        # Positional order matches save_embeddings(embeddings, model, path, file_name).
+        self.save_embeddings(word_embeddings, model_name, path, file_name)
+
+    def _save_to_parquet(self, save_dir, dataset_name):
+        """
+        Write ``self.dataframe`` to ``<save_dir>/<dataset_name>.parquet``.
+
+        Parameters
+        ----------
+        save_dir : str
+            Directory to save the dataset.
+        dataset_name : str
+            Base name of the file, without the ``.parquet`` suffix.
+        """
+        parquet_name = f"{dataset_name}.parquet"
+        self.dataframe.to_parquet(os.path.join(save_dir, parquet_name), index=False)
+
+    def load_dataset_from_parquet(self, load_path):
+        """
+        Read a Parquet file into ``self.dataframe`` and derive tokens, texts, labels.
+
+        Parameters
+        ----------
+        load_path : str
+            Path to the Parquet file; FileNotFoundError is raised if it is missing.
+        """
+        if not os.path.exists(load_path):
+            raise FileNotFoundError(f"File {load_path} does not exist.")
+        frame = self.dataframe = pd.read_parquet(load_path)
+        # Tokens are the whitespace-split form of each text.
+        frame["tokens"] = frame["text"].apply(lambda text: text.split())
+        self.texts = frame["text"].tolist()
+        self.labels = frame["labels"].tolist()
+
+
+def get_data_home(data_home=None):
+    """
+    Get the data home directory, creating it if it does not exist.
+
+    Parameters
+    ----------
+    data_home : str, optional
+        Path to the data home directory, defaults to None.
+
+    Notes
+    -----
+    Resolution order: the ``data_home`` argument, then the environment variable
+    STREAM_TOPIC_DATA, then ``stream_topic_data`` inside the installed package.
+    ImportError is raised if that package directory is needed but not found.
+
+    Returns
+    -------
+    str
+        Path to the data home directory.
+    """
+    if data_home is None:
+        data_home = os.environ.get("STREAM_TOPIC_DATA")
+    if data_home is None:
+        # Only now is the package location needed; look it up lazily.
+        spec = importlib.util.find_spec(PACKAGE_NAME)
+        if spec is None or spec.origin is None:
+            raise ImportError(f"Cannot find the package '{PACKAGE_NAME}'")
+        data_home = os.path.join(os.path.dirname(spec.origin), "stream_topic_data")
+
+    # exist_ok avoids failing when another process creates the directory first.
+    os.makedirs(data_home, exist_ok=True)
+    return data_home
+
+
+def url_exists(url):
+    """Return True if a HEAD request to ``url`` answers 200 within 10 seconds."""
+    try:
+        return requests.head(url, timeout=10).status_code == 200
+    except requests.RequestException:
+        return False
+
+
+def download_file_from_github(url: str, save_dir: str):
+    """
+    Downloads a file from a GitHub repository.
+
+    Parameters
+    ----------
+    url : str
+        URL of the file.
+    save_dir : str
+        Full path of the local file to write (a file path despite the name).
+    """
+    # A timeout keeps a stalled connection from hanging the caller forever.
+    response = requests.get(url, timeout=60)
+    response.raise_for_status()  # Check if the download was successful
+
+    with open(save_dir, "wb") as file:
+        file.write(response.content)
diff --git a/stream_topic/utils/dataset.py b/stream_topic/utils/dataset.py
index 13abacd7ab..6dd81020da 100644
--- a/stream_topic/utils/dataset.py
+++ b/stream_topic/utils/dataset.py
@@ -1,21 +1,21 @@
 import os
 import pickle
 import re
-import importlib.util
+
 import gensim.downloader as api
 import numpy as np
 import pandas as pd
 from loguru import logger
 from sentence_transformers import SentenceTransformer
-
 from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 from torch.utils.data import Dataset, random_split
 
 from ..commons.load_steps import load_model_preprocessing_steps
 from ..preprocessor import TextPreprocessor
+from .data_downloader import DataDownloader, get_data_home
 
 
-class TMDataset(Dataset):
+class TMDataset(Dataset, DataDownloader):
     """
     Topic Modeling Dataset containing methods to fetch and preprocess text data.
 
@@ -79,14 +79,6 @@ class TMDataset(Dataset):
     def __init__(self, name=None, language="en"):
         super().__init__()
 
-        self.available_datasets = self.get_dataset_list()
-        if name is not None and name not in self.available_datasets:
-            logger.error(
-                f"Dataset {name} not found. Available datasets: {self.available_datasets}"
-            )
-            raise ValueError(
-                f"Dataset {name} not found. Available datasets: {self.available_datasets}"
-            )
         self.name = name
         self.dataframe = None
         self.embeddings = None
@@ -98,37 +90,7 @@ def __init__(self, name=None, language="en"):
         self.language = language
         self.preprocessing_steps = self.default_preprocessing_steps()
 
-    def get_dataset_list(self):
-        """
-        Get the list of available datasets.
-
-        Returns
-        -------
-        list of str
-            List of available datasets.
-        """
-        package_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-        dataset_path = os.path.join(package_path, "preprocessed_datasets")
-        datasets = os.listdir(dataset_path)
-        return datasets
-
-    def default_preprocessing_steps(self):
-        return {
-            "remove_stopwords": False,
-            "lowercase": True,
-            "remove_punctuation": False,
-            "remove_numbers": False,
-            "lemmatize": False,
-            "stem": False,
-            "expand_contractions": True,
-            "remove_html_tags": True,
-            "remove_special_chars": True,
-            "remove_accents": False,
-            "custom_stopwords": set(),
-            "detokenize": False,
-        }
-
-    def fetch_dataset(self, name: str, dataset_path=None):
+    def fetch_dataset(self, name: str, dataset_path=None, source: str = "github"):
         """
         Fetch a dataset by name.
 
@@ -138,21 +100,16 @@ def fetch_dataset(self, name: str, dataset_path=None):
             Name of the dataset to fetch.
         dataset_path : str, optional
             Path to the dataset directory.
+        source : str, optional
+            Source of the dataset, by default 'github'. Use 'local' if the dataset is available locally, and provide the dataset_path.
         """
-        if name not in self.available_datasets:
-            logger.error(
-                f"Dataset {name} not found. Available datasets: {self.available_datasets}"
-            )
-            raise ValueError(
-                f"Dataset {name} not found. Available datasets: {self.available_datasets}"
-            )
 
         if self.name is not None:
             logger.info(
                 f"Dataset name already provided while instantiating the class: {self.name}"
             )
             logger.info(
-                f"Overwriting the dataset name with the provided name in fetch_dataset: {name}"
+                f"Overwriting the dataset name with the name provided in fetch_dataset: {name}"
             )
             self.name = name
             logger.info(f"Fetching dataset: {name}")
@@ -160,17 +117,36 @@ def fetch_dataset(self, name: str, dataset_path=None):
             self.name = name
             logger.info(f"Fetching dataset: {name}")
 
-        if dataset_path is None:
-            dataset_path = self.get_package_dataset_path(name)
-        if os.path.exists(dataset_path):
+        if source == "github" and dataset_path is None:
+            # logger.info(f"Fetching dataset from github")
+            self.load_custom_dataset_from_url(name)
+            data_home = get_data_home()
+            dataset_path = os.path.join(
+                data_home, "preprocessed_datasets", name)
+            self.info = self.get_info(dataset_path)
+        elif source == "local" and dataset_path is not None:
+            logger.info(f"Fetching dataset from local path")
             self.load_custom_dataset_from_folder(dataset_path)
-            logger.info(f"Dataset loaded successfully from {dataset_path}")
+            self.info = self.get_info(dataset_path)
+        elif dataset_path is None:
+            logger.info(f"Fetching dataset from package path")
+            dataset_path = self.get_package_dataset_path(name)
+            if os.path.exists(dataset_path):
+                self.load_custom_dataset_from_folder(dataset_path)
+                logger.info(f"Dataset loaded successfully from {dataset_path}")
+            else:
+                logger.error(f"Dataset path {dataset_path} does not exist.")
+                raise ValueError(
+                    f"Dataset path {dataset_path} does not exist.")
+            # self._load_data_to_dataframe()
+            self.info = self.get_info(dataset_path)
         else:
-            logger.error(f"Dataset path {dataset_path} does not exist.")
-            raise ValueError(f"Dataset path {dataset_path} does not exist.")
-        # self._load_data_to_dataframe()
-
-        self.info = self.get_info(dataset_path)
+            logger.error(
+                f"Dataset path {dataset_path} does not exist. Please provide the correct path or use the exiting dataset."
+            )
+            raise ValueError(
+                f"Dataset path {dataset_path} does not exist. Please provide the correct path or use the exiting dataset."
+            )
 
     def _load_data_to_dataframe(self):
         """
@@ -182,180 +158,11 @@ def _load_data_to_dataframe(self):
                 "labels": self.get_labels(),
             }
         )
-        self.dataframe["text"] = [" ".join(words) for words in self.dataframe["tokens"]]
+        self.dataframe["text"] = [" ".join(words)
+                                  for words in self.dataframe["tokens"]]
         self.texts = self.dataframe["text"].tolist()
         self.labels = self.dataframe["labels"].tolist()
 
-    def get_package_dataset_path(self, name):
-        """
-        Get the path to the package dataset.
-
-        Parameters
-        ----------
-        name : str
-            Name of the dataset.
-
-        Returns
-        -------
-        str
-            Path to the dataset.
-        """
-        # Get the location of the installed package
-        package_name = "stream_topic"
-        spec = importlib.util.find_spec(package_name)
-        if spec is None:
-            raise ImportError(f"Cannot find the package '{package_name}'")
-
-        package_root_dir = os.path.dirname(spec.origin)
-
-        # Construct the full path to the dataset
-        dataset_path = os.path.join(package_root_dir, "preprocessed_datasets", name)
-
-        return dataset_path
-
-    def has_embeddings(self, embedding_model_name, path=None, file_name=None):
-        """
-        Check if embeddings are available for the dataset.
-
-        Parameters
-        ----------
-        embedding_model_name : str
-            Name of the embedding model used.
-        path : str, optional
-            Path where embeddings are expected to be saved.
-        file_name : str, optional
-            File name for the embeddings.
-
-        Returns
-        -------
-        bool
-            True if embeddings are available, False otherwise.
-        """
-        if path is None:
-            path = self.get_package_embeddings_path(self.name)
-        embeddings_file = (
-            os.path.join(path, file_name)
-            if file_name
-            else os.path.join(
-                path, f"{self.name}_embeddings_{embedding_model_name}.pkl"
-            )
-        )
-        return os.path.exists(embeddings_file)
-
-    def save_embeddings(
-        self, embeddings, embedding_model_name, path=None, file_name=None
-    ):
-        """
-        Save embeddings for the dataset.
-
-        Parameters
-        ----------
-        embeddings : np.ndarray
-            Embeddings to save.
-        embedding_model_name : str
-            Name of the embedding model used.
-        path : str, optional
-            Path to save the embeddings.
-        file_name : str, optional
-            File name for the embeddings.
-        """
-        try:
-            if path is None:
-                path = self.get_package_embeddings_path(self.name)
-
-            logger.info(f"Saving embeddings to path: {path}")
-
-            if not os.path.exists(path):
-                os.makedirs(path)
-                logger.info(f"Created directory: {path}")
-
-            embeddings_file = (
-                os.path.join(path, file_name)
-                if file_name
-                else os.path.join(
-                    path, f"{self.name}_embeddings_{embedding_model_name}.pkl"
-                )
-            )
-
-            logger.info(f"Embeddings file path: {embeddings_file}")
-
-            with open(embeddings_file, "wb") as file:
-                pickle.dump(embeddings, file)
-
-            logger.info("Embeddings saved successfully.")
-
-        except PermissionError as e:
-            logger.error(f"PermissionError: {e}")
-        except Exception as e:
-            logger.error(f"An error occurred: {e}")
-
-    def get_embeddings(self, embedding_model_name, path=None, file_name=None):
-        """
-        Get embeddings for the dataset.
-
-        Parameters
-        ----------
-        embedding_model_name : str
-            Name of the embedding model to use.
-        path : str, optional
-            Path to save the embeddings.
-        file_name : str, optional
-            File name for the embeddings.
-
-        Returns
-        -------
-        np.ndarray
-            Embeddings for the dataset.
-        """
-        if not self.has_embeddings(embedding_model_name, path, file_name):
-            raise ValueError(
-                "Embeddings are not available. Run the encoding process first or load embeddings."
-            )
-
-        # logger.info("--- Loading pre-computed document embeddings ---")
-
-        if self.embeddings is None:
-            if path is None:
-                path = self.get_package_embeddings_path(self.name)
-            embeddings_file = (
-                os.path.join(path, file_name)
-                if file_name
-                else os.path.join(
-                    path, f"{self.name}_embeddings_{embedding_model_name}.pkl"
-                )
-            )
-            with open(embeddings_file, "rb") as file:
-                self.embeddings = pickle.load(file)
-
-        return self.embeddings
-
-    def get_package_embeddings_path(self, name):
-        """
-        Get the path to the package embeddings.
-
-        Parameters
-        ----------
-        name : str
-            Name of the dataset.
-
-        Returns
-        -------
-        str
-            Path to the embeddings.
-        """
-        # Get the location of the installed package
-        package_name = "stream_topic"
-        spec = importlib.util.find_spec(package_name)
-        if spec is None:
-            raise ImportError(f"Cannot find the package '{package_name}'")
-
-        package_root_dir = os.path.dirname(spec.origin)
-
-        # Construct the full path to the dataset
-        embedding_path = os.path.join(package_root_dir, "pre_embedded_datasets", name)
-
-        return embedding_path
-
     def create_load_save_dataset(
         self,
         data,
@@ -390,18 +197,21 @@ def create_load_save_dataset(
         """
         if isinstance(data, pd.DataFrame):
             if doc_column is None:
-                raise ValueError("doc_column must be specified for DataFrame input")
+                raise ValueError(
+                    "doc_column must be specified for DataFrame input")
             documents = [
                 self.clean_text(str(row[doc_column])) for _, row in data.iterrows()
             ]
             labels = (
-                data[label_column].tolist() if label_column else [None] * len(documents)
+                data[label_column].tolist() if label_column else [
+                    None] * len(documents)
             )
         elif isinstance(data, list):
             documents = [self.clean_text(doc) for doc in data]
             labels = [None] * len(documents)
         else:
-            raise TypeError("data must be a pandas DataFrame or a list of documents")
+            raise TypeError(
+                "data must be a pandas DataFrame or a list of documents")
 
         # Initialize preprocessor with kwargs
         preprocessor = TextPreprocessor(**kwargs)
@@ -422,9 +232,9 @@ def create_load_save_dataset(
             logger.info(f"Creating directory: {save_dir}")
             os.makedirs(save_dir)
 
-        parquet_path = os.path.join(save_dir, f"{dataset_name}.parquet")
-        self.dataframe.to_parquet(parquet_path)
-        logger.info(f"Dataset saved to {parquet_path}")
+        local_parquet_path = os.path.join(save_dir, f"{dataset_name}.parquet")
+        self.dataframe.to_parquet(local_parquet_path)
+        logger.info(f"Dataset saved to {local_parquet_path}")
 
         # Save dataset information
         dataset_info = {
@@ -440,11 +250,6 @@ def create_load_save_dataset(
         with open(info_path, "wb") as info_file:
             pickle.dump(dataset_info, info_file)
         logger.info(f"Dataset info saved to {info_path}")
-
-        self.available_datasets.append(dataset_name)
-        logger.info(
-            f"Dataset name appended to avaliable datasets list: {self.available_datasets}"
-        )
         # return preprocessor
 
     def preprocess(self, model_type=None, custom_stopwords=None, **preprocessing_steps):
@@ -515,7 +320,8 @@ def preprocess(self, model_type=None, custom_stopwords=None, **preprocessing_ste
                     }
                 )
             except Exception as e:
-                raise RuntimeError(f"Error in dataset preprocessing: {e}") from e
+                raise RuntimeError(
+                    f"Error in dataset preprocessing: {e}") from e
         self.update_preprocessing_steps(**filtered_steps)
 
     def update_preprocessing_steps(self, **preprocessing_steps):
@@ -561,13 +367,13 @@ def get_info(self, dataset_path=None):
             raise ValueError(f"Dataset path {dataset_path} does not exist.")
 
         info_path = os.path.join(dataset_path, f"{self.name}_info.pkl")
-        if not os.path.exists(info_path):
-            raise FileNotFoundError(f"Dataset info file {info_path} does not exist.")
-
-        with open(info_path, "rb") as info_file:
-            dataset_info = pickle.load(info_file)
-
-        return dataset_info
+        if os.path.exists(info_path):
+            with open(info_path, "rb") as info_file:
+                dataset_info = pickle.load(info_file)
+            return dataset_info
+        else:
+            raise FileNotFoundError(
+                f"Dataset info file {info_path} does not exist.")
 
     @staticmethod
     def clean_text(text):
@@ -627,39 +433,6 @@ def __getitem__(self, idx):
             item["tfidf"] = self.tfidf[idx]
         return item
 
-    def load_custom_dataset_from_folder(self, dataset_path):
-        """
-        Load a custom dataset from a folder.
-
-        Parameters
-        ----------
-        dataset_path : str
-            Path to the dataset folder.
-        """
-        parquet_path = os.path.join(dataset_path, f"{self.name}.parquet")
-        if os.path.exists(parquet_path):
-            self.load_dataset_from_parquet(parquet_path)
-        else:
-            documents_path = os.path.join(dataset_path, "corpus.txt")
-            labels_path = os.path.join(dataset_path, "labels.txt")
-
-            with open(documents_path, encoding="utf-8") as f:
-                documents = f.readlines()
-
-            with open(labels_path, encoding="utf-8") as f:
-                labels = f.readlines()
-
-            self.dataframe = pd.DataFrame(
-                {
-                    "text": [doc.strip() for doc in documents],
-                    "labels": [label.strip() for label in labels],
-                }
-            )
-
-            self.dataframe["tokens"] = self.dataframe["text"].apply(lambda x: x.split())
-            self.texts = self.dataframe["text"].tolist()
-            self.labels = self.dataframe["labels"].tolist()
-
     def get_corpus(self):
         """
         Get the corpus (tokens) from the dataframe.
@@ -752,7 +525,8 @@ def get_bow(self, **kwargs):
         """
         corpus = [" ".join(tokens) for tokens in self.get_corpus()]
         vectorizer = CountVectorizer(**kwargs)
-        self.bow = vectorizer.fit_transform(corpus).toarray().astype(np.float32)
+        self.bow = vectorizer.fit_transform(
+            corpus).toarray().astype(np.float32)
         return self.bow, vectorizer.get_feature_names_out()
 
     def get_tfidf(self, **kwargs):
@@ -792,26 +566,6 @@ def has_word_embeddings(self, model_name):
         """
         return self.has_embeddings(model_name, "word_embeddings")
 
-    def save_word_embeddings(
-        self, word_embeddings, model_name, path=None, file_name=None
-    ):
-        """
-        Save word embeddings for the dataset.
-
-        Parameters
-        ----------
-        word_embeddings : dict
-            Word embeddings to save.
-        model_name : str
-            Name of the pre-trained model.
-        """
-        self.save_embeddings(
-            embeddings=word_embeddings,
-            embedding_model_name=model_name,
-            path=path,
-            file_name=file_name,
-        )
-
     def get_word_embeddings(self, model_name="glove-wiki-gigaword-100", vocab=None):
         """
         Get the word embeddings for the vocabulary using a pre-trained model.
@@ -845,7 +599,8 @@ def get_word_embeddings(self, model_name="glove-wiki-gigaword-100", vocab=None):
             # Load pre-trained model
             model = api.load(model_name)
 
-            embeddings = {word: model[word] for word in vocabulary if word in model}
+            embeddings = {word: model[word]
+                          for word in vocabulary if word in model}
 
         if model_name == "paraphrase-MiniLM-L3-v2":
             model = SentenceTransformer(model_name)
@@ -854,40 +609,11 @@ def get_word_embeddings(self, model_name="glove-wiki-gigaword-100", vocab=None):
                 vocabulary, convert_to_tensor=True, show_progress_bar=True
             )
 
-            embeddings = {word: embeddings[i] for i, word in enumerate(vocabulary)}
+            embeddings = {word: embeddings[i]
+                          for i, word in enumerate(vocabulary)}
 
             assert len(embeddings) == len(
                 vocabulary
             ), "Embeddings and vocabulary length mismatch"
 
         return embeddings
-
-    def _save_to_parquet(self, save_dir, dataset_name):
-        """
-        Save the dataset to a Parquet file.
-
-        Parameters
-        ----------
-        save_dir : str
-            Directory to save the dataset.
-        dataset_name : str
-            Name of the dataset.
-        """
-        save_path = os.path.join(save_dir, f"{dataset_name}.parquet")
-        self.dataframe.to_parquet(save_path, index=False)
-
-    def load_dataset_from_parquet(self, load_path):
-        """
-        Load a dataset from a Parquet file.
-
-        Parameters
-        ----------
-        load_path : str
-            Path to the Parquet file.
-        """
-        if not os.path.exists(load_path):
-            raise FileNotFoundError(f"File {load_path} does not exist.")
-        self.dataframe = pd.read_parquet(load_path)
-        self.dataframe["tokens"] = self.dataframe["text"].apply(lambda x: x.split())
-        self.texts = self.dataframe["text"].tolist()
-        self.labels = self.dataframe["labels"].tolist()