diff --git a/.gitignore b/.gitignore index ded9b01736..0ba0568bed 100644 --- a/.gitignore +++ b/.gitignore @@ -183,4 +183,11 @@ post-checkout post-commit post-merge pre-push -docs/notebooks/lightning_logs/* \ No newline at end of file +docs/notebooks/lightning_logs/* +docs/notebooks/lightning_logs +docs/notebooks/data +docs/notebooks/data/* +docs/notebooks/embeddings +docs/notebooks/embeddings/* +docs/notebooks/checkpoints +docs/notebooks/checkpoints/* \ No newline at end of file diff --git a/docs/notebooks/datasets.ipynb b/docs/notebooks/datasets.ipynb index b4af4de829..764e080c66 100644 --- a/docs/notebooks/datasets.ipynb +++ b/docs/notebooks/datasets.ipynb @@ -4,8 +4,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AnFreTh/STREAM/blob/develop/docs/notebooks/datasets.ipynb)\n", - "[![Open On GitHub](https://img.shields.io/badge/Open-on%20GitHub-blue?logo=GitHub)](https://github.com/AnFreTh/STREAM/blob/develop/docs/notebooks/datasets.ipynb)\n", + "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AnFreTh/STREAM/blob/main/docs/notebooks/datasets.ipynb)\n", + "[![Open On GitHub](https://img.shields.io/badge/Open-on%20GitHub-blue?logo=GitHub)](https://github.com/AnFreTh/STREAM/blob/main/docs/notebooks/datasets.ipynb)\n", "\n", "# Datasets" ] @@ -33,13 +33,25 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, + "outputs": [], + "source": [ + "# uncomment the below line if running in Colab\n", + "# package needs to be installed for the notebook to run\n", + "\n", + "# !
pip install -U stream_topic" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/opt/homebrew/Caskroom/miniforge/base/envs/db/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", - " warnings.warn(\n" + "/opt/homebrew/Caskroom/miniforge/base/envs/topicm/lib/python3.10/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n", + " from tqdm.autonotebook import tqdm, trange\n" ] } ], @@ -60,37 +72,6 @@ "- these datasets are included in the package and can be loaded using the `TMDataset` module" ] }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['Stocktwits_GME_large',\n", - " 'BBC_News',\n", - " 'Stocktwits_GME',\n", - " 'Reddit_GME',\n", - " 'Reuters',\n", - " 'Spotify',\n", - " '20NewsGroups',\n", - " 'DummyDataset',\n", - " 'Spotify_most_popular',\n", - " 'Poliblogs',\n", - " 'Spotify_least_popular']" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dataset = TMDataset()\n", - "dataset.get_dataset_list()" - ] - }, { "cell_type": "code", "execution_count": 3, @@ -100,12 +81,16 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2024-08-07 10:31:30.489\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m159\u001b[0m - \u001b[1mFetching dataset: Reuters\u001b[0m\n", - "\u001b[32m2024-08-07 10:31:31.978\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m165\u001b[0m - \u001b[1mDataset loaded successfully from /opt/homebrew/Caskroom/miniforge/base/envs/db/lib/python3.10/site-packages/stream_topic/preprocessed_datasets/Reuters\u001b[0m\n" + "\u001b[32m2024-08-09 12:13:26.847\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m118\u001b[0m - \u001b[1mFetching dataset: Reuters\u001b[0m\n", + "\u001b[32m2024-08-09 12:13:26.914\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m331\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n", + "\u001b[32m2024-08-09 12:13:27.147\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "\u001b[32m2024-08-09 12:13:27.313\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n", + "\u001b[32m2024-08-09 12:13:27.456\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m363\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/\u001b[0m\n" ] } ], "source": [ + "dataset = TMDataset()\n", "dataset.fetch_dataset(name=\"Reuters\")" ] }, @@ -181,10 +166,13 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2024-08-07 10:31:33.085\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m151\u001b[0m - \u001b[1mDataset name already provided while instantiating the class: 
Reuters\u001b[0m\n", - "\u001b[32m2024-08-07 10:31:33.086\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1mOverwriting the dataset name with the provided name in fetch_dataset: Spotify\u001b[0m\n", - "\u001b[32m2024-08-07 10:31:33.086\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m156\u001b[0m - \u001b[1mFetching dataset: Spotify\u001b[0m\n", - "\u001b[32m2024-08-07 10:31:33.190\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m165\u001b[0m - \u001b[1mDataset loaded successfully from /opt/homebrew/Caskroom/miniforge/base/envs/db/lib/python3.10/site-packages/stream_topic/preprocessed_datasets/Spotify\u001b[0m\n" + "\u001b[32m2024-08-09 12:13:28.464\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m108\u001b[0m - \u001b[1mDataset name already provided while instantiating the class: Reuters\u001b[0m\n", + "\u001b[32m2024-08-09 12:13:28.464\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m111\u001b[0m - \u001b[1mOverwriting the dataset name with the name provided in fetch_dataset: Spotify\u001b[0m\n", + "\u001b[32m2024-08-09 12:13:28.465\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m115\u001b[0m - \u001b[1mFetching dataset: Spotify\u001b[0m\n", + "\u001b[32m2024-08-09 12:13:28.539\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m331\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n", + "\u001b[32m2024-08-09 12:13:28.749\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "\u001b[32m2024-08-09 12:13:28.923\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n", + "\u001b[32m2024-08-09 12:13:29.058\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m363\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/\u001b[0m\n" ] } ], @@ -451,7 +439,7 @@ { "data": { "text/plain": [ - "[75, 58, 37, 45, 41]" + "[75, 58]" ] }, "execution_count": 11, @@ -460,7 +448,7 @@ } ], "source": [ - "dataset.labels[:5]" + "dataset.labels[:2]" ] }, { @@ -475,18 +463,6 @@ "execution_count": 12, "metadata": {}, "outputs": [], - "source": [ - "from stream_topic.utils import TMDataset\n", - "\n", - "import warnings\n", - "warnings.filterwarnings(\"ignore\")" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", @@ -508,19 +484,16 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Preprocessing documents: 100%|██████████| 1000/1000 [00:03<00:00, 267.71it/s]\n", - "\u001b[32m2024-08-07 10:31:37.027\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m409\u001b[0m - \u001b[1mDataset save directory does not exist: data/\u001b[0m\n", - "\u001b[32m2024-08-07 10:31:37.027\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m410\u001b[0m - \u001b[1mCreating directory: data/\u001b[0m\n", - "\u001b[32m2024-08-07 10:31:37.031\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m415\u001b[0m - \u001b[1mDataset saved to data/sample_data.parquet\u001b[0m\n", - "\u001b[32m2024-08-07 10:31:37.032\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m430\u001b[0m - \u001b[1mDataset info saved to data/sample_data_info.pkl\u001b[0m\n", - "\u001b[32m2024-08-07 10:31:37.032\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m433\u001b[0m - \u001b[1mDataset name appended to avaliable datasets list: ['Stocktwits_GME_large', 'BBC_News', 'Stocktwits_GME', 'Reddit_GME', 'Reuters', 'Spotify', '20NewsGroups', 'DummyDataset', 'Spotify_most_popular', 'Poliblogs', 'Spotify_least_popular', 'sample_data']\u001b[0m\n" + "Preprocessing documents: 100%|██████████| 1000/1000 [00:03<00:00, 263.32it/s]\n", + "\u001b[32m2024-08-09 12:13:32.967\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m237\u001b[0m - \u001b[1mDataset saved to data/sample_data.parquet\u001b[0m\n", + "\u001b[32m2024-08-09 12:13:32.968\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m252\u001b[0m - \u001b[1mDataset info saved to data/sample_data_info.pkl\u001b[0m\n" ] } ], @@ -537,27 +510,27 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2024-08-07 10:31:37.036\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m159\u001b[0m - \u001b[1mFetching dataset: sample_data\u001b[0m\n", - "\u001b[32m2024-08-07 10:31:37.045\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m165\u001b[0m - \u001b[1mDataset loaded successfully from data/\u001b[0m\n" + "\u001b[32m2024-08-09 12:13:32.972\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m118\u001b[0m - \u001b[1mFetching dataset: sample_data\u001b[0m\n", + "\u001b[32m2024-08-09 12:13:32.973\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m128\u001b[0m - \u001b[1mFetching dataset from local path\u001b[0m\n" ] } ], "source": [ "# the new data is saved in the data folder unlike the default datasets which are saved in package directory under preprocessed_data folder.\n", "# therefore, you need to provide the path to the data folder to fetch the dataset\n", - "dataset.fetch_dataset(name=\"sample_data\", dataset_path=\"data/\")" + "dataset.fetch_dataset(name=\"sample_data\", dataset_path=\"data/\", source=\"local\")" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -630,7 +603,7 @@ "4 BGHXO 3 [BGHXO]" ] }, - "execution_count": 16, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -639,6 +612,13 @@ "dataset.dataframe.head()" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, @@ -663,7 +643,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.0" + "version": "3.10.14" } }, "nbformat": 4, diff --git a/docs/notebooks/datasets.md b/docs/notebooks/datasets.md deleted file mode 100644 index 
32eb43deae..0000000000 --- a/docs/notebooks/datasets.md +++ /dev/null @@ -1,139 +0,0 @@ ---- -jupytext: - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.16.4 -kernelspec: - display_name: topicm - language: python - name: python3 ---- - -[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AnFreTh/STREAM/blob/develop/docs/notebooks/datasets.ipynb) -[![Open On GitHub](https://img.shields.io/badge/Open-on%20GitHub-blue?logo=GitHub)](https://github.com/AnFreTh/STREAM/blob/develop/docs/notebooks/datasets.ipynb) - -# Datasets - -+++ - -The dataset module provides and easy way to load and preprocess the datasets. The package comes with a few datasets that are commonly used in topic modleing research. The datasets are: - - - 20NewsGroup - - BBC_News - - Stocktwits_GME - - Reddit_GME' - - Reuters' - - Spotify - - Spotify_most_popular - - Poliblogs - - Spotify_least_popular - -Please see the functionalities availabe in the `TMDataset` module. 
- -```{code-cell} ipython3 -from stream_topic.utils import TMDataset - -import warnings -warnings.filterwarnings("ignore") -``` - -## Using default datasets - -- these datasets are already preprocessed and ready to be used for topic modeling -- these datasets are included in the package and can be loaded using the `TMDataset` module - -```{code-cell} ipython3 -dataset = TMDataset() -dataset.get_dataset_list() -``` - -```{code-cell} ipython3 -dataset.fetch_dataset(name="Reuters") -``` - -```{code-cell} ipython3 -dataset.get_bow() -``` - -```{code-cell} ipython3 -dataset.get_tfidf() -``` - -```{code-cell} ipython3 -# dataset.get_word_embeddings() -``` - -```{code-cell} ipython3 -dataset.fetch_dataset('Spotify') -``` - -```{code-cell} ipython3 -dataset.dataframe.head() -``` - -```{code-cell} ipython3 -dataset.texts[:2] -``` - -```{code-cell} ipython3 -dataset.tokens -``` - -```{code-cell} ipython3 -dataset.labels[:5] -``` - -## Loading own dataset - -```{code-cell} ipython3 -from stream_topic.utils import TMDataset - -import warnings -warnings.filterwarnings("ignore") -``` - -```{code-cell} ipython3 -import pandas as pd -import numpy as np - - -# Simulating some example data -np.random.seed(0) - -# Generate 1000 random strings of lengths between 1 and 5, containing letters 'A' to 'Z' -random_documents = [''.join(np.random.choice(list('ABCDEFGHIJKLMNOPQRSTUVWXYZ'), - np.random.randint(1, 6))) for _ in range(1000)] - -# Generate 1000 random labels from 1 to 4 as strings -random_labels = np.random.choice(['1', '2', '3', '4'], 1000) - -# Create DataFrame -my_data = pd.DataFrame({"Documents": random_documents, "Labels": random_labels}) -``` - -```{code-cell} ipython3 -dataset = TMDataset() -dataset.create_load_save_dataset( - data=my_data, - dataset_name="sample_data", - save_dir="data/", - doc_column="Documents", - label_column="Labels" - ) -``` - -```{code-cell} ipython3 -# the new data is saved in the data folder unlike the default datasets which are saved in package 
directory under preprocessed_data folder. -# therefore, you need to provide the path to the data folder to fetch the dataset -dataset.fetch_dataset(name="sample_data", dataset_path="data/") -``` - -```{code-cell} ipython3 -dataset.dataframe.head() -``` - -```{code-cell} ipython3 - -``` diff --git a/docs/notebooks/examples.ipynb b/docs/notebooks/examples.ipynb index 04486e068d..254270423b 100644 --- a/docs/notebooks/examples.ipynb +++ b/docs/notebooks/examples.ipynb @@ -4,17 +4,38 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AnFreTh/STREAM/blob/develop/docs/notebooks/examples.ipynb)\n", - "[![Open On GitHub](https://img.shields.io/badge/Open-on%20GitHub-blue?logo=GitHub)](https://github.com/AnFreTh/STREAM/blob/develop/docs/notebooks/examples.ipynb)\n", + "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AnFreTh/STREAM/blob/main/docs/notebooks/examples.ipynb)\n", + "[![Open On GitHub](https://img.shields.io/badge/Open-on%20GitHub-blue?logo=GitHub)](https://github.com/AnFreTh/STREAM/blob/main/docs/notebooks/examples.ipynb)\n", "\n", "# Examples" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 1, "metadata": {}, "outputs": [], + "source": [ + "# uncomment the below line if running in Colab\n", + "# package needs to be installed for the notebook to run\n", + "\n", + "# ! pip install -U stream_topic" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/homebrew/Caskroom/miniforge/base/envs/topicm/lib/python3.10/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g.
in jupyter console)\n", + " from tqdm.autonotebook import tqdm, trange\n" + ] + } + ], "source": [ "from stream_topic.models import KmeansTM\n", "from stream_topic.utils import TMDataset" @@ -29,16 +50,19 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2024-08-07 10:42:28.407\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m162\u001b[0m - \u001b[1mFetching dataset: BBC_News\u001b[0m\n", - "\u001b[32m2024-08-07 10:42:28.494\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m168\u001b[0m - \u001b[1mDataset loaded successfully from c:\\users\\anton\\desktop\\projects\\stream\\stream_topic\\preprocessed_datasets\\BBC_News\u001b[0m\n", - "Preprocessing documents: 100%|██████████| 2225/2225 [00:29<00:00, 75.02it/s] \n" + "\u001b[32m2024-08-09 12:13:58.725\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m118\u001b[0m - \u001b[1mFetching dataset: BBC_News\u001b[0m\n", + "\u001b[32m2024-08-09 12:13:58.815\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m331\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n", + "\u001b[32m2024-08-09 12:13:59.016\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "\u001b[32m2024-08-09 12:13:59.135\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mDownloading dataset info 
from github\u001b[0m\n", + "\u001b[32m2024-08-09 12:13:59.267\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m363\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "Preprocessing documents: 100%|██████████| 2225/2225 [00:11<00:00, 198.75it/s]\n" ] } ], @@ -50,80 +74,103 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "[I 2024-08-07 10:42:58,290] A new study created in memory with name: no-name-c7f401f5-97c5-4cdb-857b-6f50683772d8\n", - "\u001b[32m2024-08-07 10:42:58.290\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:42:58.290\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:42:58.290\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:43:14.207\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:43:15.111\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. 
---\u001b[0m\n", - "[I 2024-08-07 10:43:15,111] Trial 0 finished with value: -2908.3807001524565 and parameters: {'n_topics': 5, 'n_neighbors': 25, 'n_components': 45, 'metric': 'cosine', 'init': 'k-means++', 'n_init': 12, 'max_iter': 521}. Best is trial 0 with value: -2908.3807001524565.\n", - "\u001b[32m2024-08-07 10:43:15.111\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:43:15.111\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:43:15.111\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:43:24.245\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:43:25.067\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n", - "[I 2024-08-07 10:43:25,069] Trial 1 finished with value: -3369.708145534729 and parameters: {'n_topics': 8, 'n_neighbors': 44, 'n_components': 15, 'metric': 'euclidean', 'init': 'random', 'n_init': 18, 'max_iter': 719}. 
Best is trial 1 with value: -3369.708145534729.\n", - "\u001b[32m2024-08-07 10:43:25.072\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:43:25.072\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:43:25.073\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:43:31.847\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:43:32.734\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n", - "[I 2024-08-07 10:43:32,734] Trial 2 finished with value: -2352.3949029947194 and parameters: {'n_topics': 13, 'n_neighbors': 10, 'n_components': 11, 'metric': 'euclidean', 'init': 'k-means++', 'n_init': 11, 'max_iter': 747}. 
Best is trial 1 with value: -3369.708145534729.\n", - "\u001b[32m2024-08-07 10:43:32.743\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:43:32.743\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:43:32.743\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:43:43.007\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:43:44.275\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n", - "[I 2024-08-07 10:43:44,275] Trial 3 finished with value: -3050.2628007635785 and parameters: {'n_topics': 8, 'n_neighbors': 30, 'n_components': 31, 'metric': 'cosine', 'init': 'k-means++', 'n_init': 30, 'max_iter': 399}. 
Best is trial 1 with value: -3369.708145534729.\n", - "\u001b[32m2024-08-07 10:43:44.275\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:43:44.275\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:43:44.275\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:43:53.013\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:43:53.935\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n", - "[I 2024-08-07 10:43:53,936] Trial 4 finished with value: -3077.4218904505196 and parameters: {'n_topics': 4, 'n_neighbors': 31, 'n_components': 32, 'metric': 'cosine', 'init': 'k-means++', 'n_init': 21, 'max_iter': 228}. 
Best is trial 1 with value: -3369.708145534729.\n", - "\u001b[32m2024-08-07 10:43:53.936\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:43:53.936\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:43:53.936\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:44:04.137\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:44:05.323\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n", - "[I 2024-08-07 10:44:05,325] Trial 5 finished with value: -3316.4025635845546 and parameters: {'n_topics': 8, 'n_neighbors': 47, 'n_components': 39, 'metric': 'cosine', 'init': 'k-means++', 'n_init': 22, 'max_iter': 664}. 
Best is trial 1 with value: -3369.708145534729.\n", - "\u001b[32m2024-08-07 10:44:05.325\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:44:05.325\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:44:05.325\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:44:14.288\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:44:15.314\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n", - "[I 2024-08-07 10:44:15,314] Trial 6 finished with value: -2375.983772695624 and parameters: {'n_topics': 3, 'n_neighbors': 14, 'n_components': 35, 'metric': 'euclidean', 'init': 'random', 'n_init': 20, 'max_iter': 889}. 
Best is trial 1 with value: -3369.708145534729.\n", - "\u001b[32m2024-08-07 10:44:15.314\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:44:15.314\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:44:15.314\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:44:23.368\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:44:24.310\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n", - "[I 2024-08-07 10:44:24,310] Trial 7 finished with value: -2731.111202206049 and parameters: {'n_topics': 15, 'n_neighbors': 19, 'n_components': 10, 'metric': 'cosine', 'init': 'k-means++', 'n_init': 14, 'max_iter': 492}. 
Best is trial 1 with value: -3369.708145534729.\n", - "\u001b[32m2024-08-07 10:44:24.310\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:44:24.310\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:44:24.310\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:44:31.870\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:44:32.529\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n", - "[I 2024-08-07 10:44:32,529] Trial 8 finished with value: -1749.2967260199991 and parameters: {'n_topics': 13, 'n_neighbors': 10, 'n_components': 18, 'metric': 'cosine', 'init': 'random', 'n_init': 11, 'max_iter': 412}. 
Best is trial 1 with value: -3369.708145534729.\n", - "\u001b[32m2024-08-07 10:44:32.529\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:44:32.529\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:44:32.529\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:44:41.535\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:44:42.270\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n", - "[I 2024-08-07 10:44:42,270] Trial 9 finished with value: -2846.653885685785 and parameters: {'n_topics': 15, 'n_neighbors': 29, 'n_components': 29, 'metric': 'cosine', 'init': 'random', 'n_init': 21, 'max_iter': 428}. 
Best is trial 1 with value: -3369.708145534729.\n", - "\u001b[32m2024-08-07 10:44:42.270\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36moptimize_hyperparameters\u001b[0m:\u001b[36m389\u001b[0m - \u001b[1mOptimal parameters: {'n_neighbors': 44, 'n_components': 15, 'metric': 'euclidean', 'init': 'random', 'n_init': 18, 'max_iter': 719} with 8 topics based on AIC.\u001b[0m\n", - "\u001b[32m2024-08-07 10:44:42.270\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:44:42.270\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:44:42.270\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:44:51.236\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:44:51.937\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. 
---\u001b[0m\n" + "[I 2024-08-09 12:14:10,511] A new study created in memory with name: no-name-5aa7ebf6-4f37-4290-86e6-cebeec5398df\n", + "\u001b[32m2024-08-09 12:14:10.513\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:10.577\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:10.648\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:10.866\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:10.875\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", + "OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.\n", + "\u001b[32m2024-08-09 12:14:16.104\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:16.705\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. 
---\u001b[0m\n", + "[I 2024-08-09 12:14:16,706] Trial 0 finished with value: -2747.044881484823 and parameters: {'n_topics': 4, 'n_neighbors': 24, 'n_components': 30, 'metric': 'cosine', 'init': 'random', 'n_init': 16, 'max_iter': 499}. Best is trial 0 with value: -2747.044881484823.\n", + "\u001b[32m2024-08-09 12:14:16.708\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:16.778\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:16.847\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:17.117\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:17.118\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:19.615\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:20.994\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m 
- \u001b[1m--- Training completed successfully. ---\u001b[0m\n", + "[I 2024-08-09 12:14:20,995] Trial 1 finished with value: -2453.3441802457037 and parameters: {'n_topics': 13, 'n_neighbors': 11, 'n_components': 25, 'metric': 'euclidean', 'init': 'k-means++', 'n_init': 18, 'max_iter': 191}. Best is trial 0 with value: -2747.044881484823.\n", + "\u001b[32m2024-08-09 12:14:20.997\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:21.075\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:21.154\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:21.376\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:21.377\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:24.749\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:25.399\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n", + "[I 2024-08-09 12:14:25,400] Trial 2 finished with value: -3128.164941984999 and parameters: {'n_topics': 12, 'n_neighbors': 38, 'n_components': 43, 'metric': 'cosine', 'init': 'k-means++', 'n_init': 10, 'max_iter': 829}. Best is trial 2 with value: -3128.164941984999.\n", + "\u001b[32m2024-08-09 12:14:25.401\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:25.474\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:25.570\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:25.790\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:25.791\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:28.291\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", + 
"\u001b[32m2024-08-09 12:14:28.612\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n", + "[I 2024-08-09 12:14:28,613] Trial 3 finished with value: -3134.0457564239455 and parameters: {'n_topics': 7, 'n_neighbors': 29, 'n_components': 32, 'metric': 'euclidean', 'init': 'random', 'n_init': 15, 'max_iter': 318}. Best is trial 3 with value: -3134.0457564239455.\n", + "\u001b[32m2024-08-09 12:14:28.614\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:28.685\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:28.753\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:28.952\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:28.953\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:32.261\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - 
\u001b[1m--- Creating document cluster ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:33.410\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n", + "[I 2024-08-09 12:14:33,411] Trial 4 finished with value: -3016.9434231651817 and parameters: {'n_topics': 15, 'n_neighbors': 31, 'n_components': 45, 'metric': 'cosine', 'init': 'k-means++', 'n_init': 19, 'max_iter': 481}. Best is trial 3 with value: -3134.0457564239455.\n", + "\u001b[32m2024-08-09 12:14:33.413\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:33.485\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:33.564\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:33.776\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:33.777\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:36.292\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:36.600\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n", + "[I 2024-08-09 12:14:36,601] Trial 5 finished with value: -3055.0335652501776 and parameters: {'n_topics': 18, 'n_neighbors': 29, 'n_components': 8, 'metric': 'euclidean', 'init': 'random', 'n_init': 12, 'max_iter': 496}. Best is trial 3 with value: -3134.0457564239455.\n", + "\u001b[32m2024-08-09 12:14:36.602\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:36.678\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:36.759\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:36.979\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:36.980\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", + 
"\u001b[32m2024-08-09 12:14:39.683\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:40.085\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n", + "[I 2024-08-09 12:14:40,086] Trial 6 finished with value: -3156.918997163559 and parameters: {'n_topics': 17, 'n_neighbors': 31, 'n_components': 30, 'metric': 'euclidean', 'init': 'random', 'n_init': 28, 'max_iter': 164}. Best is trial 6 with value: -3156.918997163559.\n", + "\u001b[32m2024-08-09 12:14:40.088\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:40.158\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:40.233\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:40.464\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:40.465\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - 
\u001b[1m--- Reducing dimensions ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:43.529\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:45.069\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n", + "[I 2024-08-09 12:14:45,070] Trial 7 finished with value: -2489.982411751191 and parameters: {'n_topics': 9, 'n_neighbors': 15, 'n_components': 35, 'metric': 'cosine', 'init': 'k-means++', 'n_init': 25, 'max_iter': 457}. Best is trial 6 with value: -3156.918997163559.\n", + "\u001b[32m2024-08-09 12:14:45.072\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:45.145\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:45.224\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:45.458\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:45.460\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:48.272\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:48.598\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n", + "[I 2024-08-09 12:14:48,599] Trial 8 finished with value: -1863.7949187027652 and parameters: {'n_topics': 8, 'n_neighbors': 10, 'n_components': 17, 'metric': 'cosine', 'init': 'random', 'n_init': 20, 'max_iter': 354}. Best is trial 6 with value: -3156.918997163559.\n", + "\u001b[32m2024-08-09 12:14:48.600\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:48.668\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:48.750\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:48.978\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "\u001b[32m2024-08-09 
12:14:48.979\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:52.247\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:52.643\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n", + "[I 2024-08-09 12:14:52,644] Trial 9 finished with value: -3223.5689310849125 and parameters: {'n_topics': 11, 'n_neighbors': 42, 'n_components': 8, 'metric': 'cosine', 'init': 'k-means++', 'n_init': 25, 'max_iter': 568}. Best is trial 9 with value: -3223.5689310849125.\n", + "\u001b[32m2024-08-09 12:14:52.644\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36moptimize_hyperparameters\u001b[0m:\u001b[36m389\u001b[0m - \u001b[1mOptimal parameters: {'n_neighbors': 42, 'n_components': 8, 'metric': 'cosine', 'init': 'k-means++', 'n_init': 25, 'max_iter': 568} with 11 topics based on AIC.\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:52.645\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:52.939\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:53.010\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:53.214\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:53.215\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:56.152\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:14:56.525\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n" ] } ], @@ -134,14 +181,14 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "8\n" + "11\n" ] } ], @@ -159,16 +206,24 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/homebrew/Caskroom/miniforge/base/envs/topicm/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. 
If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n" + ] + }, { "data": { "text/plain": [ - "0.15247" + "0.23371" ] }, - "execution_count": 5, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -182,16 +237,16 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.20205374993383884" + "0.2090254547921094" ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -200,13 +255,20 @@ "isim_metric = ISIM()\n", "isim_metric.score(topics)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python (stream_topic_venv)", + "display_name": "db", "language": "python", - "name": "stream_topic_venv" + "name": "python3" }, "language_info": { "codemirror_mode": { diff --git a/docs/notebooks/quickstart.ipynb b/docs/notebooks/quickstart.ipynb index 8eabe8e700..8286df3f5f 100644 --- a/docs/notebooks/quickstart.ipynb +++ b/docs/notebooks/quickstart.ipynb @@ -4,12 +4,24 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AnFreTh/STREAM/blob/develop/docs/notebooks/quickstart.ipynb)\n", - "[![Open On GitHub](https://img.shields.io/badge/Open-on%20GitHub-blue?logo=GitHub)](https://github.com/AnFreTh/STREAM/blob/develop/docs/notebooks/quickstart.ipynb)\n", + "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AnFreTh/STREAM/blob/main/docs/notebooks/quickstart.ipynb)\n", + "[![Open On GitHub](https://img.shields.io/badge/Open-on%20GitHub-blue?logo=GitHub)](https://github.com/AnFreTh/STREAM/blob/main/docs/notebooks/quickstart.ipynb)\n", "\n", "# Quickstart" ] }, + { + "cell_type": "code", + 
"execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# uncomment the below line if running in Colab\n", + "# package needs to be installed for the notebook to run\n", + "\n", + "# ! pip install -U stream_topic" + ] + }, { "cell_type": "code", "execution_count": 8, @@ -36,6 +48,12 @@ "name": "stderr", "output_type": "stream", "text": [ + "\u001b[32m2024-08-09 12:34:06.391\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m118\u001b[0m - \u001b[1mFetching dataset: BBC_News\u001b[0m\n", + "\u001b[32m2024-08-09 12:34:06.592\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m331\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n", + "\u001b[32m2024-08-09 12:34:06.796\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "\u001b[32m2024-08-09 12:34:07.111\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n", + "\u001b[32m2024-08-09 12:34:07.250\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m363\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "Preprocessing documents: 100%|██████████| 2225/2225 [00:11<00:00, 200.32it/s]\n" ] } ], @@ -54,13 +72,18 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2024-08-07 10:53:54.527\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.CEDC\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m241\u001b[0m - 
\u001b[1m--- Training CEDC topic model ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:53:54.527\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:53:54.537\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:54:03.304\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.CEDC\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m175\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:54:05.490\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.CEDC\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m259\u001b[0m - \u001b[1m--- Extract topics ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:54:21.731\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.CEDC\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m284\u001b[0m - \u001b[1m--- Training completed successfully. 
---\u001b[0m\n" - + "\u001b[32m2024-08-09 12:34:18.398\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.CEDC\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m241\u001b[0m - \u001b[1m--- Training CEDC topic model ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:34:18.695\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:34:18.781\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", + "\u001b[32m2024-08-09 12:34:18.993\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "\u001b[32m2024-08-09 12:34:19.000\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:34:22.191\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.CEDC\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m175\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", + "/opt/homebrew/Caskroom/miniforge/base/envs/topicm/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning:\n", + "\n", + "`resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. 
If you want to force a new download, use `force_download=True`.\n", + "\n", + "\u001b[32m2024-08-09 12:34:27.876\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.CEDC\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m259\u001b[0m - \u001b[1m--- Extract topics ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:34:32.056\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.CEDC\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m284\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n" ] } ], @@ -72,9 +95,18 @@ { "cell_type": "code", "execution_count": 11, - "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/homebrew/Caskroom/miniforge/base/envs/topicm/lib/python3.10/site-packages/umap/umap_.py:2437: UserWarning:\n", + "\n", + "n_neighbors is larger than the dataset size; truncating to X.shape[0] - 1\n", + "\n" + ] + }, { "data": { "text/html": [ @@ -90,8 +122,7 @@ " " ], "text/plain": [ - "" - + "" ] }, "metadata": {}, @@ -118,24 +149,32 @@ { "cell_type": "code", "execution_count": 12, - "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2024-08-07 10:54:31.049\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m162\u001b[0m - \u001b[1mFetching dataset: BBC_News\u001b[0m\n", - "\u001b[32m2024-08-07 10:54:31.138\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m168\u001b[0m - \u001b[1mDataset loaded successfully from c:\\users\\anton\\desktop\\projects\\stream\\stream_topic\\preprocessed_datasets\\BBC_News\u001b[0m\n", - "Preprocessing documents: 100%|██████████| 2225/2225 [00:24<00:00, 91.41it/s] \n", - "\u001b[32m2024-08-07 10:54:55.545\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m 
- \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:54:55.545\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.ctmneg\u001b[0m:\u001b[36m_initialize_datamodule\u001b[0m:\u001b[36m314\u001b[0m - \u001b[1m--- Initializing Datamodule for CTMNeg ---\u001b[0m\n", - "\u001b[32m2024-08-07 10:54:56.023\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.ctmneg\u001b[0m:\u001b[36m_initialize_trainer\u001b[0m:\u001b[36m273\u001b[0m - \u001b[1m--- Initializing Trainer for CTMNeg ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:34:35.593\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m118\u001b[0m - \u001b[1mFetching dataset: BBC_News\u001b[0m\n", + "\u001b[32m2024-08-09 12:34:35.670\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m331\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n", + "\u001b[32m2024-08-09 12:34:35.869\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "\u001b[32m2024-08-09 12:34:35.975\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n", + "\u001b[32m2024-08-09 12:34:36.099\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m363\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "Preprocessing documents: 100%|██████████| 2225/2225 [00:11<00:00, 198.65it/s]\n", + "\u001b[32m2024-08-09 
12:34:47.410\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:34:47.483\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", + "\u001b[32m2024-08-09 12:34:47.709\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "\u001b[32m2024-08-09 12:34:47.717\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.ctmneg\u001b[0m:\u001b[36m_initialize_datamodule\u001b[0m:\u001b[36m314\u001b[0m - \u001b[1m--- Initializing Datamodule for CTMNeg ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:34:48.183\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.ctmneg\u001b[0m:\u001b[36m_initialize_trainer\u001b[0m:\u001b[36m273\u001b[0m - \u001b[1m--- Initializing Trainer for CTMNeg ---\u001b[0m\n", "Trainer already configured with model summary callbacks: []. 
Skipping setting a default `ModelSummary` callback.\n", - "GPU available: False, used: False\n", + "GPU available: True (mps), used: True\n", "TPU available: False, using: 0 TPU cores\n", "HPU available: False, using: 0 HPUs\n", - "\u001b[32m2024-08-07 10:54:56.101\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.ctmneg\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m457\u001b[0m - \u001b[1m--- Training CTMNeg topic model ---\u001b[0m\n", + "\u001b[32m2024-08-09 12:34:48.201\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.ctmneg\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m457\u001b[0m - \u001b[1m--- Training CTMNeg topic model ---\u001b[0m\n", + "/opt/homebrew/Caskroom/miniforge/base/envs/topicm/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:652: UserWarning:\n", + "\n", + "Checkpoint directory /Volumes/Research/Repositories/STREAM/docs/notebooks/checkpoints exists and is not empty.\n", + "\n", "\n", " | Name | Type | Params | Mode \n", "----------------------------------------------------------------------\n", @@ -154,24 +193,66 @@ ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch 0: 100%|██████████| 56/56 [00:05<00:00, 10.92it/s, v_num=1, train_loss_step=nan.0, val_loss_step=nan.0, val_loss_epoch=nan.0, train_loss_epoch=nan.0]" - ] + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "637189591d0e4df98eece6259647b46b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Sanity Checking: | | 0/? [00:00