diff --git a/README.md b/README.md index d4b06d585a..1540d88751 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ # STREAM -We present STREAM, a Simplified Topic Retrieval, Exploration, and Analysis Module for user-friendly topic modelling and especially subsequent interactive topic visualization and analysis. +We present STREAM, a Simplified Topic Retrieval, Exploration, and Analysis Module for user-friendly topic modelling and especially subsequent interactive topic visualization and analysis. Our paper can be found [here](https://aclanthology.org/2024.acl-short.41.pdf). # Table of Contents - [STREAM](#stream) @@ -46,7 +46,7 @@ We present STREAM, a Simplified Topic Retrieval, Exploration, and Analysis Modul - [Validation Criteria](#validation-criteria) - [Submitting Your Contribution](#submitting-your-contribution) - [Citation](#citation) - - [Paper 1 TBD](#paper-1-tbd) + - [STREAM](#stream-1) - [Metrics and CEDC](#metrics-and-cedc) - [TNTM](#tntm) - [DCTE](#dcte) @@ -80,6 +80,9 @@ Make additionally sure to download the necessary [nltk](https://www.nltk.org/) r ```python import nltk +nltk.download('stopwords') +nltk.download('punkt') +nltk.download('wordnet') nltk.download('averaged_perceptron_tagger') ``` @@ -604,18 +607,16 @@ If you want to include a new model where these guidelines are not appropriate ple If you use this project in your research, please consider citing: -### Paper 1 TBD +### STREAM ```bibtex -@article{your_paper_key1, - title={Your Paper Title}, - author={Your Name and Co-Author's Name}, - journal={Journal/Conference Name}, - year={Year}, - volume={Volume}, - number={Number}, - pages={Pages}, - doi={link_to_doi} +@inproceedings{thielmann-etal-2024-stream, + title = {STREAM: Simplified Topic Retrieval, Exploration, and Analysis Module}, + author = {Thielmann, Anton and Reuter, Arik and Weisser, Christoph and Kant, Gillian and Kumar, Manish and S{\"a}fken, Benjamin}, + booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)}, + year = {2024}, + publisher = {Association for Computational Linguistics}, + pages = {435--444}, } ``` diff --git a/docs/installation.md b/docs/installation.md index 93c626596c..692d377e87 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -19,9 +19,13 @@ pip install stream_topic ### Install the required dependencies: -Make sure you have installed the required dependencies before installing `stream_topic`. You can install the required dependencies using the following command: +Make sure you have downloaded the required `nltk` resources before running any models from `stream_topic`. You can download them by running the following Python code: -```bash +```python import nltk + +nltk.download('punkt') +nltk.download('wordnet') +nltk.download('stopwords') nltk.download('averaged_perceptron_tagger') ``` \ No newline at end of file diff --git a/docs/landingpage.md b/docs/landingpage.md index ade66982de..2eed225e2c 100644 --- a/docs/landingpage.md +++ b/docs/landingpage.md @@ -1,5 +1,5 @@ # STREAM -We present STREAM, a Simplified Topic Retrieval, Exploration, and Analysis Module for user-friendly topic modelling and especially subsequent interactive topic visualization and analysis. +We present STREAM, a Simplified Topic Retrieval, Exploration, and Analysis Module for user-friendly topic modelling and especially subsequent interactive topic visualization and analysis. Our paper can be found [here](https://aclanthology.org/2024.acl-short.41.pdf). 
For better topic analysis, we implement multiple intruder-word based topic evaluation metrics. Additionally, we publicize multiple new datasets that can extend the so far very limited number of publicly available benchmark datasets in topic modeling. We integrate downstream interpretable analysis modules to enable users to easily analyse the created topics in downstream tasks together with additional tabular information. @@ -28,6 +28,9 @@ Make additionally sure to download the necessary [nltk](https://www.nltk.org/) r ```python import nltk +nltk.download('stopwords') +nltk.download('punkt') +nltk.download('wordnet') nltk.download('averaged_perceptron_tagger') ``` @@ -416,15 +419,13 @@ If you use this project in your research, please consider citing: -### Paper 1 TBD +### STREAM ```bibtex -@article{your_paper_key1, - title={Your Paper Title}, - author={Your Name and Co-Author's Name}, - journal={Journal/Conference Name}, - year={Year}, - volume={Volume}, - number={Number}, - pages={Pages}, - doi={link_to_doi} +@inproceedings{thielmann-etal-2024-stream, + title = {STREAM: Simplified Topic Retrieval, Exploration, and Analysis Module}, + author = {Thielmann, Anton and Reuter, Arik and Weisser, Christoph and Kant, Gillian and Kumar, Manish and S{\"a}fken, Benjamin}, + booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)}, + year = {2024}, + publisher = {Association for Computational Linguistics}, + pages = {435--444}, } ``` diff --git a/docs/notebooks/datasets.ipynb b/docs/notebooks/datasets.ipynb index 764e080c66..906e7a7c74 100644 --- a/docs/notebooks/datasets.ipynb +++ b/docs/notebooks/datasets.ipynb @@ -16,19 +16,33 @@ "source": [ "The dataset module provides an easy way to load and preprocess the datasets. The package comes with a few datasets that are commonly used in topic modeling research. The datasets are:\n", "\n", - " - 20NewsGroup\n", - " - BBC_News\n", - " - Stocktwits_GME\n", - " - Reddit_GME'\n", - " - Reuters'\n", - " - Spotify\n", - " - Spotify_most_popular\n", - " - Poliblogs\n", - " - Spotify_least_popular\n", "\n", "- 20NewsGroup\n", "- BBC_News\n", "- Stocktwits_GME\n", "- Reddit_GME\n", "- Reuters\n", "- Spotify\n", "- Spotify_most_popular\n", "- Poliblogs\n", "- Spotify_least_popular\n", "\n", "Please see the functionalities available in the `TMDataset` module." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Note**: Make sure the required `nltk` resources are downloaded. If not, please run the following:\n", + "```python\n", + "import nltk\n", + "nltk.download('stopwords')\n", + "nltk.download('punkt')\n", + "nltk.download('wordnet')\n", + "nltk.download('averaged_perceptron_tagger')\n", + "```" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -45,23 +59,21 @@ "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/homebrew/Caskroom/miniforge/base/envs/topicm/lib/python3.10/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. 
in jupyter console)\n", - " from tqdm.autonotebook import tqdm, trange\n" - ] - } - ], + "outputs": [], "source": [ - "from stream_topic.utils import TMDataset\n", - "\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")" ] }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from stream_topic.utils import TMDataset" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -74,18 +86,18 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2024-08-09 12:13:26.847\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m118\u001b[0m - \u001b[1mFetching dataset: Reuters\u001b[0m\n", - "\u001b[32m2024-08-09 12:13:26.914\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m331\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n", - "\u001b[32m2024-08-09 12:13:27.147\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/\u001b[0m\n", - "\u001b[32m2024-08-09 12:13:27.313\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n", - "\u001b[32m2024-08-09 12:13:27.456\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m363\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/\u001b[0m\n" + "\u001b[32m2024-08-09 15:32:39.680\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m118\u001b[0m - \u001b[1mFetching dataset: Reuters\u001b[0m\n", + "\u001b[32m2024-08-09 15:32:40.002\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m331\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n", + "\u001b[32m2024-08-09 15:32:40.363\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "\u001b[32m2024-08-09 15:32:40.757\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n", + "\u001b[32m2024-08-09 15:32:40.970\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m363\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/\u001b[0m\n" ] } ], @@ -96,7 +108,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -112,7 +124,7 @@ " array(['00', '000', '001', ..., 'zurich', 'zverev', 'zzzz'], dtype=object))" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ 
-123,7 +135,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -139,7 +151,7 @@ " array(['00', '000', '001', ..., 'zurich', 'zverev', 'zzzz'], dtype=object))" ] }, - "execution_count": 5, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -150,7 +162,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -159,20 +171,20 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2024-08-09 12:13:28.464\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m108\u001b[0m - \u001b[1mDataset name already provided while instantiating the class: Reuters\u001b[0m\n", - "\u001b[32m2024-08-09 12:13:28.464\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m111\u001b[0m - \u001b[1mOverwriting the dataset name with the name provided in fetch_dataset: Spotify\u001b[0m\n", - "\u001b[32m2024-08-09 12:13:28.465\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m115\u001b[0m - \u001b[1mFetching dataset: Spotify\u001b[0m\n", - "\u001b[32m2024-08-09 12:13:28.539\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m331\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n", - "\u001b[32m2024-08-09 12:13:28.749\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/\u001b[0m\n", - "\u001b[32m2024-08-09 12:13:28.923\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n", - "\u001b[32m2024-08-09 12:13:29.058\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m363\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/\u001b[0m\n" + "\u001b[32m2024-08-09 15:32:42.196\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m108\u001b[0m - \u001b[1mDataset name already provided while instantiating the class: Reuters\u001b[0m\n", + "\u001b[32m2024-08-09 15:32:42.196\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m111\u001b[0m - \u001b[1mOverwriting the dataset name with the name provided in fetch_dataset: Spotify\u001b[0m\n", + "\u001b[32m2024-08-09 15:32:42.196\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m115\u001b[0m - \u001b[1mFetching dataset: Spotify\u001b[0m\n", + "\u001b[32m2024-08-09 15:32:42.490\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m331\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n", + "\u001b[32m2024-08-09 
15:32:43.475\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "\u001b[32m2024-08-09 15:32:43.813\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n", + "\u001b[32m2024-08-09 15:32:43.977\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m363\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/\u001b[0m\n" ] } ], @@ -182,7 +194,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -392,7 +404,7 @@ "4 [alexia, and, ice, you, the, came, down, you, ... " ] }, - "execution_count": 8, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -403,7 +415,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -413,7 +425,7 @@ " 'seinabo sey have always wondered your cause when you you can not and ive been you your just see one seinabo sey will you ever tell what really your mind cause the greatest love youll ever find seinabo sey moving life better shores you see shores you see sure moving somebody thats sure sure sure see moving moving have always wondered you your like every word the ive always been and every word but the seinabo sey vargas lagola will you ever tell what really your mind cause the greatest love youll ever find you know seinabo sey vargas lagola seinabo sey moving life better shores you see shores you see sure moving somebody thats sure sure sure see moving cause not seinabo sey vargas lagola seinabo sey and you cant hold heart heart heart heart and you cant hold heart heart heart heart you cant hold heart heart hold heart and you cant hold heart heart heart and you cant hold heart heart you cant you cant you cant you cant hold heart heart heart heart heart moving moving ohohooh ohohooh ohohooh']" ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -424,7 +436,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -433,7 +445,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -442,7 +454,7 @@ "[75, 58]" ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -460,7 +472,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -484,16 +496,16 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Preprocessing documents: 100%|██████████| 1000/1000 [00:03<00:00, 263.32it/s]\n", - "\u001b[32m2024-08-09 12:13:32.967\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m237\u001b[0m - \u001b[1mDataset saved to data/sample_data.parquet\u001b[0m\n", - "\u001b[32m2024-08-09 12:13:32.968\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m252\u001b[0m - \u001b[1mDataset info saved to data/sample_data_info.pkl\u001b[0m\n" + "Preprocessing documents: 100%|██████████| 1000/1000 [00:03<00:00, 251.82it/s]\n", + "\u001b[32m2024-08-09 15:32:48.092\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m237\u001b[0m - \u001b[1mDataset saved to data/sample_data.parquet\u001b[0m\n", + "\u001b[32m2024-08-09 15:32:48.093\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m252\u001b[0m - \u001b[1mDataset info saved to data/sample_data_info.pkl\u001b[0m\n" ] } ], @@ -510,15 +522,15 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2024-08-09 12:13:32.972\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m118\u001b[0m - \u001b[1mFetching dataset: sample_data\u001b[0m\n", - "\u001b[32m2024-08-09 12:13:32.973\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m128\u001b[0m - \u001b[1mFetching dataset from local path\u001b[0m\n" + "\u001b[32m2024-08-09 15:32:48.097\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m118\u001b[0m - \u001b[1mFetching dataset: sample_data\u001b[0m\n", + "\u001b[32m2024-08-09 15:32:48.098\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m128\u001b[0m - \u001b[1mFetching dataset from local path\u001b[0m\n" ] } ], @@ -530,7 +542,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -603,7 +615,7 @@ "4 BGHXO 3 [BGHXO]" ] }, - "execution_count": 15, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -612,13 +624,6 @@ "dataset.dataframe.head()" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "code", "execution_count": null, diff --git a/docs/notebooks/examples.ipynb b/docs/notebooks/examples.ipynb index 254270423b..4b22e8a42b 100644 --- a/docs/notebooks/examples.ipynb +++ b/docs/notebooks/examples.ipynb @@ -10,6 +10,20 @@ "# Examples" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Note**: Make sure the required `nltk` resources are downloaded. If not, please run the following:\n", + "```python\n", + "import nltk\n", + "nltk.download('stopwords')\n", + "nltk.download('punkt')\n", + "nltk.download('wordnet')\n", + "nltk.download('averaged_perceptron_tagger')\n", + "```" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -26,16 +40,17 @@ "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/homebrew/Caskroom/miniforge/base/envs/topicm/lib/python3.10/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. 
in jupyter console)\n", - " from tqdm.autonotebook import tqdm, trange\n" - ] - } - ], + "outputs": [], + "source": [ + "import warnings\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], "source": [ "from stream_topic.models import KmeansTM\n", "from stream_topic.utils import TMDataset" @@ -50,19 +65,19 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2024-08-09 12:13:58.725\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m118\u001b[0m - \u001b[1mFetching dataset: BBC_News\u001b[0m\n", - "\u001b[32m2024-08-09 12:13:58.815\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m331\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n", - "\u001b[32m2024-08-09 12:13:59.016\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/\u001b[0m\n", - "\u001b[32m2024-08-09 12:13:59.135\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n", - "\u001b[32m2024-08-09 12:13:59.267\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m363\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/\u001b[0m\n", - "Preprocessing documents: 100%|██████████| 2225/2225 [00:11<00:00, 198.75it/s]\n" + "\u001b[32m2024-08-09 15:33:16.644\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m118\u001b[0m - \u001b[1mFetching dataset: BBC_News\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:17.193\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m331\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:17.848\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:18.133\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:18.324\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m363\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "Preprocessing documents: 100%|██████████| 2225/2225 [00:11<00:00, 198.41it/s]\n" ] } ], @@ -74,103 +89,103 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "[I 2024-08-09 12:14:10,511] A 
new study created in memory with name: no-name-5aa7ebf6-4f37-4290-86e6-cebeec5398df\n", - "\u001b[32m2024-08-09 12:14:10.513\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:10.577\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:10.648\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:10.866\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:10.875\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", + "[I 2024-08-09 15:33:29,603] A new study created in memory with name: no-name-882315ac-44ed-4d90-9fc1-cff18636e26d\n", + "\u001b[32m2024-08-09 15:33:29.606\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:30.201\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:30.285\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:31.073\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:31.083\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", "OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.\n", - "\u001b[32m2024-08-09 12:14:16.104\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:16.705\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. 
---\u001b[0m\n", - "[I 2024-08-09 12:14:16,706] Trial 0 finished with value: -2747.044881484823 and parameters: {'n_topics': 4, 'n_neighbors': 24, 'n_components': 30, 'metric': 'cosine', 'init': 'random', 'n_init': 16, 'max_iter': 499}. Best is trial 0 with value: -2747.044881484823.\n", - "\u001b[32m2024-08-09 12:14:16.708\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:16.778\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:16.847\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:17.117\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:17.118\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:19.615\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:20.994\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n", - "[I 2024-08-09 12:14:20,995] Trial 1 finished with value: -2453.3441802457037 and parameters: {'n_topics': 13, 'n_neighbors': 11, 'n_components': 25, 'metric': 'euclidean', 'init': 'k-means++', 'n_init': 18, 'max_iter': 191}. 
Best is trial 0 with value: -2747.044881484823.\n", - "\u001b[32m2024-08-09 12:14:20.997\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:21.075\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:21.154\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:21.376\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:21.377\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:24.749\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:25.399\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n", - "[I 2024-08-09 12:14:25,400] Trial 2 finished with value: -3128.164941984999 and parameters: {'n_topics': 12, 'n_neighbors': 38, 'n_components': 43, 'metric': 'cosine', 'init': 'k-means++', 'n_init': 10, 'max_iter': 829}. 
Best is trial 2 with value: -3128.164941984999.\n", - "\u001b[32m2024-08-09 12:14:25.401\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:25.474\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:25.570\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:25.790\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:25.791\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:28.291\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:28.612\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n", - "[I 2024-08-09 12:14:28,613] Trial 3 finished with value: -3134.0457564239455 and parameters: {'n_topics': 7, 'n_neighbors': 29, 'n_components': 32, 'metric': 'euclidean', 'init': 'random', 'n_init': 15, 'max_iter': 318}. 
Best is trial 3 with value: -3134.0457564239455.\n", - "\u001b[32m2024-08-09 12:14:28.614\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:28.685\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:28.753\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:28.952\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:28.953\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:32.261\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:33.410\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n", - "[I 2024-08-09 12:14:33,411] Trial 4 finished with value: -3016.9434231651817 and parameters: {'n_topics': 15, 'n_neighbors': 31, 'n_components': 45, 'metric': 'cosine', 'init': 'k-means++', 'n_init': 19, 'max_iter': 481}. 
Best is trial 3 with value: -3134.0457564239455.\n", - "\u001b[32m2024-08-09 12:14:33.413\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:33.485\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:33.564\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:33.776\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:33.777\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:36.292\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:36.600\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n", - "[I 2024-08-09 12:14:36,601] Trial 5 finished with value: -3055.0335652501776 and parameters: {'n_topics': 18, 'n_neighbors': 29, 'n_components': 8, 'metric': 'euclidean', 'init': 'random', 'n_init': 12, 'max_iter': 496}. 
Best is trial 3 with value: -3134.0457564239455.\n", - "\u001b[32m2024-08-09 12:14:36.602\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:36.678\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:36.759\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:36.979\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:36.980\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:39.683\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:40.085\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n", - "[I 2024-08-09 12:14:40,086] Trial 6 finished with value: -3156.918997163559 and parameters: {'n_topics': 17, 'n_neighbors': 31, 'n_components': 30, 'metric': 'euclidean', 'init': 'random', 'n_init': 28, 'max_iter': 164}. 
Best is trial 6 with value: -3156.918997163559.\n", - "\u001b[32m2024-08-09 12:14:40.088\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:40.158\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:40.233\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:40.464\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:40.465\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:43.529\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:45.069\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n", - "[I 2024-08-09 12:14:45,070] Trial 7 finished with value: -2489.982411751191 and parameters: {'n_topics': 9, 'n_neighbors': 15, 'n_components': 35, 'metric': 'cosine', 'init': 'k-means++', 'n_init': 25, 'max_iter': 457}. 
Best is trial 6 with value: -3156.918997163559.\n", - "\u001b[32m2024-08-09 12:14:45.072\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:45.145\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:45.224\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:45.458\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:45.460\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:48.272\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:48.598\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n", - "[I 2024-08-09 12:14:48,599] Trial 8 finished with value: -1863.7949187027652 and parameters: {'n_topics': 8, 'n_neighbors': 10, 'n_components': 17, 'metric': 'cosine', 'init': 'random', 'n_init': 20, 'max_iter': 354}. 
Best is trial 6 with value: -3156.918997163559.\n", - "\u001b[32m2024-08-09 12:14:48.600\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:48.668\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:48.750\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:48.978\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:48.979\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:52.247\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:52.643\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n", - "[I 2024-08-09 12:14:52,644] Trial 9 finished with value: -3223.5689310849125 and parameters: {'n_topics': 11, 'n_neighbors': 42, 'n_components': 8, 'metric': 'cosine', 'init': 'k-means++', 'n_init': 25, 'max_iter': 568}. 
Best is trial 9 with value: -3223.5689310849125.\n", - "\u001b[32m2024-08-09 12:14:52.644\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36moptimize_hyperparameters\u001b[0m:\u001b[36m389\u001b[0m - \u001b[1mOptimal parameters: {'n_neighbors': 42, 'n_components': 8, 'metric': 'cosine', 'init': 'k-means++', 'n_init': 25, 'max_iter': 568} with 11 topics based on AIC.\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:52.645\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:52.939\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:53.010\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:53.214\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:53.215\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:56.152\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:14:56.525\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n" + "\u001b[32m2024-08-09 15:33:35.407\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:36.016\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n", + "[I 2024-08-09 15:33:36,017] Trial 0 finished with value: -2463.7082266615807 and parameters: {'n_topics': 14, 'n_neighbors': 12, 'n_components': 6, 'metric': 'euclidean', 'init': 'random', 'n_init': 23, 'max_iter': 174}. 
Best is trial 0 with value: -2463.7082266615807.\n", + "\u001b[32m2024-08-09 15:33:36.018\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:36.102\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:36.178\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:36.746\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:36.746\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:40.089\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:40.457\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n", + "[I 2024-08-09 15:33:40,458] Trial 1 finished with value: -2946.160921364957 and parameters: {'n_topics': 19, 'n_neighbors': 36, 'n_components': 27, 'metric': 'cosine', 'init': 'random', 'n_init': 26, 'max_iter': 766}. 
Best is trial 1 with value: -2946.160921364957.\n", + "\u001b[32m2024-08-09 15:33:40.459\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:40.554\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:40.644\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:40.978\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:40.978\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:43.766\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:44.160\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n", + "[I 2024-08-09 15:33:44,161] Trial 2 finished with value: -3400.3953215739325 and parameters: {'n_topics': 7, 'n_neighbors': 48, 'n_components': 48, 'metric': 'euclidean', 'init': 'random', 'n_init': 29, 'max_iter': 231}. 
Best is trial 2 with value: -3400.3953215739325.\n", + "\u001b[32m2024-08-09 15:33:44.162\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:44.242\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:44.325\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:44.683\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:44.684\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:47.631\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:47.997\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n", + "[I 2024-08-09 15:33:47,998] Trial 3 finished with value: -2834.7694686925297 and parameters: {'n_topics': 13, 'n_neighbors': 26, 'n_components': 12, 'metric': 'cosine', 'init': 'random', 'n_init': 25, 'max_iter': 379}. 
Best is trial 2 with value: -3400.3953215739325.\n", + "\u001b[32m2024-08-09 15:33:48.000\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:48.096\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:48.170\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:48.445\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:48.445\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:51.185\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:52.744\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n", + "[I 2024-08-09 15:33:52,745] Trial 4 finished with value: -3160.985634056173 and parameters: {'n_topics': 12, 'n_neighbors': 28, 'n_components': 38, 'metric': 'euclidean', 'init': 'k-means++', 'n_init': 24, 'max_iter': 547}. 
Best is trial 2 with value: -3400.3953215739325.\n", + "\u001b[32m2024-08-09 15:33:52.746\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:52.841\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:52.917\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:53.191\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:53.193\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:56.358\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:56.905\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n", + "[I 2024-08-09 15:33:56,906] Trial 5 finished with value: -2794.15342912206 and parameters: {'n_topics': 14, 'n_neighbors': 22, 'n_components': 10, 'metric': 'cosine', 'init': 'k-means++', 'n_init': 13, 'max_iter': 776}. 
Best is trial 2 with value: -3400.3953215739325.\n", + "\u001b[32m2024-08-09 15:33:56.908\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:57.004\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:57.090\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:57.806\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "\u001b[32m2024-08-09 15:33:57.807\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:34:00.880\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:34:01.233\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n", + "[I 2024-08-09 15:34:01,234] Trial 6 finished with value: -3059.2387453702095 and parameters: {'n_topics': 15, 'n_neighbors': 37, 'n_components': 20, 'metric': 'cosine', 'init': 'random', 'n_init': 20, 'max_iter': 976}. 
Best is trial 2 with value: -3400.3953215739325.\n", + "\u001b[32m2024-08-09 15:34:01.236\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:34:01.316\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:34:01.408\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", + "\u001b[32m2024-08-09 15:34:01.673\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "\u001b[32m2024-08-09 15:34:01.673\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:34:04.729\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:34:05.132\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n", + "[I 2024-08-09 15:34:05,134] Trial 7 finished with value: -3027.354335724434 and parameters: {'n_topics': 18, 'n_neighbors': 37, 'n_components': 24, 'metric': 'cosine', 'init': 'random', 'n_init': 30, 'max_iter': 728}. 
Best is trial 2 with value: -3400.3953215739325.\n", + "\u001b[32m2024-08-09 15:34:05.135\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:34:05.210\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:34:05.284\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", + "\u001b[32m2024-08-09 15:34:05.681\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "\u001b[32m2024-08-09 15:34:05.682\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:34:08.518\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:34:08.838\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n", + "[I 2024-08-09 15:34:08,838] Trial 8 finished with value: -2060.2771839726806 and parameters: {'n_topics': 3, 'n_neighbors': 12, 'n_components': 18, 'metric': 'cosine', 'init': 'random', 'n_init': 15, 'max_iter': 792}. 
Best is trial 2 with value: -3400.3953215739325.\n", + "\u001b[32m2024-08-09 15:34:08.840\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:34:08.927\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:34:09.008\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", + "\u001b[32m2024-08-09 15:34:09.634\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "\u001b[32m2024-08-09 15:34:09.635\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:34:12.202\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:34:12.564\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n", + "[I 2024-08-09 15:34:12,565] Trial 9 finished with value: -2582.634045394511 and parameters: {'n_topics': 16, 'n_neighbors': 16, 'n_components': 21, 'metric': 'euclidean', 'init': 'random', 'n_init': 23, 'max_iter': 608}. 
Best is trial 2 with value: -3400.3953215739325.\n", + "\u001b[32m2024-08-09 15:34:12.566\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36moptimize_hyperparameters\u001b[0m:\u001b[36m389\u001b[0m - \u001b[1mOptimal parameters: {'n_neighbors': 48, 'n_components': 48, 'metric': 'euclidean', 'init': 'random', 'n_init': 29, 'max_iter': 231} with 7 topics based on AIC.\u001b[0m\n", + "\u001b[32m2024-08-09 15:34:12.567\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m206\u001b[0m - \u001b[1m--- Training KmeansTM topic model ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:34:12.664\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:34:12.735\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", + "\u001b[32m2024-08-09 15:34:13.155\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "\u001b[32m2024-08-09 15:34:13.156\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:34:15.585\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:34:15.897\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.KmeansTM\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m240\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n" ] } ], @@ -181,14 +196,14 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "11\n" + "7\n" ] } ], @@ -206,24 +221,16 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/homebrew/Caskroom/miniforge/base/envs/topicm/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. 
If you want to force a new download, use `force_download=True`.\n", - " warnings.warn(\n" - ] - }, { "data": { "text/plain": [ - "0.23371" + "0.19318" ] }, - "execution_count": 6, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -237,16 +244,16 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.2090254547921094" + "0.18481285870075226" ] }, - "execution_count": 7, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } diff --git a/docs/notebooks/quickstart.ipynb b/docs/notebooks/quickstart.ipynb index 8286df3f5f..4ea1fb54eb 100644 --- a/docs/notebooks/quickstart.ipynb +++ b/docs/notebooks/quickstart.ipynb @@ -10,9 +10,24 @@ "# Quickstart" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "**Note**: Make sure the `nltk` dependencies are installed. If not, please run the following command:\n", + "```python\n", + "import nltk\n", + "nltk.download('stopwords')\n", + "nltk.download('punkt')\n", + "nltk.download('wordnet')\n", + "nltk.download('averaged_perceptron_tagger')\n", + "```" + ] + }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -24,7 +39,17 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -41,19 +66,19 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2024-08-09 12:34:06.391\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m118\u001b[0m - \u001b[1mFetching dataset: BBC_News\u001b[0m\n", - "\u001b[32m2024-08-09 12:34:06.592\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m331\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n", - "\u001b[32m2024-08-09 12:34:06.796\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/\u001b[0m\n", - "\u001b[32m2024-08-09 12:34:07.111\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n", - "\u001b[32m2024-08-09 12:34:07.250\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m363\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/\u001b[0m\n", - "Preprocessing documents: 100%|██████████| 2225/2225 [00:11<00:00, 200.32it/s]\n" + "\u001b[32m2024-08-09 15:35:15.170\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m118\u001b[0m - \u001b[1mFetching dataset: BBC_News\u001b[0m\n", + "\u001b[32m2024-08-09 15:35:15.244\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m331\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n", + "\u001b[32m2024-08-09 15:35:15.518\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "\u001b[32m2024-08-09 15:35:15.663\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n", + "\u001b[32m2024-08-09 15:35:15.795\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m363\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "Preprocessing documents: 100%|██████████| 2225/2225 [00:11<00:00, 198.52it/s]\n" ] } ], @@ -65,25 +90,22 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2024-08-09 12:34:18.398\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.CEDC\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m241\u001b[0m - \u001b[1m--- Training CEDC topic model ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:34:18.695\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:34:18.781\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", - "\u001b[32m2024-08-09 12:34:18.993\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", - "\u001b[32m2024-08-09 12:34:19.000\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:34:22.191\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.CEDC\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m175\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", - "/opt/homebrew/Caskroom/miniforge/base/envs/topicm/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning:\n", - "\n", - "`resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. 
If you want to force a new download, use `force_download=True`.\n", - "\n", - "\u001b[32m2024-08-09 12:34:27.876\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.CEDC\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m259\u001b[0m - \u001b[1m--- Extract topics ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:34:32.056\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.CEDC\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m284\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n" + "\u001b[32m2024-08-09 15:35:27.056\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.CEDC\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m241\u001b[0m - \u001b[1m--- Training CEDC topic model ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:35:27.122\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:35:27.191\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", + "\u001b[32m2024-08-09 15:35:27.416\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "\u001b[32m2024-08-09 15:35:27.423\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mdim_reduction\u001b[0m:\u001b[36m196\u001b[0m - \u001b[1m--- Reducing dimensions ---\u001b[0m\n", + "OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.\n", + "\u001b[32m2024-08-09 15:35:32.238\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.CEDC\u001b[0m:\u001b[36m_clustering\u001b[0m:\u001b[36m175\u001b[0m - \u001b[1m--- Creating document cluster ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:35:37.431\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.CEDC\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m259\u001b[0m - \u001b[1m--- Extract topics ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:35:41.513\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.CEDC\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m284\u001b[0m - \u001b[1m--- Training completed successfully. 
---\u001b[0m\n" ] } ], @@ -94,19 +116,9 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 6, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/homebrew/Caskroom/miniforge/base/envs/topicm/lib/python3.10/site-packages/umap/umap_.py:2437: UserWarning:\n", - "\n", - "n_neighbors is larger than the dataset size; truncating to X.shape[0] - 1\n", - "\n" - ] - }, { "data": { "text/html": [ @@ -122,7 +134,7 @@ " " ], "text/plain": [ - "" + "" ] }, "metadata": {}, @@ -148,33 +160,29 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2024-08-09 12:34:35.593\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m118\u001b[0m - \u001b[1mFetching dataset: BBC_News\u001b[0m\n", - "\u001b[32m2024-08-09 12:34:35.670\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m331\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n", - "\u001b[32m2024-08-09 12:34:35.869\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/\u001b[0m\n", - "\u001b[32m2024-08-09 12:34:35.975\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n", - "\u001b[32m2024-08-09 12:34:36.099\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m363\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/\u001b[0m\n", - "Preprocessing documents: 100%|██████████| 2225/2225 [00:11<00:00, 198.65it/s]\n", - "\u001b[32m2024-08-09 12:34:47.410\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:34:47.483\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", - "\u001b[32m2024-08-09 12:34:47.709\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", - "\u001b[32m2024-08-09 12:34:47.717\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.ctmneg\u001b[0m:\u001b[36m_initialize_datamodule\u001b[0m:\u001b[36m314\u001b[0m - \u001b[1m--- Initializing Datamodule for CTMNeg ---\u001b[0m\n", - "\u001b[32m2024-08-09 12:34:48.183\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.ctmneg\u001b[0m:\u001b[36m_initialize_trainer\u001b[0m:\u001b[36m273\u001b[0m - \u001b[1m--- Initializing Trainer for CTMNeg ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:35:45.415\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m118\u001b[0m - \u001b[1mFetching dataset: BBC_News\u001b[0m\n", + "\u001b[32m2024-08-09 15:35:45.492\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m331\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n", + "\u001b[32m2024-08-09 15:35:45.691\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "\u001b[32m2024-08-09 15:35:45.786\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n", + "\u001b[32m2024-08-09 15:35:45.926\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m363\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "Preprocessing documents: 100%|██████████| 2225/2225 [00:10<00:00, 213.03it/s]\n", + "\u001b[32m2024-08-09 15:35:56.466\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.abstract_helper_models.base\u001b[0m:\u001b[36mprepare_embeddings\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m--- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:35:56.539\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m302\u001b[0m - \u001b[1mDownloading embeddings from github\u001b[0m\n", + "\u001b[32m2024-08-09 15:35:56.851\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m304\u001b[0m - \u001b[1mEmbeddings downloaded successfully at ~/stream_topic_data/\u001b[0m\n", + "\u001b[32m2024-08-09 15:35:56.860\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.ctmneg\u001b[0m:\u001b[36m_initialize_datamodule\u001b[0m:\u001b[36m314\u001b[0m - \u001b[1m--- Initializing Datamodule for CTMNeg ---\u001b[0m\n", + "\u001b[32m2024-08-09 15:35:57.069\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.ctmneg\u001b[0m:\u001b[36m_initialize_trainer\u001b[0m:\u001b[36m273\u001b[0m - \u001b[1m--- Initializing Trainer for CTMNeg ---\u001b[0m\n", "Trainer already configured with model summary callbacks: []. 
Skipping setting a default `ModelSummary` callback.\n", "GPU available: True (mps), used: True\n", "TPU available: False, using: 0 TPU cores\n", "HPU available: False, using: 0 HPUs\n", - "\u001b[32m2024-08-09 12:34:48.201\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.ctmneg\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m457\u001b[0m - \u001b[1m--- Training CTMNeg topic model ---\u001b[0m\n", - "/opt/homebrew/Caskroom/miniforge/base/envs/topicm/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:652: UserWarning:\n", - "\n", - "Checkpoint directory /Volumes/Research/Repositories/STREAM/docs/notebooks/checkpoints exists and is not empty.\n", - "\n", + "\u001b[32m2024-08-09 15:35:57.094\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.ctmneg\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m457\u001b[0m - \u001b[1m--- Training CTMNeg topic model ---\u001b[0m\n", "\n", " | Name | Type | Params | Mode \n", "----------------------------------------------------------------------\n", @@ -195,7 +203,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "637189591d0e4df98eece6259647b46b", + "model_id": "5b49956a50d64cf5b1af7a8f3b0f9ce2", "version_major": 2, "version_minor": 0 }, @@ -206,24 +214,10 @@ "metadata": {}, "output_type": "display_data" }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/homebrew/Caskroom/miniforge/base/envs/topicm/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: PossibleUserWarning:\n", - "\n", - "The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=13` in the `DataLoader` to improve performance.\n", - "\n", - "/opt/homebrew/Caskroom/miniforge/base/envs/topicm/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: PossibleUserWarning:\n", - "\n", - "The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=13` in the `DataLoader` to improve performance.\n", - "\n" - ] - }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "a4081cb63c984267a05eb10c8cbd97bc", + "model_id": "34b3faf8118047eabfd88b93e69c455f", "version_major": 2, "version_minor": 0 }, @@ -237,7 +231,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "4cd65f03f3af415699e3a9f81fef9338", + "model_id": "15acc379b85a42049cc4f34b893dff79", "version_major": 2, "version_minor": 0 }, @@ -252,7 +246,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2024-08-09 12:34:49.580\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.ctmneg\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m473\u001b[0m - \u001b[1m--- Training completed successfully. ---\u001b[0m\n" + "\u001b[32m2024-08-09 15:35:59.005\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.models.ctmneg\u001b[0m:\u001b[36mfit\u001b[0m:\u001b[36m473\u001b[0m - \u001b[1m--- Training completed successfully. 
---\u001b[0m\n" ] } ], @@ -264,6 +258,13 @@ "model = CTMNeg(encoder_dim=64, dropout=0.3)\n", "output = model.fit(dataset, n_topics=5, max_epochs=2)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/stream_topic/__version__.py b/stream_topic/__version__.py index d82b41ac48..a99a72e1f6 100644 --- a/stream_topic/__version__.py +++ b/stream_topic/__version__.py @@ -1,4 +1,4 @@ """Version information.""" # The following line *must* be the last in the module, exactly as formatted: -__version__ = "0.1.4" +__version__ = "0.1.5" diff --git a/stream_topic/models/WordCluTM.py b/stream_topic/models/WordCluTM.py index 932d9339e0..ad6ee2be4e 100644 --- a/stream_topic/models/WordCluTM.py +++ b/stream_topic/models/WordCluTM.py @@ -5,7 +5,7 @@ from gensim.models import Word2Vec from loguru import logger from sklearn.mixture import GaussianMixture - +import os from ..commons.check_steps import check_dataset_steps from ..preprocessor._embedder import BaseEmbedder, GensimBackend from ..utils.dataset import TMDataset @@ -14,6 +14,9 @@ time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") MODEL_NAME = "WordCluTM" # logger.add(f"{MODEL_NAME}_{time}.log", backtrace=True, diagnose=True) +WORD_EMBEDDING_MODEL_NAME = ( + "paraphrase-MiniLM-L3-v2" # use this model for word embeddings for now +) class WordCluTM(BaseModel): @@ -26,23 +29,26 @@ def __init__( umap_args: dict = None, random_state: int = None, gmm_args: dict = None, + train_word_embeddings: bool = True, embeddings_folder_path: str = None, embeddings_file_path: str = None, + word_embedding_model_name: str = WORD_EMBEDDING_MODEL_NAME, save_embeddings: bool = False, **kwargs, ): """ - Initialize the WordCluTM model. + Initializes the WordCluTM model with specified parameters. Args: - num_topics (int): Number of topics. - vector_size (int): Dimensionality of the word vectors. - window (int): Maximum distance between the current and predicted word within a sentence. - min_count (int): Ignores all words with total frequency lower than this. - workers (int): Number of worker threads to train the Word2Vec model. - umap_args (dict): Arguments for UMAP dimensionality reduction. - gmm_args (dict): Arguments for Gaussian Mixture Model (GMM). - random_state (int): Random seed. + umap_args (dict, optional): Parameters for UMAP dimensionality reduction. Defaults to a pre-defined dictionary if not provided. + random_state (int, optional): Seed for random number generation to ensure reproducibility. Defaults to None. + gmm_args (dict, optional): Parameters for Gaussian Mixture Model (GMM) clustering. Defaults to a pre-defined dictionary if not provided. + train_word_embeddings (bool, optional): Flag indicating whether to train Word2Vec embeddings or use pre-trained embeddings. Defaults to True. + embeddings_folder_path (str, optional): Path to the folder where word embeddings should be saved. Defaults to None. + embeddings_file_path (str, optional): Path to the file containing pre-trained word embeddings. Defaults to None. + word_embedding_model_name (str, optional): The name of the pre-trained model to be used for word embeddings. Defaults to 'paraphrase-MiniLM-L3-v2'. + save_embeddings (bool, optional): Flag indicating whether to save the trained word embeddings. Defaults to False. + **kwargs: Additional keyword arguments passed to the BaseModel initialization. 
""" super().__init__(use_pretrained_embeddings=True, **kwargs) self.save_hyperparameters( @@ -82,12 +88,20 @@ def __init__( if random_state is not None: self.umap_args["random_state"] = random_state - self.embeddings_path = embeddings_folder_path - self.embeddings_file_path = embeddings_file_path - self.save_embeddings = save_embeddings + self.hparams["umap_args"] = self.umap_args + self.hparams["gmm_args"] = self.gmm_args + + self.word_embeddings_path = embeddings_folder_path + self.word_embedding_model_name = word_embedding_model_name + self.word_embeddings_file_path = embeddings_file_path + self.save_word_embeddings = save_embeddings self._status = TrainingStatus.NOT_STARTED + self.word_embeddings_prepared = False + self.train_word_embeddings = train_word_embeddings + self.optimize = False + def get_info(self): """ Get information about the model. @@ -108,7 +122,7 @@ def get_info(self): return info def train_word2vec( - self, sentences, epochs, vector_size, window, min_count, workers + self, sentences, epochs, vector_size, window, min_count, workers, logger ): """ Train a Word2Vec model on the given sentences. @@ -127,14 +141,62 @@ def train_word2vec( # Build the vocabulary from the sentences self.word2vec_model.build_vocab(sentences) + logger.info(f"--- Train Word2Vec ---") # Train the Word2Vec model self.word2vec_model.train( sentences, total_examples=len(sentences), epochs=epochs ) # Initialize BaseEmbedder with GensimBackend - self.base_embedder = BaseEmbedder( - GensimBackend(self.word2vec_model.wv)) + self.base_embedder = BaseEmbedder(GensimBackend(self.word2vec_model.wv)) + + def _prepare_word_embeddings(self, dataset, logger): + """ + Prepare the word embeddings for the dataset. + + Parameters + ---------- + data_module : TMDataModule + The data module used for training. This contains the actually used vocabulary after preprocessing. + dataset : TMDataset + The dataset to be used for training. + logger : Logger + The logger to log messages. + """ + + if dataset.has_word_embeddings(self.word_embedding_model_name): + logger.info( + f"--- Loading precomputed {self.word_embedding_model_name} word embeddings ---" + ) + self.word_embeddings = dataset.get_word_embeddings( + self.word_embedding_model_name, + self.word_embeddings_path, + self.word_embeddings_file_path, + ) + + else: + logger.info( + f"--- Creating {self.word_embedding_model_name} word embeddings ---" + ) + self.word_embeddings = dataset.get_word_embeddings( + model_name=self.word_embedding_model_name, + vocab=dataset.get_vocabulary(), # use the vocabulary from the data module + ) + if ( + self.save_word_embeddings + and self.word_embeddings_path is not None + and not os.path.exists(self.word_embeddings_path) + ): + os.makedirs(self.word_embeddings_path) + if self.save_word_embeddings: + dataset.save_word_embeddings( + word_embeddings=self.word_embeddings, + model_name=self.word_embedding_model_name, + path=self.word_embeddings_path, + file_name=self.word_embeddings_file_path, + ) + + self.word_embeddings_prepared = True def _clustering(self): """ @@ -146,8 +208,7 @@ def _clustering(self): If an error occurs during clustering. """ assert ( - hasattr( - self, "reduced_embeddings") and self.reduced_embeddings is not None + hasattr(self, "reduced_embeddings") and self.reduced_embeddings is not None ), "Reduced embeddings must be generated before clustering." 
self.gmm_args["n_components"] = self.n_topics @@ -191,33 +252,38 @@ def fit( sentences = dataset.get_corpus() self._status = TrainingStatus.INITIALIZED + unique_words = list(set(word for sentence in sentences for word in sentence)) + try: logger.info(f"--- Training {MODEL_NAME} topic model ---") self._status = TrainingStatus.RUNNING - self.train_word2vec( - sentences=sentences, - epochs=word2vec_epochs, - vector_size=vector_size, - window=window, - min_count=min_count, - workers=workers, - ) # Train Word2Vec model - - logger.info(f"--- Compute word embeddings ---") - unique_words = list( - set(word for sentence in sentences for word in sentence) - ) - word_to_index = {word: i for i, word in enumerate(unique_words)} - self.embeddings = np.array( - [ - ( - self.word2vec_model.wv[word] - if word in self.word2vec_model.wv - else np.zeros(vector_size) - ) - for word in unique_words - ] - ) + if self.train_word_embeddings: + self.train_word2vec( + sentences=sentences, + epochs=word2vec_epochs, + vector_size=vector_size, + window=window, + min_count=min_count, + workers=workers, + logger=logger, + ) # Train Word2Vec model + + self.embeddings = np.array( + [ + ( + self.word2vec_model.wv[word] + if word in dataset.get_vocabulary() + else np.zeros(vector_size) + ) + for word in unique_words + ] + ) + + else: + self._prepare_word_embeddings(dataset, logger) + self.embeddings = np.stack(list(self.word_embeddings.values())) + if self.embeddings[0].shape != self.vector_size: + self.vector_size = self.embeddings[0].shape self.reduced_embeddings = self.dim_reduction(logger) self._clustering() @@ -227,15 +293,19 @@ def fit( logger.info(f"--- Compute doc embeddings ---") for doc in sentences: # Collect word embeddings for the document - word_embeddings = [ - self.word2vec_model.wv[word] - for word in doc - if word in self.word2vec_model.wv - ] + if self.train_word_embeddings: + word_embeddings = [ + self.word2vec_model.wv[word] + for word in doc + if word in self.word2vec_model.wv + ] + else: + word_embeddings = [ + np.array(self.word_embeddings[word]) for word in doc + ] # Compute the mean embedding for the document if there are valid word embeddings if word_embeddings: - self.doc_embeddings.append( - np.mean(word_embeddings, axis=0)) + self.doc_embeddings.append(np.mean(word_embeddings, axis=0)) else: # Append a zero array if no valid word embeddings are found self.doc_embeddings.append(np.zeros(self.vector_size)) @@ -247,11 +317,9 @@ def fit( ] if len(self.doc_embeddings) > 0: # Reduce the dimensionality of the document embedding - reduced_doc_embedding = self.reducer.transform( - self.doc_embeddings) + reduced_doc_embedding = self.reducer.transform(self.doc_embeddings) # Predict the topic distribution for the reduced document embedding - doc_topic_distribution = self.GMM.predict_proba( - reduced_doc_embedding) + doc_topic_distribution = self.GMM.predict_proba(reduced_doc_embedding) # Add the topic distribution to the list doc_topic_distributions.append(doc_topic_distribution[0]) @@ -284,3 +352,85 @@ def fit( def predict(self, texts): pass + + def suggest_hyperparameters(self, trial): + # Suggest UMAP parameters + self.hparams["umap_args"]["n_neighbors"] = trial.suggest_int( + "n_neighbors", 10, 50 + ) + self.hparams["umap_args"]["n_components"] = trial.suggest_int( + "n_components", 5, 50 + ) + self.hparams["umap_args"]["metric"] = trial.suggest_categorical( + "metric", ["cosine", "euclidean"] + ) + + # Suggest GMM parameters + self.hparams["gmm_args"]["covariance_type"] = trial.suggest_categorical( + 
"covariance_type", ["full", "tied", "diag", "spherical"] + ) + self.hparams["gmm_args"]["tol"] = trial.suggest_float( + "tol", 1e-4, 1e-1, log=True + ) + self.hparams["gmm_args"]["reg_covar"] = trial.suggest_float( + "reg_covar", 1e-6, 1e-3, log=True + ) + self.hparams["gmm_args"]["max_iter"] = trial.suggest_int("max_iter", 100, 1000) + self.hparams["gmm_args"]["n_init"] = trial.suggest_int("n_init", 1, 10) + self.hparams["gmm_args"]["init_params"] = trial.suggest_categorical( + "init_params", ["kmeans", "random"] + ) + + self.umap_args = self.hparams.get("umap_args") + self.gmmargs = self.hparams.get("gmm_args") + + def optimize_and_fit( + self, + dataset, + min_topics=2, + max_topics=20, + criterion="aic", + n_trials=100, + custom_metric=None, + ): + """ + A new method in the child class that optimizes and fits the model. + + Parameters + ---------- + dataset : TMDataset + The dataset to train the model on. + min_topics : int, optional + Minimum number of topics to evaluate, by default 2. + max_topics : int, optional + Maximum number of topics to evaluate, by default 20. + criterion : str, optional + Criterion to use for optimization ('aic', 'bic', or 'custom'), by default 'aic'. + n_trials : int, optional + Number of trials for optimization, by default 100. + custom_metric : object, optional + Custom metric object with a `score` method for evaluation, by default None. + + Returns + ------- + dict + Dictionary containing the best parameters and the optimal number of topics. + """ + best_params = super().optimize_hyperparameters( + dataset=dataset, + min_topics=min_topics, + max_topics=max_topics, + criterion=criterion, + n_trials=n_trials, + custom_metric=custom_metric, + ) + + return best_params + + def calculate_aic(self, n_topics=None): + + return self.GMM.aic(self.reduced_embeddings) + + def calculate_bic(self, n_topics=None): + + return self.GMM.bic(self.reduced_embeddings)