Merge pull request #79 from AnFreTh/main

Release v0.1.4
AnFreTh · Aug 9, 2024 · 39fb5d7 · 39fb5d7
2 parents 3e94e5d + a04931d
commit 39fb5d7
Show file tree

Hide file tree

Showing 36 changed files with 859 additions and 766 deletions.
diff --git a/.gitignore b/.gitignore
@@ -183,4 +183,11 @@ post-checkout
 post-commit
 post-merge
 pre-push
-docs/notebooks/lightning_logs/*
+docs/notebooks/lightning_logs/*
+docs/notebooks/lightning_logs
+docs/notebooks/data
+docs/notebooks/data/*
+docs/notebooks/embeddings
+docs/notebooks/embeddings/*
+docs/notebooks/checkpoints
+docs/notebooks/checkpoints/*
diff --git a/docs/notebooks/datasets.ipynb b/docs/notebooks/datasets.ipynb
@@ -4,8 +4,8 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AnFreTh/STREAM/blob/develop/docs/notebooks/datasets.ipynb)\n",
-    "[![Open On GitHub](https://img.shields.io/badge/Open-on%20GitHub-blue?logo=GitHub)](https://github.com/AnFreTh/STREAM/blob/develop/docs/notebooks/datasets.ipynb)\n",
+    "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AnFreTh/STREAM/blob/main/docs/notebooks/datasets.ipynb)\n",
+    "[![Open On GitHub](https://img.shields.io/badge/Open-on%20GitHub-blue?logo=GitHub)](https://github.com/AnFreTh/STREAM/blob/main/docs/notebooks/datasets.ipynb)\n",
     "\n",
     "# Datasets"
    ]
@@ -33,13 +33,25 @@
    "cell_type": "code",
    "execution_count": 1,
    "metadata": {},
+   "outputs": [],
+   "source": [
+    "# uncomment the below line if running in Colab\n",
+    "# package needs to be installed for the notebook to run\n",
+    "\n",
+    "# ! pip install -U stream_topic"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/opt/homebrew/Caskroom/miniforge/base/envs/db/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
-      "  warnings.warn(\n"
+      "/opt/homebrew/Caskroom/miniforge/base/envs/topicm/lib/python3.10/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n",
+      "  from tqdm.autonotebook import tqdm, trange\n"
      ]
     }
    ],
@@ -60,37 +72,6 @@
     "- these datasets are included in the package and can be loaded using the `TMDataset` module"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['Stocktwits_GME_large',\n",
-       " 'BBC_News',\n",
-       " 'Stocktwits_GME',\n",
-       " 'Reddit_GME',\n",
-       " 'Reuters',\n",
-       " 'Spotify',\n",
-       " '20NewsGroups',\n",
-       " 'DummyDataset',\n",
-       " 'Spotify_most_popular',\n",
-       " 'Poliblogs',\n",
-       " 'Spotify_least_popular']"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "dataset = TMDataset()\n",
-    "dataset.get_dataset_list()"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 3,
@@ -100,12 +81,16 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "\u001b[32m2024-08-07 10:31:30.489\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m159\u001b[0m - \u001b[1mFetching dataset: Reuters\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:31:31.978\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m165\u001b[0m - \u001b[1mDataset loaded successfully from /opt/homebrew/Caskroom/miniforge/base/envs/db/lib/python3.10/site-packages/stream_topic/preprocessed_datasets/Reuters\u001b[0m\n"
+      "\u001b[32m2024-08-09 12:13:26.847\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m118\u001b[0m - \u001b[1mFetching dataset: Reuters\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:26.914\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m331\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:27.147\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:27.313\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:27.456\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m363\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/\u001b[0m\n"
      ]
     }
    ],
    "source": [
+    "dataset = TMDataset()\n",
     "dataset.fetch_dataset(name=\"Reuters\")"
    ]
   },
@@ -181,10 +166,13 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "\u001b[32m2024-08-07 10:31:33.085\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m151\u001b[0m - \u001b[1mDataset name already provided while instantiating the class: Reuters\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:31:33.086\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1mOverwriting the dataset name with the provided name in fetch_dataset: Spotify\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:31:33.086\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m156\u001b[0m - \u001b[1mFetching dataset: Spotify\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:31:33.190\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m165\u001b[0m - \u001b[1mDataset loaded successfully from /opt/homebrew/Caskroom/miniforge/base/envs/db/lib/python3.10/site-packages/stream_topic/preprocessed_datasets/Spotify\u001b[0m\n"
+      "\u001b[32m2024-08-09 12:13:28.464\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m108\u001b[0m - \u001b[1mDataset name already provided while instantiating the class: Reuters\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:28.464\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m111\u001b[0m - \u001b[1mOverwriting the dataset name with the name provided in fetch_dataset: Spotify\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:28.465\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m115\u001b[0m - \u001b[1mFetching dataset: Spotify\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:28.539\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m331\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:28.749\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:28.923\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:29.058\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m363\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/\u001b[0m\n"
      ]
     }
    ],
@@ -451,7 +439,7 @@
     {
      "data": {
       "text/plain": [
-       "[75, 58, 37, 45, 41]"
+       "[75, 58]"
       ]
      },
      "execution_count": 11,
@@ -460,7 +448,7 @@
     }
    ],
    "source": [
-    "dataset.labels[:5]"
+    "dataset.labels[:2]"
    ]
   },
   {
@@ -475,18 +463,6 @@
    "execution_count": 12,
    "metadata": {},
    "outputs": [],
-   "source": [
-    "from stream_topic.utils import TMDataset\n",
-    "\n",
-    "import warnings\n",
-    "warnings.filterwarnings(\"ignore\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "metadata": {},
-   "outputs": [],
    "source": [
     "import pandas as pd\n",
     "import numpy as np\n",
@@ -508,19 +484,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Preprocessing documents: 100%|██████████| 1000/1000 [00:03<00:00, 267.71it/s]\n",
-      "\u001b[32m2024-08-07 10:31:37.027\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m409\u001b[0m - \u001b[1mDataset save directory does not exist: data/\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:31:37.027\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m410\u001b[0m - \u001b[1mCreating directory: data/\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:31:37.031\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m415\u001b[0m - \u001b[1mDataset saved to data/sample_data.parquet\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:31:37.032\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m430\u001b[0m - \u001b[1mDataset info saved to data/sample_data_info.pkl\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:31:37.032\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m433\u001b[0m - \u001b[1mDataset name appended to avaliable datasets list: ['Stocktwits_GME_large', 'BBC_News', 'Stocktwits_GME', 'Reddit_GME', 'Reuters', 'Spotify', '20NewsGroups', 'DummyDataset', 'Spotify_most_popular', 'Poliblogs', 'Spotify_least_popular', 'sample_data']\u001b[0m\n"
+      "Preprocessing documents: 100%|██████████| 1000/1000 [00:03<00:00, 263.32it/s]\n",
+      "\u001b[32m2024-08-09 12:13:32.967\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m237\u001b[0m - \u001b[1mDataset saved to data/sample_data.parquet\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:32.968\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m252\u001b[0m - \u001b[1mDataset info saved to data/sample_data_info.pkl\u001b[0m\n"
      ]
     }
    ],
@@ -537,27 +510,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "\u001b[32m2024-08-07 10:31:37.036\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m159\u001b[0m - \u001b[1mFetching dataset: sample_data\u001b[0m\n",
-      "\u001b[32m2024-08-07 10:31:37.045\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m165\u001b[0m - \u001b[1mDataset loaded successfully from data/\u001b[0m\n"
+      "\u001b[32m2024-08-09 12:13:32.972\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m118\u001b[0m - \u001b[1mFetching dataset: sample_data\u001b[0m\n",
+      "\u001b[32m2024-08-09 12:13:32.973\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m128\u001b[0m - \u001b[1mFetching dataset from local path\u001b[0m\n"
      ]
     }
    ],
    "source": [
     "# the new data is saved in the local data folder, unlike the default datasets, which are downloaded from GitHub into ~/stream_topic_data/.\n",
     "# therefore, you need to provide the path to the data folder and set source=\"local\" to fetch the dataset\n",
-    "dataset.fetch_dataset(name=\"sample_data\", dataset_path=\"data/\")"
+    "dataset.fetch_dataset(name=\"sample_data\", dataset_path=\"data/\", source=\"local\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [
     {
@@ -630,7 +603,7 @@
        "4  BGHXO      3  [BGHXO]"
       ]
      },
-     "execution_count": 16,
+     "execution_count": 15,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -639,6 +612,13 @@
     "dataset.dataframe.head()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -663,7 +643,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.0"
+   "version": "3.10.14"
   }
  },
  "nbformat": 4,

diff --git a/docs/notebooks/datasets.md b/docs/notebooks/datasets.md