add dataframe example (NVIDIA#137)

sarahyurick · sarahyurick · commit 4fec0f633fa2 · 2024-07-23T12:07:40.000-07:00
Signed-off-by: Sarah Yurick <sarahyurick@gmail.com>
diff --git a/tutorials/distributed_data_classification/distributed_data_classification.ipynb b/tutorials/distributed_data_classification/distributed_data_classification.ipynb
@@ -38,7 +38,13 @@
    "metadata": {},
    "outputs": [],
    "source": [
+<<<<<<< HEAD
     "from nemo_curator import DomainClassifier, QualityClassifier, get_client\n",
+=======
+    "from dask_cuda import LocalCUDACluster\n",
+    "from dask.distributed import Client\n",
+    "from nemo_curator import DomainClassifier, QualityClassifier\n",
+>>>>>>> 19692e0 (add dataframe example (#137))
     "from nemo_curator.datasets import DocumentDataset\n",
     "import cudf\n",
     "import dask_cudf"
@@ -124,9 +130,50 @@
    "outputs": [],
    "source": [
     "if classifier_type == \"DomainClassifier\":\n",
+<<<<<<< HEAD
     "    classifier = DomainClassifier(batch_size=1024)\n",
     "\n",
     "elif classifier_type == \"QualityClassifier\":\n",
+=======
+    "    domain_labels = [\n",
+    "        \"Adult\",\n",
+    "        \"Arts_and_Entertainment\",\n",
+    "        \"Autos_and_Vehicles\",\n",
+    "        \"Beauty_and_Fitness\",\n",
+    "        \"Books_and_Literature\",\n",
+    "        \"Business_and_Industrial\",\n",
+    "        \"Computers_and_Electronics\",\n",
+    "        \"Finance\",\n",
+    "        \"Food_and_Drink\",\n",
+    "        \"Games\",\n",
+    "        \"Health\",\n",
+    "        \"Hobbies_and_Leisure\",\n",
+    "        \"Home_and_Garden\",\n",
+    "        \"Internet_and_Telecom\",\n",
+    "        \"Jobs_and_Education\",\n",
+    "        \"Law_and_Government\",\n",
+    "        \"News\",\n",
+    "        \"Online_Communities\",\n",
+    "        \"People_and_Society\",\n",
+    "        \"Pets_and_Animals\",\n",
+    "        \"Real_Estate\",\n",
+    "        \"Science\",\n",
+    "        \"Sensitive_Subjects\",\n",
+    "        \"Shopping\",\n",
+    "        \"Sports\",\n",
+    "        \"Travel_and_Transportation\",\n",
+    "    ]\n",
+    "\n",
+    "    classifier = DomainClassifier(\n",
+    "        model_path=domain_model_path,\n",
+    "        labels=domain_labels,\n",
+    "        batch_size=1024,\n",
+    "    )\n",
+    "\n",
+    "elif classifier_type == \"QualityClassifier\":\n",
+    "    quality_labels = [\"High\", \"Medium\", \"Low\"]\n",
+    "\n",
+>>>>>>> 19692e0 (add dataframe example (#137))
     "    classifier = QualityClassifier(\n",
     "        model_path=quality_model_path,\n",
     "        batch_size=1024,\n",
@@ -161,23 +208,36 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
+<<<<<<< HEAD
       "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:04<00:00,  2.23it/s]"
+=======
+      "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:02<00:00,  3.62it/s]"
+>>>>>>> 19692e0 (add dataframe example (#137))
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "Writing to disk complete for 1 partitions\n",
+<<<<<<< HEAD
       "CPU times: user 4.69 s, sys: 5.13 s, total: 9.82 s\n",
       "Wall time: 12.7 s\n"
+=======
+      "CPU times: user 578 ms, sys: 429 ms, total: 1.01 s\n",
+      "Wall time: 9.91 s\n"
+>>>>>>> 19692e0 (add dataframe example (#137))
      ]
     },
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
+<<<<<<< HEAD
       "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:04<00:00,  2.07it/s]\n"
+=======
+      "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:03<00:00,  3.30it/s]\n"
+>>>>>>> 19692e0 (add dataframe example (#137))
      ]
     }
    ],
@@ -286,6 +346,25 @@
    "source": [
     "output_dataset = DocumentDataset.read_json(output_file_path, backend=\"cudf\", add_filename=write_to_filename)\n",
     "output_dataset.df.head()"
+<<<<<<< HEAD
+=======
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Remove the Output File(s)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!rm -rf $output_file_path"
+>>>>>>> 19692e0 (add dataframe example (#137))
    ]
   }
  ],