Skip to content

Commit 4fec0f6

Browse files
committed
add dataframe example (NVIDIA#137)
Signed-off-by: Sarah Yurick <[email protected]>
1 parent 7bd69cb commit 4fec0f6

File tree

1 file changed

+79
-0
lines changed

1 file changed

+79
-0
lines changed

tutorials/distributed_data_classification/distributed_data_classification.ipynb

+79
Original file line number | Diff line number | Diff line change
@@ -38,7 +38,13 @@
3838
"metadata": {},
3939
"outputs": [],
4040
"source": [
41+
<<<<<<< HEAD
4142
"from nemo_curator import DomainClassifier, QualityClassifier, get_client\n",
43+
=======
44+
"from dask_cuda import LocalCUDACluster\n",
45+
"from dask.distributed import Client\n",
46+
"from nemo_curator import DomainClassifier, QualityClassifier\n",
47+
>>>>>>> 19692e0 (add dataframe example (#137))
4248
"from nemo_curator.datasets import DocumentDataset\n",
4349
"import cudf\n",
4450
"import dask_cudf"
@@ -124,9 +130,50 @@
124130
"outputs": [],
125131
"source": [
126132
"if classifier_type == \"DomainClassifier\":\n",
133+
<<<<<<< HEAD
127134
" classifier = DomainClassifier(batch_size=1024)\n",
128135
"\n",
129136
"elif classifier_type == \"QualityClassifier\":\n",
137+
=======
138+
" domain_labels = [\n",
139+
" \"Adult\",\n",
140+
" \"Arts_and_Entertainment\",\n",
141+
" \"Autos_and_Vehicles\",\n",
142+
" \"Beauty_and_Fitness\",\n",
143+
" \"Books_and_Literature\",\n",
144+
" \"Business_and_Industrial\",\n",
145+
" \"Computers_and_Electronics\",\n",
146+
" \"Finance\",\n",
147+
" \"Food_and_Drink\",\n",
148+
" \"Games\",\n",
149+
" \"Health\",\n",
150+
" \"Hobbies_and_Leisure\",\n",
151+
" \"Home_and_Garden\",\n",
152+
" \"Internet_and_Telecom\",\n",
153+
" \"Jobs_and_Education\",\n",
154+
" \"Law_and_Government\",\n",
155+
" \"News\",\n",
156+
" \"Online_Communities\",\n",
157+
" \"People_and_Society\",\n",
158+
" \"Pets_and_Animals\",\n",
159+
" \"Real_Estate\",\n",
160+
" \"Science\",\n",
161+
" \"Sensitive_Subjects\",\n",
162+
" \"Shopping\",\n",
163+
" \"Sports\",\n",
164+
" \"Travel_and_Transportation\",\n",
165+
" ]\n",
166+
"\n",
167+
" classifier = DomainClassifier(\n",
168+
" model_path=domain_model_path,\n",
169+
" labels=domain_labels,\n",
170+
" batch_size=1024,\n",
171+
" )\n",
172+
"\n",
173+
"elif classifier_type == \"QualityClassifier\":\n",
174+
" quality_labels = [\"High\", \"Medium\", \"Low\"]\n",
175+
"\n",
176+
>>>>>>> 19692e0 (add dataframe example (#137))
130177
" classifier = QualityClassifier(\n",
131178
" model_path=quality_model_path,\n",
132179
" batch_size=1024,\n",
@@ -161,23 +208,36 @@
161208
"name": "stderr",
162209
"output_type": "stream",
163210
"text": [
211+
<<<<<<< HEAD
164212
"GPU: 0, Part: 0: 100%|██████████| 10/10 [00:04<00:00, 2.23it/s]"
213+
=======
214+
"GPU: 0, Part: 0: 100%|██████████| 10/10 [00:02<00:00, 3.62it/s]"
215+
>>>>>>> 19692e0 (add dataframe example (#137))
165216
]
166217
},
167218
{
168219
"name": "stdout",
169220
"output_type": "stream",
170221
"text": [
171222
"Writing to disk complete for 1 partitions\n",
223+
<<<<<<< HEAD
172224
"CPU times: user 4.69 s, sys: 5.13 s, total: 9.82 s\n",
173225
"Wall time: 12.7 s\n"
226+
=======
227+
"CPU times: user 578 ms, sys: 429 ms, total: 1.01 s\n",
228+
"Wall time: 9.91 s\n"
229+
>>>>>>> 19692e0 (add dataframe example (#137))
174230
]
175231
},
176232
{
177233
"name": "stderr",
178234
"output_type": "stream",
179235
"text": [
236+
<<<<<<< HEAD
180237
"GPU: 0, Part: 0: 100%|██████████| 10/10 [00:04<00:00, 2.07it/s]\n"
238+
=======
239+
"GPU: 0, Part: 0: 100%|██████████| 10/10 [00:03<00:00, 3.30it/s]\n"
240+
>>>>>>> 19692e0 (add dataframe example (#137))
181241
]
182242
}
183243
],
@@ -286,6 +346,25 @@
286346
"source": [
287347
"output_dataset = DocumentDataset.read_json(output_file_path, backend=\"cudf\", add_filename=write_to_filename)\n",
288348
"output_dataset.df.head()"
349+
<<<<<<< HEAD
350+
=======
351+
]
352+
},
353+
{
354+
"cell_type": "markdown",
355+
"metadata": {},
356+
"source": [
357+
"# Remove the Output File(s)"
358+
]
359+
},
360+
{
361+
"cell_type": "code",
362+
"execution_count": 10,
363+
"metadata": {},
364+
"outputs": [],
365+
"source": [
366+
"!rm -rf $output_file_path"
367+
>>>>>>> 19692e0 (add dataframe example (#137))
289368
]
290369
}
291370
],

0 commit comments

Comments
 (0)