|
38 | 38 | "metadata": {},
|
39 | 39 | "outputs": [],
|
40 | 40 | "source": [
|
41 | 42 | "from nemo_curator import DomainClassifier, QualityClassifier, get_client\n",
42 | 48 | "from nemo_curator.datasets import DocumentDataset\n",
|
43 | 49 | "import cudf\n",
|
44 | 50 | "import dask_cudf"
|
|
124 | 130 | "outputs": [],
|
125 | 131 | "source": [
|
126 | 132 | "if classifier_type == \"DomainClassifier\":\n",
|
127 | 134 | "    classifier = DomainClassifier(batch_size=1024)\n",

128 | 135 | "\n",

129 | 136 | "elif classifier_type == \"QualityClassifier\":\n",
130 | 177 | " classifier = QualityClassifier(\n",
|
131 | 178 | " model_path=quality_model_path,\n",
|
132 | 179 | " batch_size=1024,\n",
|
|
161 | 208 | "name": "stderr",
|
162 | 209 | "output_type": "stream",
|
163 | 210 | "text": [
|
164 | 212 | "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:04<00:00, 2.23it/s]"
165 | 216 | ]
|
166 | 217 | },
|
167 | 218 | {
|
168 | 219 | "name": "stdout",
|
169 | 220 | "output_type": "stream",
|
170 | 221 | "text": [
|
171 | 222 | "Writing to disk complete for 1 partitions\n",
|
172 | 224 | "CPU times: user 4.69 s, sys: 5.13 s, total: 9.82 s\n",

173 | 225 | "Wall time: 12.7 s\n"
174 | 230 | ]
|
175 | 231 | },
|
176 | 232 | {
|
177 | 233 | "name": "stderr",
|
178 | 234 | "output_type": "stream",
|
179 | 235 | "text": [
|
180 | 237 | "GPU: 0, Part: 0: 100%|██████████| 10/10 [00:04<00:00, 2.07it/s]\n"
181 | 241 | ]
|
182 | 242 | }
|
183 | 243 | ],
|
|
286 | 346 | "source": [
|
287 | 347 | "output_dataset = DocumentDataset.read_json(output_file_path, backend=\"cudf\", add_filename=write_to_filename)\n",
|
288 | 348 | "output_dataset.df.head()"
|
| 349 |    ]
| 350 |   },
| 351 |   {
| 352 |    "cell_type": "markdown",
| 353 |    "metadata": {},
| 354 |    "source": [
| 355 |     "# Remove the Output File(s)"
| 356 |    ]
| 357 |   },
| 358 |   {
| 359 |    "cell_type": "code",
| 360 |    "execution_count": 10,
| 361 |    "metadata": {},
| 362 |    "outputs": [],
| 363 |    "source": [
| 364 |     "!rm -rf $output_file_path"
289 | 368 | ]
|
290 | 369 | }
|
291 | 370 | ],
|
|
0 commit comments