|  | 
| 1 | 1 | { | 
| 2 | 2 |  "cells": [ | 
| 3 | 3 |   { | 
| 4 |  | -   "cell_type": "code", | 
| 5 |  | -   "execution_count": 11, | 
| 6 |  | -   "id": "125a1871-6cab-4dc4-9fd5-4e5dbd63ada6", | 
|  | 4 | +   "cell_type": "markdown", | 
|  | 5 | +   "id": "19b1960e-9e0a-401f-be15-d343902eaa21", | 
| 7 | 6 |    "metadata": {}, | 
| 8 |  | -   "outputs": [], | 
| 9 | 7 |    "source": [ | 
| 10 |  | -    "import warnings\n", | 
| 11 |  | -    "warnings.filterwarnings('ignore')" | 
|  | 8 | +    "# Spark HuggingFace Connector Demo" | 
| 12 | 9 |    ] | 
| 13 | 10 |   }, | 
| 14 | 11 |   { | 
| 15 |  | -   "cell_type": "code", | 
| 16 |  | -   "execution_count": 2, | 
| 17 |  | -   "id": "38dc7e9e-35fd-4604-9be3-1a1a8749fbcb", | 
|  | 12 | +   "cell_type": "markdown", | 
|  | 13 | +   "id": "c9a7bf1d-c208-4873-9e06-5db981f8eeaa", | 
| 18 | 14 |    "metadata": {}, | 
| 19 |  | -   "outputs": [], | 
| 20 | 15 |    "source": [ | 
| 21 |  | -    "from pyspark_huggingface import HuggingFaceDatasets" | 
|  | 16 | +    "## Create a Spark Session" | 
| 22 | 17 |    ] | 
| 23 | 18 |   }, | 
| 24 | 19 |   { | 
| 25 | 20 |    "cell_type": "code", | 
| 26 |  | -   "execution_count": 3, | 
| 27 | 21 |    "id": "620d3ecb-b9cb-480c-b300-69198cce7a9c", | 
| 28 | 22 |    "metadata": {}, | 
|  | 23 | +   "source": [ | 
|  | 24 | +    "from pyspark.sql import SparkSession\n", | 
|  | 25 | +    "\n", | 
|  | 26 | +    "spark = SparkSession.builder.getOrCreate()" | 
|  | 27 | +   ], | 
| 29 | 28 |    "outputs": [], | 
|  | 29 | +   "execution_count": null | 
|  | 30 | +  }, | 
|  | 31 | +  { | 
|  | 32 | +   "cell_type": "markdown", | 
|  | 33 | +   "id": "6f876028-2af5-4e63-8e9d-59afc0959267", | 
|  | 34 | +   "metadata": {}, | 
| 30 | 35 |    "source": [ | 
| 31 |  | -    "from pyspark.sql import SparkSession" | 
|  | 36 | +    "## Load a dataset as a Spark DataFrame" | 
| 32 | 37 |    ] | 
| 33 | 38 |   }, | 
| 34 | 39 |   { | 
| 35 | 40 |    "cell_type": "code", | 
| 36 |  | -   "execution_count": 12, | 
| 37 |  | -   "id": "9255ffcb-0b61-43dc-b57a-2b8af01a8432", | 
| 38 |  | -   "metadata": {}, | 
|  | 41 | +   "execution_count": 2, | 
|  | 42 | +   "id": "b8580bde-3f64-4c71-a087-8b3f71099aee", | 
|  | 43 | +   "metadata": { | 
|  | 44 | +    "ExecuteTime": { | 
|  | 45 | +     "end_time": "2024-11-26T08:54:32.132099Z", | 
|  | 46 | +     "start_time": "2024-11-26T08:54:28.903653Z" | 
|  | 47 | +    } | 
|  | 48 | +   }, | 
| 39 | 49 |    "outputs": [], | 
| 40 | 50 |    "source": [ | 
| 41 |  | -    "spark = SparkSession.builder.getOrCreate()" | 
|  | 51 | +    "df = spark.read.format(\"huggingface\").load(\"rotten_tomatoes\")" | 
| 42 | 52 |    ] | 
| 43 | 53 |   }, | 
| 44 | 54 |   { | 
| 45 | 55 |    "cell_type": "code", | 
| 46 |  | -   "id": "7c4501a8-26f4-4f52-9dc8-a70393d567b4", | 
|  | 56 | +   "execution_count": 4, | 
|  | 57 | +   "id": "3bbf61d1-4c2c-40e7-9790-2722637aac9d", | 
| 47 | 58 |    "metadata": {}, | 
|  | 59 | +   "outputs": [ | 
|  | 60 | +    { | 
|  | 61 | +     "name": "stdout", | 
|  | 62 | +     "output_type": "stream", | 
|  | 63 | +     "text": [ | 
|  | 64 | +      "root\n", | 
|  | 65 | +      " |-- text: string (nullable = true)\n", | 
|  | 66 | +      " |-- label: long (nullable = true)\n", | 
|  | 67 | +      "\n" | 
|  | 68 | +     ] | 
|  | 69 | +    } | 
|  | 70 | +   ], | 
| 48 | 71 |    "source": [ | 
| 49 |  | -    "spark.dataSource.register(HuggingFaceDatasets)" | 
|  | 72 | +    "df.printSchema()" | 
|  | 73 | +   ] | 
|  | 74 | +  }, | 
|  | 75 | +  { | 
|  | 76 | +   "cell_type": "code", | 
|  | 77 | +   "id": "7f7b9a2b-8733-499a-af56-3c51196d060f", | 
|  | 78 | +   "metadata": {}, | 
|  | 79 | +   "source": [ | 
|  | 80 | +    "# Cache the DataFrame so that later actions reuse it instead of re-downloading the data\n", | 
|  | 81 | +    "df.cache()" | 
| 50 | 82 |    ], | 
| 51 | 83 |    "outputs": [], | 
| 52 | 84 |    "execution_count": null | 
| 53 | 85 |   }, | 
| 54 | 86 |   { | 
| 55 | 87 |    "cell_type": "code", | 
| 56 |  | -   "execution_count": 14, | 
| 57 |  | -   "id": "b8580bde-3f64-4c71-a087-8b3f71099aee", | 
|  | 88 | +   "execution_count": 12, | 
|  | 89 | +   "id": "df121dba-2e1e-4206-b2bf-db156c298ee1", | 
| 58 | 90 |    "metadata": {}, | 
| 59 |  | -   "outputs": [], | 
|  | 91 | +   "outputs": [ | 
|  | 92 | +    { | 
|  | 93 | +     "data": { | 
|  | 94 | +      "text/plain": [ | 
|  | 95 | +       "8530" | 
|  | 96 | +      ] | 
|  | 97 | +     }, | 
|  | 98 | +     "execution_count": 12, | 
|  | 99 | +     "metadata": {}, | 
|  | 100 | +     "output_type": "execute_result" | 
|  | 101 | +    } | 
|  | 102 | +   ], | 
| 60 | 103 |    "source": [ | 
| 61 |  | -    "df = spark.read.format(\"huggingface\").load(\"rotten_tomatoes\")" | 
|  | 104 | +    "# Trigger the cache computation\n", | 
|  | 105 | +    "df.count()" | 
| 62 | 106 |    ] | 
| 63 | 107 |   }, | 
| 64 | 108 |   { | 
| 65 | 109 |    "cell_type": "code", | 
| 66 |  | -   "execution_count": 15, | 
|  | 110 | +   "execution_count": 13, | 
| 67 | 111 |    "id": "8866bdfb-0782-4430-8b1e-09c65e699f41", | 
| 68 | 112 |    "metadata": { | 
| 69 | 113 |     "editable": true, | 
|  | 
| 72 | 116 |     }, | 
| 73 | 117 |     "tags": [] | 
| 74 | 118 |    }, | 
|  | 119 | +   "outputs": [ | 
|  | 120 | +    { | 
|  | 121 | +     "data": { | 
|  | 122 | +      "text/plain": [ | 
|  | 123 | +       "Row(text='the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', label=1)" | 
|  | 124 | +      ] | 
|  | 125 | +     }, | 
|  | 126 | +     "execution_count": 13, | 
|  | 127 | +     "metadata": {}, | 
|  | 128 | +     "output_type": "execute_result" | 
|  | 129 | +    } | 
|  | 130 | +   ], | 
|  | 131 | +   "source": [ | 
|  | 132 | +    "df.head()" | 
|  | 133 | +   ] | 
|  | 134 | +  }, | 
|  | 135 | +  { | 
|  | 136 | +   "cell_type": "code", | 
|  | 137 | +   "id": "0d9d3112-d19b-4fa8-a6fc-ba40816d1d11", | 
|  | 138 | +   "metadata": {}, | 
|  | 139 | +   "source": [ | 
|  | 140 | +    "df.show(n=5)" | 
|  | 141 | +   ], | 
|  | 142 | +   "outputs": [], | 
|  | 143 | +   "execution_count": null | 
|  | 144 | +  }, | 
|  | 145 | +  { | 
|  | 146 | +   "cell_type": "code", | 
|  | 147 | +   "execution_count": 21, | 
|  | 148 | +   "id": "225bbbef-4164-424d-a701-c6c74494ef81", | 
|  | 149 | +   "metadata": {}, | 
|  | 150 | +   "outputs": [ | 
|  | 151 | +    { | 
|  | 152 | +     "data": { | 
|  | 153 | +      "text/plain": [ | 
|  | 154 | +       "4265" | 
|  | 155 | +      ] | 
|  | 156 | +     }, | 
|  | 157 | +     "execution_count": 21, | 
|  | 158 | +     "metadata": {}, | 
|  | 159 | +     "output_type": "execute_result" | 
|  | 160 | +    } | 
|  | 161 | +   ], | 
|  | 162 | +   "source": [ | 
|  | 163 | +    "# You can now run regular DataFrame operations, e.g. count the rows with label 0\n", | 
|  | 164 | +    "df.filter(df.label == 0).count()" | 
|  | 165 | +   ] | 
|  | 166 | +  }, | 
|  | 167 | +  { | 
|  | 168 | +   "cell_type": "markdown", | 
|  | 169 | +   "id": "3932f1fd-a324-4f15-86e1-bbe1064d707a", | 
|  | 170 | +   "metadata": {}, | 
|  | 171 | +   "source": [ | 
|  | 172 | +    "## Load a different split\n", | 
|  | 173 | +    "You can specify the `split` data source option:" | 
|  | 174 | +   ] | 
|  | 175 | +  }, | 
|  | 176 | +  { | 
|  | 177 | +   "cell_type": "code", | 
|  | 178 | +   "execution_count": 14, | 
|  | 179 | +   "id": "a16e9270-eb02-4568-8739-db4dc715c274", | 
|  | 180 | +   "metadata": {}, | 
|  | 181 | +   "outputs": [], | 
|  | 182 | +   "source": [ | 
|  | 183 | +    "test_df = (\n", | 
|  | 184 | +    "    spark.read.format(\"huggingface\")\n", | 
|  | 185 | +    "    .option(\"split\", \"test\")\n", | 
|  | 186 | +    "    .load(\"rotten_tomatoes\")\n", | 
|  | 187 | +    ")" | 
|  | 188 | +   ] | 
|  | 189 | +  }, | 
|  | 190 | +  { | 
|  | 191 | +   "cell_type": "code", | 
|  | 192 | +   "execution_count": 15, | 
|  | 193 | +   "id": "3aec5719-c3a1-4d18-92c8-2b0c2f4bb939", | 
|  | 194 | +   "metadata": {}, | 
|  | 195 | +   "outputs": [ | 
|  | 196 | +    { | 
|  | 197 | +     "data": { | 
|  | 198 | +      "text/plain": [ | 
|  | 199 | +       "DataFrame[text: string, label: bigint]" | 
|  | 200 | +      ] | 
|  | 201 | +     }, | 
|  | 202 | +     "execution_count": 15, | 
|  | 203 | +     "metadata": {}, | 
|  | 204 | +     "output_type": "execute_result" | 
|  | 205 | +    } | 
|  | 206 | +   ], | 
|  | 207 | +   "source": [ | 
|  | 208 | +    "test_df.cache()" | 
|  | 209 | +   ] | 
|  | 210 | +  }, | 
|  | 211 | +  { | 
|  | 212 | +   "cell_type": "code", | 
|  | 213 | +   "execution_count": 16, | 
|  | 214 | +   "id": "d605289d-361d-4a6c-9b70-f7ccdff3aa9d", | 
|  | 215 | +   "metadata": {}, | 
| 75 | 216 |    "outputs": [ | 
| 76 | 217 |     { | 
| 77 | 218 |      "name": "stderr", | 
| 78 | 219 |      "output_type": "stream", | 
| 79 | 220 |      "text": [ | 
| 80 |  | -      "[Stage 5:>                                                          (0 + 1) / 1]" | 
|  | 221 | +      "                                                                                " | 
| 81 | 222 |      ] | 
| 82 | 223 |     }, | 
|  | 224 | +    { | 
|  | 225 | +     "data": { | 
|  | 226 | +      "text/plain": [ | 
|  | 227 | +       "1066" | 
|  | 228 | +      ] | 
|  | 229 | +     }, | 
|  | 230 | +     "execution_count": 16, | 
|  | 231 | +     "metadata": {}, | 
|  | 232 | +     "output_type": "execute_result" | 
|  | 233 | +    } | 
|  | 234 | +   ], | 
|  | 235 | +   "source": [ | 
|  | 236 | +    "test_df.count()" | 
|  | 237 | +   ] | 
|  | 238 | +  }, | 
|  | 239 | +  { | 
|  | 240 | +   "cell_type": "code", | 
|  | 241 | +   "execution_count": 18, | 
|  | 242 | +   "id": "df1ad003-1476-4557-811b-31c3888c0030", | 
|  | 243 | +   "metadata": {}, | 
|  | 244 | +   "outputs": [ | 
| 83 | 245 |     { | 
| 84 | 246 |      "name": "stdout", | 
| 85 | 247 |      "output_type": "stream", | 
| 86 | 248 |      "text": [ | 
| 87 | 249 |       "+--------------------+-----+\n", | 
| 88 | 250 |       "|                text|label|\n", | 
| 89 | 251 |       "+--------------------+-----+\n", | 
| 90 |  | -      "|the rock is desti...|    1|\n", | 
| 91 |  | -      "|the gorgeously el...|    1|\n", | 
| 92 |  | -      "|effective but too...|    1|\n", | 
| 93 |  | -      "|if you sometimes ...|    1|\n", | 
| 94 |  | -      "|emerges as someth...|    1|\n", | 
| 95 |  | -      "|the film provides...|    1|\n", | 
| 96 |  | -      "|offers that rare ...|    1|\n", | 
| 97 |  | -      "|perhaps no pictur...|    1|\n", | 
| 98 |  | -      "|steers turns in a...|    1|\n", | 
| 99 |  | -      "|take care of my c...|    1|\n", | 
| 100 |  | -      "|this is a film we...|    1|\n", | 
| 101 |  | -      "|what really surpr...|    1|\n", | 
| 102 |  | -      "|( wendigo is ) wh...|    1|\n", | 
| 103 |  | -      "|one of the greate...|    1|\n", | 
| 104 |  | -      "|ultimately , it p...|    1|\n", | 
| 105 |  | -      "|an utterly compel...|    1|\n", | 
| 106 |  | -      "|illuminating if o...|    1|\n", | 
| 107 |  | -      "|a masterpiece fou...|    1|\n", | 
| 108 |  | -      "|the movie's ripe ...|    1|\n", | 
| 109 |  | -      "|offers a breath o...|    1|\n", | 
|  | 252 | +      "|lovingly photogra...|    1|\n", | 
|  | 253 | +      "|consistently clev...|    1|\n", | 
|  | 254 | +      "|it's like a \" big...|    1|\n", | 
|  | 255 | +      "|the story gives a...|    1|\n", | 
|  | 256 | +      "|red dragon \" neve...|    1|\n", | 
| 110 | 257 |       "+--------------------+-----+\n", | 
| 111 |  | -      "only showing top 20 rows\n", | 
|  | 258 | +      "only showing top 5 rows\n", | 
| 112 | 259 |       "\n" | 
| 113 | 260 |      ] | 
| 114 |  | -    }, | 
| 115 |  | -    { | 
| 116 |  | -     "name": "stderr", | 
| 117 |  | -     "output_type": "stream", | 
| 118 |  | -     "text": [ | 
| 119 |  | -      "                                                                                " | 
| 120 |  | -     ] | 
| 121 | 261 |     } | 
| 122 | 262 |    ], | 
| 123 | 263 |    "source": [ | 
| 124 |  | -    "df.show()" | 
|  | 264 | +    "test_df.show(n=5)" | 
| 125 | 265 |    ] | 
| 126 | 266 |   }, | 
| 127 |  | -  { | 
| 128 |  | -   "cell_type": "code", | 
| 129 |  | -   "id": "873bb4fc-1424-4816-b835-6c2b839d3de4", | 
| 130 |  | -   "metadata": {}, | 
| 131 |  | -   "source": [ | 
| 132 |  | -    "df.count()" | 
| 133 |  | -   ], | 
| 134 |  | -   "outputs": [], | 
| 135 |  | -   "execution_count": null | 
| 136 |  | -  }, | 
| 137 | 267 |   { | 
| 138 | 268 |    "cell_type": "code", | 
| 139 | 269 |    "execution_count": null, | 
| 140 |  | -   "id": "4a1b895f-fe20-4520-a90d-b17df8e691e4", | 
|  | 270 | +   "id": "a7f14b91-059e-4894-83d2-4ed74e0adaf9", | 
| 141 | 271 |    "metadata": {}, | 
| 142 | 272 |    "outputs": [], | 
| 143 | 273 |    "source": [] | 
|  | 
0 commit comments