Skip to content

Commit a7d719d

Browse files
address comments
1 parent a4262f2 commit a7d719d

File tree

3 files changed

+215
-80
lines changed

3 files changed

+215
-80
lines changed

demo.ipynb

Lines changed: 195 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -1,69 +1,113 @@
11
{
22
"cells": [
33
{
4-
"cell_type": "code",
5-
"execution_count": 11,
6-
"id": "125a1871-6cab-4dc4-9fd5-4e5dbd63ada6",
4+
"cell_type": "markdown",
5+
"id": "19b1960e-9e0a-401f-be15-d343902eaa21",
76
"metadata": {},
8-
"outputs": [],
97
"source": [
10-
"import warnings\n",
11-
"warnings.filterwarnings('ignore')"
8+
"# Spark HuggingFace Connector Demo"
129
]
1310
},
1411
{
15-
"cell_type": "code",
16-
"execution_count": 2,
17-
"id": "38dc7e9e-35fd-4604-9be3-1a1a8749fbcb",
12+
"cell_type": "markdown",
13+
"id": "c9a7bf1d-c208-4873-9e06-5db981f8eeaa",
1814
"metadata": {},
19-
"outputs": [],
2015
"source": [
21-
"from pyspark_huggingface import HuggingFaceDatasets"
16+
"## Create a Spark Session"
2217
]
2318
},
2419
{
2520
"cell_type": "code",
26-
"execution_count": 3,
2721
"id": "620d3ecb-b9cb-480c-b300-69198cce7a9c",
2822
"metadata": {},
23+
"source": [
24+
"from pyspark.sql import SparkSession\n",
25+
"\n",
26+
"spark = SparkSession.builder.getOrCreate()"
27+
],
2928
"outputs": [],
29+
"execution_count": null
30+
},
31+
{
32+
"cell_type": "markdown",
33+
"id": "6f876028-2af5-4e63-8e9d-59afc0959267",
34+
"metadata": {},
3035
"source": [
31-
"from pyspark.sql import SparkSession"
36+
"## Load a dataset as a Spark DataFrame"
3237
]
3338
},
3439
{
3540
"cell_type": "code",
36-
"execution_count": 12,
37-
"id": "9255ffcb-0b61-43dc-b57a-2b8af01a8432",
38-
"metadata": {},
41+
"execution_count": 2,
42+
"id": "b8580bde-3f64-4c71-a087-8b3f71099aee",
43+
"metadata": {
44+
"ExecuteTime": {
45+
"end_time": "2024-11-26T08:54:32.132099Z",
46+
"start_time": "2024-11-26T08:54:28.903653Z"
47+
}
48+
},
3949
"outputs": [],
4050
"source": [
41-
"spark = SparkSession.builder.getOrCreate()"
51+
"df = spark.read.format(\"huggingface\").load(\"rotten_tomatoes\")"
4252
]
4353
},
4454
{
4555
"cell_type": "code",
46-
"id": "7c4501a8-26f4-4f52-9dc8-a70393d567b4",
56+
"execution_count": 4,
57+
"id": "3bbf61d1-4c2c-40e7-9790-2722637aac9d",
4758
"metadata": {},
59+
"outputs": [
60+
{
61+
"name": "stdout",
62+
"output_type": "stream",
63+
"text": [
64+
"root\n",
65+
" |-- text: string (nullable = true)\n",
66+
" |-- label: long (nullable = true)\n",
67+
"\n"
68+
]
69+
}
70+
],
4871
"source": [
49-
"spark.dataSource.register(HuggingFaceDatasets)"
72+
"df.printSchema()"
73+
]
74+
},
75+
{
76+
"cell_type": "code",
77+
"id": "7f7b9a2b-8733-499a-af56-3c51196d060f",
78+
"metadata": {},
79+
"source": [
80+
"# Cache the dataframe to avoid re-downloading data\n",
81+
"df.cache()"
5082
],
5183
"outputs": [],
5284
"execution_count": null
5385
},
5486
{
5587
"cell_type": "code",
56-
"execution_count": 14,
57-
"id": "b8580bde-3f64-4c71-a087-8b3f71099aee",
88+
"execution_count": 12,
89+
"id": "df121dba-2e1e-4206-b2bf-db156c298ee1",
5890
"metadata": {},
59-
"outputs": [],
91+
"outputs": [
92+
{
93+
"data": {
94+
"text/plain": [
95+
"8530"
96+
]
97+
},
98+
"execution_count": 12,
99+
"metadata": {},
100+
"output_type": "execute_result"
101+
}
102+
],
60103
"source": [
61-
"df = spark.read.format(\"huggingface\").load(\"rotten_tomatoes\")"
104+
"# Trigger the cache computation\n",
105+
"df.count()"
62106
]
63107
},
64108
{
65109
"cell_type": "code",
66-
"execution_count": 15,
110+
"execution_count": 13,
67111
"id": "8866bdfb-0782-4430-8b1e-09c65e699f41",
68112
"metadata": {
69113
"editable": true,
@@ -72,72 +116,158 @@
72116
},
73117
"tags": []
74118
},
119+
"outputs": [
120+
{
121+
"data": {
122+
"text/plain": [
123+
"Row(text='the rock is destined to be the 21st century\\'s new \" conan \" and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', label=1)"
124+
]
125+
},
126+
"execution_count": 13,
127+
"metadata": {},
128+
"output_type": "execute_result"
129+
}
130+
],
131+
"source": [
132+
"df.head()"
133+
]
134+
},
135+
{
136+
"cell_type": "code",
137+
"id": "0d9d3112-d19b-4fa8-a6fc-ba40816d1d11",
138+
"metadata": {},
139+
"source": [
140+
"df.show(n=5)"
141+
],
142+
"outputs": [],
143+
"execution_count": null
144+
},
145+
{
146+
"cell_type": "code",
147+
"execution_count": 21,
148+
"id": "225bbbef-4164-424d-a701-c6c74494ef81",
149+
"metadata": {},
150+
"outputs": [
151+
{
152+
"data": {
153+
"text/plain": [
154+
"4265"
155+
]
156+
},
157+
"execution_count": 21,
158+
"metadata": {},
159+
"output_type": "execute_result"
160+
}
161+
],
162+
"source": [
163+
"# Then you can operate on this dataframe\n",
164+
"df.filter(df.label == 0).count()"
165+
]
166+
},
167+
{
168+
"cell_type": "markdown",
169+
"id": "3932f1fd-a324-4f15-86e1-bbe1064d707a",
170+
"metadata": {},
171+
"source": [
172+
"## Load a different split\n",
173+
"You can specify the `split` data source option:"
174+
]
175+
},
176+
{
177+
"cell_type": "code",
178+
"execution_count": 14,
179+
"id": "a16e9270-eb02-4568-8739-db4dc715c274",
180+
"metadata": {},
181+
"outputs": [],
182+
"source": [
183+
"test_df = (\n",
184+
" spark.read.format(\"huggingface\")\n",
185+
" .option(\"split\", \"test\")\n",
186+
" .load(\"rotten_tomatoes\")\n",
187+
")"
188+
]
189+
},
190+
{
191+
"cell_type": "code",
192+
"execution_count": 15,
193+
"id": "3aec5719-c3a1-4d18-92c8-2b0c2f4bb939",
194+
"metadata": {},
195+
"outputs": [
196+
{
197+
"data": {
198+
"text/plain": [
199+
"DataFrame[text: string, label: bigint]"
200+
]
201+
},
202+
"execution_count": 15,
203+
"metadata": {},
204+
"output_type": "execute_result"
205+
}
206+
],
207+
"source": [
208+
"test_df.cache()"
209+
]
210+
},
211+
{
212+
"cell_type": "code",
213+
"execution_count": 16,
214+
"id": "d605289d-361d-4a6c-9b70-f7ccdff3aa9d",
215+
"metadata": {},
75216
"outputs": [
76217
{
77218
"name": "stderr",
78219
"output_type": "stream",
79220
"text": [
80-
"[Stage 5:> (0 + 1) / 1]"
221+
" "
81222
]
82223
},
224+
{
225+
"data": {
226+
"text/plain": [
227+
"1066"
228+
]
229+
},
230+
"execution_count": 16,
231+
"metadata": {},
232+
"output_type": "execute_result"
233+
}
234+
],
235+
"source": [
236+
"test_df.count()"
237+
]
238+
},
239+
{
240+
"cell_type": "code",
241+
"execution_count": 18,
242+
"id": "df1ad003-1476-4557-811b-31c3888c0030",
243+
"metadata": {},
244+
"outputs": [
83245
{
84246
"name": "stdout",
85247
"output_type": "stream",
86248
"text": [
87249
"+--------------------+-----+\n",
88250
"| text|label|\n",
89251
"+--------------------+-----+\n",
90-
"|the rock is desti...| 1|\n",
91-
"|the gorgeously el...| 1|\n",
92-
"|effective but too...| 1|\n",
93-
"|if you sometimes ...| 1|\n",
94-
"|emerges as someth...| 1|\n",
95-
"|the film provides...| 1|\n",
96-
"|offers that rare ...| 1|\n",
97-
"|perhaps no pictur...| 1|\n",
98-
"|steers turns in a...| 1|\n",
99-
"|take care of my c...| 1|\n",
100-
"|this is a film we...| 1|\n",
101-
"|what really surpr...| 1|\n",
102-
"|( wendigo is ) wh...| 1|\n",
103-
"|one of the greate...| 1|\n",
104-
"|ultimately , it p...| 1|\n",
105-
"|an utterly compel...| 1|\n",
106-
"|illuminating if o...| 1|\n",
107-
"|a masterpiece fou...| 1|\n",
108-
"|the movie's ripe ...| 1|\n",
109-
"|offers a breath o...| 1|\n",
252+
"|lovingly photogra...| 1|\n",
253+
"|consistently clev...| 1|\n",
254+
"|it's like a \" big...| 1|\n",
255+
"|the story gives a...| 1|\n",
256+
"|red dragon \" neve...| 1|\n",
110257
"+--------------------+-----+\n",
111-
"only showing top 20 rows\n",
258+
"only showing top 5 rows\n",
112259
"\n"
113260
]
114-
},
115-
{
116-
"name": "stderr",
117-
"output_type": "stream",
118-
"text": [
119-
" "
120-
]
121261
}
122262
],
123263
"source": [
124-
"df.show()"
264+
"test_df.show(n=5)"
125265
]
126266
},
127-
{
128-
"cell_type": "code",
129-
"id": "873bb4fc-1424-4816-b835-6c2b839d3de4",
130-
"metadata": {},
131-
"source": [
132-
"df.count()"
133-
],
134-
"outputs": [],
135-
"execution_count": null
136-
},
137267
{
138268
"cell_type": "code",
139269
"execution_count": null,
140-
"id": "4a1b895f-fe20-4520-a90d-b17df8e691e4",
270+
"id": "a7f14b91-059e-4894-83d2-4ed74e0adaf9",
141271
"metadata": {},
142272
"outputs": [],
143273
"source": []

pyspark_huggingface/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
from pyspark_huggingface.huggingface import HuggingFaceDatasets
1+
from pyspark_huggingface.huggingface import HuggingFaceDatasets as DefaultSource

0 commit comments

Comments
 (0)