Skip to content

Commit a78d6ce

Browse files
sameeragarwaldavies
authored andcommitted
[SPARK-15078] [SQL] Add all TPCDS 1.4 benchmark queries for SparkSQL
## What changes were proposed in this pull request? Now that SparkSQL supports all TPC-DS queries, this patch adds all 99 benchmark queries inside SparkSQL. ## How was this patch tested? Benchmark only Author: Sameer Agarwal <[email protected]> Closes #13188 from sameeragarwal/tpcds-all.
1 parent dcac8e6 commit a78d6ce

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

106 files changed

+4858
-1226
lines changed

dev/.rat-excludes

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,3 +98,4 @@ spark-deps-.*
9898
.*csv
9999
.*tsv
100100
org.apache.spark.scheduler.ExternalClusterManager
101+
.*\.sql
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
WITH customer_total_return AS
2+
( SELECT
3+
sr_customer_sk AS ctr_customer_sk,
4+
sr_store_sk AS ctr_store_sk,
5+
sum(sr_return_amt) AS ctr_total_return
6+
FROM store_returns, date_dim
7+
WHERE sr_returned_date_sk = d_date_sk AND d_year = 2000
8+
GROUP BY sr_customer_sk, sr_store_sk)
9+
SELECT c_customer_id
10+
FROM customer_total_return ctr1, store, customer
11+
WHERE ctr1.ctr_total_return >
12+
(SELECT avg(ctr_total_return) * 1.2
13+
FROM customer_total_return ctr2
14+
WHERE ctr1.ctr_store_sk = ctr2.ctr_store_sk)
15+
AND s_store_sk = ctr1.ctr_store_sk
16+
AND s_state = 'TN'
17+
AND ctr1.ctr_customer_sk = c_customer_sk
18+
ORDER BY c_customer_id
19+
LIMIT 100
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
SELECT
2+
cd_gender,
3+
cd_marital_status,
4+
cd_education_status,
5+
count(*) cnt1,
6+
cd_purchase_estimate,
7+
count(*) cnt2,
8+
cd_credit_rating,
9+
count(*) cnt3,
10+
cd_dep_count,
11+
count(*) cnt4,
12+
cd_dep_employed_count,
13+
count(*) cnt5,
14+
cd_dep_college_count,
15+
count(*) cnt6
16+
FROM
17+
customer c, customer_address ca, customer_demographics
18+
WHERE
19+
c.c_current_addr_sk = ca.ca_address_sk AND
20+
ca_county IN ('Rush County', 'Toole County', 'Jefferson County',
21+
'Dona Ana County', 'La Porte County') AND
22+
cd_demo_sk = c.c_current_cdemo_sk AND
23+
exists(SELECT *
24+
FROM store_sales, date_dim
25+
WHERE c.c_customer_sk = ss_customer_sk AND
26+
ss_sold_date_sk = d_date_sk AND
27+
d_year = 2002 AND
28+
d_moy BETWEEN 1 AND 1 + 3) AND
29+
(exists(SELECT *
30+
FROM web_sales, date_dim
31+
WHERE c.c_customer_sk = ws_bill_customer_sk AND
32+
ws_sold_date_sk = d_date_sk AND
33+
d_year = 2002 AND
34+
d_moy BETWEEN 1 AND 1 + 3) OR
35+
exists(SELECT *
36+
FROM catalog_sales, date_dim
37+
WHERE c.c_customer_sk = cs_ship_customer_sk AND
38+
cs_sold_date_sk = d_date_sk AND
39+
d_year = 2002 AND
40+
d_moy BETWEEN 1 AND 1 + 3))
41+
GROUP BY cd_gender,
42+
cd_marital_status,
43+
cd_education_status,
44+
cd_purchase_estimate,
45+
cd_credit_rating,
46+
cd_dep_count,
47+
cd_dep_employed_count,
48+
cd_dep_college_count
49+
ORDER BY cd_gender,
50+
cd_marital_status,
51+
cd_education_status,
52+
cd_purchase_estimate,
53+
cd_credit_rating,
54+
cd_dep_count,
55+
cd_dep_employed_count,
56+
cd_dep_college_count
57+
LIMIT 100
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
WITH year_total AS (
2+
SELECT
3+
c_customer_id customer_id,
4+
c_first_name customer_first_name,
5+
c_last_name customer_last_name,
6+
c_preferred_cust_flag customer_preferred_cust_flag,
7+
c_birth_country customer_birth_country,
8+
c_login customer_login,
9+
c_email_address customer_email_address,
10+
d_year dyear,
11+
sum(ss_ext_list_price - ss_ext_discount_amt) year_total,
12+
's' sale_type
13+
FROM customer, store_sales, date_dim
14+
WHERE c_customer_sk = ss_customer_sk
15+
AND ss_sold_date_sk = d_date_sk
16+
GROUP BY c_customer_id
17+
, c_first_name
18+
, c_last_name
19+
, d_year
20+
, c_preferred_cust_flag
21+
, c_birth_country
22+
, c_login
23+
, c_email_address
24+
, d_year
25+
UNION ALL
26+
SELECT
27+
c_customer_id customer_id,
28+
c_first_name customer_first_name,
29+
c_last_name customer_last_name,
30+
c_preferred_cust_flag customer_preferred_cust_flag,
31+
c_birth_country customer_birth_country,
32+
c_login customer_login,
33+
c_email_address customer_email_address,
34+
d_year dyear,
35+
sum(ws_ext_list_price - ws_ext_discount_amt) year_total,
36+
'w' sale_type
37+
FROM customer, web_sales, date_dim
38+
WHERE c_customer_sk = ws_bill_customer_sk
39+
AND ws_sold_date_sk = d_date_sk
40+
GROUP BY
41+
c_customer_id, c_first_name, c_last_name, c_preferred_cust_flag, c_birth_country,
42+
c_login, c_email_address, d_year)
43+
SELECT t_s_secyear.customer_preferred_cust_flag
44+
FROM year_total t_s_firstyear
45+
, year_total t_s_secyear
46+
, year_total t_w_firstyear
47+
, year_total t_w_secyear
48+
WHERE t_s_secyear.customer_id = t_s_firstyear.customer_id
49+
AND t_s_firstyear.customer_id = t_w_secyear.customer_id
50+
AND t_s_firstyear.customer_id = t_w_firstyear.customer_id
51+
AND t_s_firstyear.sale_type = 's'
52+
AND t_w_firstyear.sale_type = 'w'
53+
AND t_s_secyear.sale_type = 's'
54+
AND t_w_secyear.sale_type = 'w'
55+
AND t_s_firstyear.dyear = 2001
56+
AND t_s_secyear.dyear = 2001 + 1
57+
AND t_w_firstyear.dyear = 2001
58+
AND t_w_secyear.dyear = 2001 + 1
59+
AND t_s_firstyear.year_total > 0
60+
AND t_w_firstyear.year_total > 0
61+
AND CASE WHEN t_w_firstyear.year_total > 0
62+
THEN t_w_secyear.year_total / t_w_firstyear.year_total
63+
ELSE NULL END
64+
> CASE WHEN t_s_firstyear.year_total > 0
65+
THEN t_s_secyear.year_total / t_s_firstyear.year_total
66+
ELSE NULL END
67+
ORDER BY t_s_secyear.customer_preferred_cust_flag
68+
LIMIT 100
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
SELECT
2+
i_item_desc,
3+
i_category,
4+
i_class,
5+
i_current_price,
6+
sum(ws_ext_sales_price) AS itemrevenue,
7+
sum(ws_ext_sales_price) * 100 / sum(sum(ws_ext_sales_price))
8+
OVER
9+
(PARTITION BY i_class) AS revenueratio
10+
FROM
11+
web_sales, item, date_dim
12+
WHERE
13+
ws_item_sk = i_item_sk
14+
AND i_category IN ('Sports', 'Books', 'Home')
15+
AND ws_sold_date_sk = d_date_sk
16+
AND d_date BETWEEN cast('1999-02-22' AS DATE)
17+
AND (cast('1999-02-22' AS DATE) + INTERVAL 30 days)
18+
GROUP BY
19+
i_item_id, i_item_desc, i_category, i_class, i_current_price
20+
ORDER BY
21+
i_category, i_class, i_item_id, i_item_desc, revenueratio
22+
LIMIT 100
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
SELECT
2+
avg(ss_quantity),
3+
avg(ss_ext_sales_price),
4+
avg(ss_ext_wholesale_cost),
5+
sum(ss_ext_wholesale_cost)
6+
FROM store_sales
7+
, store
8+
, customer_demographics
9+
, household_demographics
10+
, customer_address
11+
, date_dim
12+
WHERE s_store_sk = ss_store_sk
13+
AND ss_sold_date_sk = d_date_sk AND d_year = 2001
14+
AND ((ss_hdemo_sk = hd_demo_sk
15+
AND cd_demo_sk = ss_cdemo_sk
16+
AND cd_marital_status = 'M'
17+
AND cd_education_status = 'Advanced Degree'
18+
AND ss_sales_price BETWEEN 100.00 AND 150.00
19+
AND hd_dep_count = 3
20+
) OR
21+
(ss_hdemo_sk = hd_demo_sk
22+
AND cd_demo_sk = ss_cdemo_sk
23+
AND cd_marital_status = 'S'
24+
AND cd_education_status = 'College'
25+
AND ss_sales_price BETWEEN 50.00 AND 100.00
26+
AND hd_dep_count = 1
27+
) OR
28+
(ss_hdemo_sk = hd_demo_sk
29+
AND cd_demo_sk = ss_cdemo_sk
30+
AND cd_marital_status = 'W'
31+
AND cd_education_status = '2 yr Degree'
32+
AND ss_sales_price BETWEEN 150.00 AND 200.00
33+
AND hd_dep_count = 1
34+
))
35+
AND ((ss_addr_sk = ca_address_sk
36+
AND ca_country = 'United States'
37+
AND ca_state IN ('TX', 'OH', 'TX')
38+
AND ss_net_profit BETWEEN 100 AND 200
39+
) OR
40+
(ss_addr_sk = ca_address_sk
41+
AND ca_country = 'United States'
42+
AND ca_state IN ('OR', 'NM', 'KY')
43+
AND ss_net_profit BETWEEN 150 AND 300
44+
) OR
45+
(ss_addr_sk = ca_address_sk
46+
AND ca_country = 'United States'
47+
AND ca_state IN ('VA', 'TX', 'MS')
48+
AND ss_net_profit BETWEEN 50 AND 250
49+
))
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
WITH cross_items AS
2+
(SELECT i_item_sk ss_item_sk
3+
FROM item,
4+
(SELECT
5+
iss.i_brand_id brand_id,
6+
iss.i_class_id class_id,
7+
iss.i_category_id category_id
8+
FROM store_sales, item iss, date_dim d1
9+
WHERE ss_item_sk = iss.i_item_sk
10+
AND ss_sold_date_sk = d1.d_date_sk
11+
AND d1.d_year BETWEEN 1999 AND 1999 + 2
12+
INTERSECT
13+
SELECT
14+
ics.i_brand_id,
15+
ics.i_class_id,
16+
ics.i_category_id
17+
FROM catalog_sales, item ics, date_dim d2
18+
WHERE cs_item_sk = ics.i_item_sk
19+
AND cs_sold_date_sk = d2.d_date_sk
20+
AND d2.d_year BETWEEN 1999 AND 1999 + 2
21+
INTERSECT
22+
SELECT
23+
iws.i_brand_id,
24+
iws.i_class_id,
25+
iws.i_category_id
26+
FROM web_sales, item iws, date_dim d3
27+
WHERE ws_item_sk = iws.i_item_sk
28+
AND ws_sold_date_sk = d3.d_date_sk
29+
AND d3.d_year BETWEEN 1999 AND 1999 + 2) x
30+
WHERE i_brand_id = brand_id
31+
AND i_class_id = class_id
32+
AND i_category_id = category_id
33+
),
34+
avg_sales AS
35+
(SELECT avg(quantity * list_price) average_sales
36+
FROM (
37+
SELECT
38+
ss_quantity quantity,
39+
ss_list_price list_price
40+
FROM store_sales, date_dim
41+
WHERE ss_sold_date_sk = d_date_sk
42+
AND d_year BETWEEN 1999 AND 2001
43+
UNION ALL
44+
SELECT
45+
cs_quantity quantity,
46+
cs_list_price list_price
47+
FROM catalog_sales, date_dim
48+
WHERE cs_sold_date_sk = d_date_sk
49+
AND d_year BETWEEN 1999 AND 1999 + 2
50+
UNION ALL
51+
SELECT
52+
ws_quantity quantity,
53+
ws_list_price list_price
54+
FROM web_sales, date_dim
55+
WHERE ws_sold_date_sk = d_date_sk
56+
AND d_year BETWEEN 1999 AND 1999 + 2) x)
57+
SELECT
58+
channel,
59+
i_brand_id,
60+
i_class_id,
61+
i_category_id,
62+
sum(sales),
63+
sum(number_sales)
64+
FROM (
65+
SELECT
66+
'store' channel,
67+
i_brand_id,
68+
i_class_id,
69+
i_category_id,
70+
sum(ss_quantity * ss_list_price) sales,
71+
count(*) number_sales
72+
FROM store_sales, item, date_dim
73+
WHERE ss_item_sk IN (SELECT ss_item_sk
74+
FROM cross_items)
75+
AND ss_item_sk = i_item_sk
76+
AND ss_sold_date_sk = d_date_sk
77+
AND d_year = 1999 + 2
78+
AND d_moy = 11
79+
GROUP BY i_brand_id, i_class_id, i_category_id
80+
HAVING sum(ss_quantity * ss_list_price) > (SELECT average_sales
81+
FROM avg_sales)
82+
UNION ALL
83+
SELECT
84+
'catalog' channel,
85+
i_brand_id,
86+
i_class_id,
87+
i_category_id,
88+
sum(cs_quantity * cs_list_price) sales,
89+
count(*) number_sales
90+
FROM catalog_sales, item, date_dim
91+
WHERE cs_item_sk IN (SELECT ss_item_sk
92+
FROM cross_items)
93+
AND cs_item_sk = i_item_sk
94+
AND cs_sold_date_sk = d_date_sk
95+
AND d_year = 1999 + 2
96+
AND d_moy = 11
97+
GROUP BY i_brand_id, i_class_id, i_category_id
98+
HAVING sum(cs_quantity * cs_list_price) > (SELECT average_sales FROM avg_sales)
99+
UNION ALL
100+
SELECT
101+
'web' channel,
102+
i_brand_id,
103+
i_class_id,
104+
i_category_id,
105+
sum(ws_quantity * ws_list_price) sales,
106+
count(*) number_sales
107+
FROM web_sales, item, date_dim
108+
WHERE ws_item_sk IN (SELECT ss_item_sk
109+
FROM cross_items)
110+
AND ws_item_sk = i_item_sk
111+
AND ws_sold_date_sk = d_date_sk
112+
AND d_year = 1999 + 2
113+
AND d_moy = 11
114+
GROUP BY i_brand_id, i_class_id, i_category_id
115+
HAVING sum(ws_quantity * ws_list_price) > (SELECT average_sales
116+
FROM avg_sales)
117+
) y
118+
GROUP BY ROLLUP (channel, i_brand_id, i_class_id, i_category_id)
119+
ORDER BY channel, i_brand_id, i_class_id, i_category_id
120+
LIMIT 100

0 commit comments

Comments
 (0)