Skip to content
This repository was archived by the owner on May 9, 2024. It is now read-only.

Commit 6a0c785

Browse files
committed
[Bench] Reference for performance analysis
This is an example that compares HDK with Pandas performance. Can be modified by increasing N (originally was 10). See also: #567 Signed-off-by: Dmitrii Makarenko <[email protected]>
1 parent 9d3fcf0 commit 6a0c785

File tree

1 file changed

+130
-0
lines changed

1 file changed

+130
-0
lines changed

join_compileWU_slow_ref.py

+130
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
#!/usr/bin/env python
2+
3+
import sys
4+
import pyhdk
5+
import pandas as pd
6+
import numpy as np
7+
import time
8+
9+
10+
def compare_tables(
    left_df: pd.DataFrame, right_df: pd.DataFrame, *, try_to_guess: bool = True
) -> None:
    """Check that two tables hold the same rows, ignoring row and column order.

    Column names of both tables are sorted and compared position by position.
    Where the names disagree, the right-hand column is renamed to the
    left-hand name (``try_to_guess=True``) or both columns are dropped
    (``try_to_guess=False``). Columns that are categorical in the left table
    are compared as strings. Rows are sorted by every column and the tables
    are then compared cell by cell; cells missing on both sides count as equal.

    Neither input frame is modified. Progress information is printed to
    stdout.

    Args:
        left_df: Reference table.
        right_df: Table to check against ``left_df``.
        try_to_guess: Rename mismatching right-hand columns instead of
            dropping the mismatching columns from both tables.

    Raises:
        AssertionError: If the column names still differ after the
            rename/drop step.
        RuntimeError: If the row counts or any cell values differ.
    """
    left_cols = sorted(left_df.columns.to_list())
    right_cols = sorted(right_df.columns.to_list())

    # zip() stops at the shorter list, so a table with extra columns falls
    # through to the name check below instead of raising IndexError here.
    diff_idx = [
        idx
        for idx, (left_name, right_name) in enumerate(zip(left_cols, right_cols))
        if left_name != right_name
    ]

    print("compare lists: ", diff_idx)
    drop_left = [left_cols[idx] for idx in diff_idx]
    drop_right = [right_cols[idx] for idx in diff_idx]
    if try_to_guess:
        right_df = right_df.rename(columns=dict(zip(drop_right, drop_left)))
    else:
        print("cols: ", left_cols, " drops: ", drop_left)
        print("cols: ", right_cols, " drops: ", drop_right)
        left_df = left_df.drop(columns=drop_left)
        right_df = right_df.drop(columns=drop_right)

    left_cols = sorted(left_df.columns.to_list())
    right_cols = sorted(right_df.columns.to_list())

    print("cols: r - ", right_cols, " l - ", left_cols)

    # Raised explicitly (not via `assert`) so the check survives `python -O`.
    if left_cols != right_cols:
        raise AssertionError("Table column names are different")

    # Put both frames in the same column order: DataFrame `==` refuses to
    # compare frames whose labels are not identically ordered. Selecting by
    # label also yields new frames, so the caller's data stays untouched.
    left_df = left_df[left_cols]
    right_df = right_df[left_cols]

    # Cast categoricals to strings before sorting so that both sides are
    # ordered by value rather than by each side's own category order.
    categorical_cols = [
        col
        for col in left_cols
        if isinstance(left_df[col].dtype, pd.CategoricalDtype)
    ]
    if categorical_cols:
        as_str = {col: "str" for col in categorical_cols}
        left_df = left_df.astype(as_str)
        right_df = right_df.astype(as_str)

    left_df = left_df.sort_values(by=left_cols).reset_index(drop=True)
    right_df = right_df.sort_values(by=left_cols).reset_index(drop=True)

    print("l dtypes \n", left_df.dtypes)
    print("r dtypes \n", right_df.dtypes)

    print("l size: ", left_df.size, " - r size: ", right_df.size)

    # Different row counts cannot be compared cell by cell.
    if left_df.shape != right_df.shape:
        raise RuntimeError("Results mismatched")

    # NaN != NaN under `==`, so treat "missing on both sides" as a match.
    same = (left_df == right_df) | (left_df.isna() & right_df.isna())
    if not same.to_numpy().all():
        bad_rows = ~same.all(axis=1)
        print("Mismathed left: ")
        print(left_df[bad_rows])
        print(" right: ")
        print(right_df[bad_rows])
        raise RuntimeError("Results mismatched")
64+
65+
66+
# Options forwarded as keyword arguments to pyhdk.init().
pyhdk_init_args = {
    "enable_debug_timer": True,
    "enable_cpu_groupby_multifrag_kernels": False,
    # Uncomment to also pass debug_logs=True:
    # "debug_logs": True,
}
hdk = pyhdk.init(**pyhdk_init_args)
# NOTE(review): not referenced by the rest of the visible script.
fragment_size = 4000000
72+
73+
# Number of tables to generate; the benchmark joins them one after another.
N = 2
# Number of rows in every generated table.
ROWS = 15000

# Fixed seed so each run works on the same generated data.
np.random.seed(1)
column_list = []
df_list = []
for num in range(N):
    # Every table shares the three key columns and gets four payload columns
    # whose names carry the table index (A0..D0, A1..D1, ...). The draw order
    # below determines the generated values, so keep it stable.
    df_setup = {
        "column_1": np.random.randint(0, 150, size=ROWS),
        "column_3": np.random.randint(0, 6, size=ROWS),
        "column_5": np.random.randint(0, 10, size=ROWS),
        "A" + str(num): np.random.randint(0, 100, size=ROWS),
        "B" + str(num): np.random.randint(0, 100, size=ROWS),
        "C" + str(num): np.random.randint(0, 100, size=ROWS),
        "D" + str(num): np.random.randint(0, 100, size=ROWS),
    }
    column_list.append(df_setup)
    df_list.append(pd.DataFrame(df_setup))
93+
94+
# HDK side of the benchmark: join the generated tables one after another on
# the three shared key columns. The timed region includes every conversion
# between pandas, Python dicts and HDK tables, and the intermediate result is
# converted back to a dict and re-imported on each iteration.
# perf_counter() is used because it is monotonic and intended for measuring
# elapsed intervals.
t1 = time.perf_counter()
for idx, df in enumerate(df_list):
    if idx == 0:
        # The first table only seeds the running result.
        df_base = df.copy()
        df_base = df_base.to_dict("list")
        continue

    if isinstance(df_base, dict):
        ht_base = hdk.import_pydict(df_base)
    else:
        # NOTE(review): df_base is always a dict in the visible code, so this
        # branch is currently never taken.
        ht_base = df_base
    df_r = hdk.import_pydict(df.to_dict("list"))
    df_ans = ht_base.join(
        df_r,
        ["column_1", "column_3", "column_5"],
        ["column_1", "column_3", "column_5"],
    ).run()
    df_base = df_ans.to_arrow().to_pandas().to_dict("list")
print("Hdk Time:", time.perf_counter() - t1)
112+
113+
# pandas side of the benchmark: inner-join the same tables one after another
# on the three shared key columns.
# perf_counter() is used because it is monotonic and intended for measuring
# elapsed intervals.
pd_join_keys = ["column_1", "column_3", "column_5"]
t2 = time.perf_counter()
for idx, df in enumerate(df_list):
    if idx == 0:
        # The first table only seeds the running result.
        df_base_pd = df.copy()
    else:
        df_base_pd = pd.merge(
            df_base_pd,
            df,
            left_on=pd_join_keys,
            right_on=pd_join_keys,
            how="inner",
        )
print("Pandas Time:", time.perf_counter() - t2)
126+
127+
# Report both result shapes, then verify that the HDK and pandas join
# results match. df_base is the dict produced by the HDK loop, so it is
# wrapped in a DataFrame once and reused.
hdk_result = pd.DataFrame(df_base)
print("[hdk] shape: ", hdk_result.shape)
print("[ pd] shape: ", df_base_pd.shape)
print("compare: ")
compare_tables(hdk_result, df_base_pd)

0 commit comments

Comments
 (0)