-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathwalmart_to_pickle.py
138 lines (107 loc) · 3.88 KB
/
walmart_to_pickle.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# %%
# Load sales
df_sales = pd.read_csv("sales_train_validation.csv")
# %%
cols_id = {
"item_id": "category",
"dept_id": "category",
"cat_id": "category",
"store_id": "category",
"state_id": "category",
}
# Target columns after unpivot
cols_sales = {"d": "object", "sales_units": np.uint16, **cols_id}
# Redundant column, drop to save memory
df_sales.drop(columns=["id"], inplace=True)
# Unpivot to convert columns [d_1..d_1913] -> columns [d, sales_units]
df = df_sales.melt(
id_vars=cols_id.keys(),
var_name="d", # Column "d" in calendar.csv corresponds with this
value_name="sales_units",
)[cols_sales.keys()].astype(cols_sales)
# %%
# Load calendar and merge into df (using d, provides wm_yr_wk to merge prices later)
cols_calendar = {
"d": "object",
"wm_yr_wk": np.uint32,
"event_name_1": "category",
"event_name_2": "category",
"event_type_1": "category",
"event_type_2": "category",
"snap_CA": np.uint8,
"snap_TX": np.uint8,
"snap_WI": np.uint8,
}
df_calendar = pd.read_csv("calendar.csv", parse_dates=["date"], dtype=cols_calendar)
# %%
cols_calendar_filter = ["date"] + list(
cols_calendar.keys()
) # Filter redundant columns in calendar
df = df.merge(df_calendar[cols_calendar_filter], on="d", how="left",)
df.drop(columns=["d"], inplace=True) # No longer needed
# %%
# Load prices and merge into df (using wm_yr_wk from calendar)
cols_prices = {
"store_id": "category",
"item_id": "category",
"wm_yr_wk": np.uint32,
"sell_price": np.float32,
}
df_prices = pd.read_csv("sell_prices.csv", dtype=cols_prices)
# %%
df = df.merge(df_prices, on=["store_id", "item_id", "wm_yr_wk"], how="left",)
df.drop(columns=["wm_yr_wk"], inplace=True)
# %%
# Check if there are sales with undefined price
print(f"Sales without shelf price: {len(df[df['sell_price'].isna() & (df['sales_units'] != 0)])}")
# Remove rows without prices (product was not available at that time)
print(f"Rows without shelf price and no sales (product disabled): {df['sell_price'].isna().sum()}")
df = df[~df["sell_price"].isna()]
# %%
df.info()
# %%
# Get products grouped by amount of sales
median_sales = (
df.groupby(["item_id", "store_id"])["sales_units"].median().sort_values(ascending=False)
)
median_sales.hist()
median_sales
# %%
def plot_sales_price(df_item_store):
# Avoid filtering id[0] to show better if df_item_store is not properly filtered
item_id = df_item_store["item_id"].astype(str).unique()
store_id = df_item_store["store_id"].astype(str).unique()
title = f"Item: {item_id} | Store: {store_id}"
fig, ax = plt.subplots()
ax.plot(df_item_store["date"], df_item_store["sales_units"], "r-", alpha=0.5)
ax2 = ax.twinx()
ax2.plot(df_item_store["date"], df_item_store["sell_price"], "b-", alpha=0.5)
ax2.set_ylabel("Shelf price")
ax.set_ylabel("Sales units")
ax.set_title(title)
ax.tick_params(axis="x", rotation=90)
# %%
for item_id, store_id in median_sales[:30].index:
# Filter df by item/location
df_item_store = df[(df["item_id"] == item_id) & (df["store_id"] == store_id)]
# Plot sales/price as a function of time
plot_sales_price(df_item_store)
# %%
# Subset of the dataframe to save/train model
mask_X = np.zeros(shape=len(df)).astype(bool)
# E.g: pick products with greater amount of sales
for item_id, store_id in median_sales[:30].index:
mask_X = mask_X | ((df["item_id"] == item_id) & (df["store_id"] == store_id))
# E.g: remove rows previous to 2014
mask_X = mask_X & (df["date"] >= pd.Timestamp(2014, 1, 1))
# Filter and show statistics of selected data
df_X = df[mask_X]
print(f"Rows to use in X: {mask_X.sum()} / {len(df)} ({100 * mask_X.sum()/len(df):.2f}%)")
print(f"Dates: {df_X['date'].min()}/{df_X['date'].max()} | # items: {df_X['item_id'].nunique()}")
# %%
df_X.to_pickle("df_max_sales.pkl")
# %%