Skip to content

Commit b5297ac

Browse files
modify README.md;
增加zhidemai_automl.ipynb.
1 parent 8fb15db commit b5297ac

File tree

4 files changed

+717
-10
lines changed

4 files changed

+717
-10
lines changed

README.md

+17-10
Original file line numberDiff line numberDiff line change
@@ -13,18 +13,23 @@ AutoX一个高效的自动化机器学习工具,它主要针对于表格类型
1313

1414
- [AutoX是什么?](#AutoX是什么?)
1515
- [目录](#目录)
16+
- [安装](#安装)
1617
- [架构](#架构)
1718
- [快速上手](#快速上手)
1819
- [比赛上分点总结](#比赛上分点总结)
1920
- [效果对比](#效果对比)
2021

2122
<!-- /TOC -->
23+
# 安装
24+
```
25+
1. git clone https://github.com/4paradigm/autox.git
26+
2. cd autox
27+
3. python setup.py install
28+
```
2229

2330
# 架构
2431
```
25-
├── automl
26-
│   ├── CONST.py
27-
│   ├── autox.py
32+
├── autox
2833
│   ├── ensemble
2934
│   ├── feature_engineer
3035
│   ├── feature_selection
@@ -34,11 +39,12 @@ AutoX一个高效的自动化机器学习工具,它主要针对于表格类型
3439
│   ├── models
3540
│   ├── process_data
3641
│   └── util.py
37-
├── data
38-
│   ├── data01
39-
│   └── data02
40-
├── run_demo.py
42+
│   ├── CONST.py
43+
│   ├── autox.py
44+
├── run_oneclick.py
45+
├── demo
4146
└── test
47+
├── setup.py
4248
├── README.md
4349
```
4450

@@ -135,7 +141,7 @@ AutoX类自动为用户管理数据集和数据集信息。
135141
- split_train_test: 将训练集和测试集分开,一般在完成特征工程之后执行
136142
- get_submit: 获得预测结果(中间过程执行了完整的机器学习pipeline,包括数据预处理,特征工程,模型训练,模型调参,模型融合,模型预测等)
137143

138-
# AutoX的整个pipeline包含以下操作,让我们来了解一下其中的具体细节。
144+
# AutoX的pipeline中的操作对应的具体细节:
139145

140146
## 读数据
141147
```
@@ -179,7 +185,8 @@ AutoX类自动为用户管理数据集和数据集信息。
179185
```
180186

181187
- shift特征
182-
188+
```
189+
```
183190

184191
## 模型训练
185192
```
@@ -197,7 +204,7 @@ AutoX支持的模型融合方式包括以下两种,默认情况下,不进行
197204

198205

199206
# 比赛上分点总结:
200-
criteo: 对于nunique很大的特征列,进行分桶操作。例如,对于nunique大于10000的特征,做hash后截断保留4位,再进行label_encode。
207+
kaggle criteo: 对于nunique很大的特征列,进行分桶操作。例如,对于nunique大于10000的特征,做hash后截断保留4位,再进行label_encode。
201208

202209

203210
## 错误排查

demo/zhidemai_autogluon.ipynb

+229
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,229 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"id": "7ecc7207",
7+
"metadata": {
8+
"ExecuteTime": {
9+
"end_time": "2021-07-16T06:46:10.332046Z",
10+
"start_time": "2021-07-16T06:46:08.296274Z"
11+
}
12+
},
13+
"outputs": [],
14+
"source": [
15+
"from autogluon.tabular import TabularDataset, TabularPredictor"
16+
]
17+
},
18+
{
19+
"cell_type": "code",
20+
"execution_count": null,
21+
"id": "6bc8546b",
22+
"metadata": {
23+
"ExecuteTime": {
24+
"end_time": "2021-07-16T06:46:58.348088Z",
25+
"start_time": "2021-07-16T06:46:54.098157Z"
26+
},
27+
"scrolled": true
28+
},
29+
"outputs": [],
30+
"source": [
31+
"train_data = TabularDataset('./data/zhidemai/train.csv')"
32+
]
33+
},
34+
{
35+
"cell_type": "code",
36+
"execution_count": null,
37+
"id": "21370fd8",
38+
"metadata": {
39+
"ExecuteTime": {
40+
"end_time": "2021-07-16T06:47:01.557381Z",
41+
"start_time": "2021-07-16T06:47:01.542237Z"
42+
},
43+
"scrolled": false
44+
},
45+
"outputs": [],
46+
"source": [
47+
"train_data.shape"
48+
]
49+
},
50+
{
51+
"cell_type": "code",
52+
"execution_count": null,
53+
"id": "4ba99057",
54+
"metadata": {
55+
"ExecuteTime": {
56+
"end_time": "2021-07-16T06:47:02.571735Z",
57+
"start_time": "2021-07-16T06:47:02.533065Z"
58+
},
59+
"scrolled": true
60+
},
61+
"outputs": [],
62+
"source": [
63+
"train_data.head()"
64+
]
65+
},
66+
{
67+
"cell_type": "code",
68+
"execution_count": null,
69+
"id": "ec3eacc4",
70+
"metadata": {
71+
"ExecuteTime": {
72+
"end_time": "2021-07-16T06:47:13.690516Z",
73+
"start_time": "2021-07-16T06:47:13.685902Z"
74+
},
75+
"scrolled": true
76+
},
77+
"outputs": [],
78+
"source": [
79+
"label = 'orders_3h_15h'"
80+
]
81+
},
82+
{
83+
"cell_type": "code",
84+
"execution_count": null,
85+
"id": "a225cdb6",
86+
"metadata": {
87+
"ExecuteTime": {
88+
"end_time": "2021-07-16T19:25:45.595950Z",
89+
"start_time": "2021-07-16T06:47:30.624411Z"
90+
},
91+
"scrolled": true
92+
},
93+
"outputs": [],
94+
"source": [
95+
"save_path = 'agModels-zhidemai' # specifies folder to store trained models\n",
96+
"predictor = TabularPredictor(label=label, path=save_path).fit(train_data)"
97+
]
98+
},
99+
{
100+
"cell_type": "code",
101+
"execution_count": null,
102+
"id": "fa84c4eb",
103+
"metadata": {
104+
"ExecuteTime": {
105+
"end_time": "2021-07-16T19:25:46.126045Z",
106+
"start_time": "2021-07-16T19:25:45.598171Z"
107+
},
108+
"scrolled": true
109+
},
110+
"outputs": [],
111+
"source": [
112+
"test_data_nolab = TabularDataset('./data/zhidemai/test.csv')"
113+
]
114+
},
115+
{
116+
"cell_type": "code",
117+
"execution_count": null,
118+
"id": "54cce4ce",
119+
"metadata": {
120+
"ExecuteTime": {
121+
"end_time": "2021-07-16T19:25:51.661647Z",
122+
"start_time": "2021-07-16T19:25:46.128238Z"
123+
}
124+
},
125+
"outputs": [],
126+
"source": [
127+
"y_pred = predictor.predict(test_data_nolab)"
128+
]
129+
},
130+
{
131+
"cell_type": "code",
132+
"execution_count": null,
133+
"id": "ca6cc9bf",
134+
"metadata": {
135+
"ExecuteTime": {
136+
"end_time": "2021-07-19T02:39:52.803717Z",
137+
"start_time": "2021-07-19T02:39:52.791822Z"
138+
},
139+
"scrolled": true
140+
},
141+
"outputs": [],
142+
"source": [
143+
"y_pred"
144+
]
145+
},
146+
{
147+
"cell_type": "code",
148+
"execution_count": null,
149+
"id": "240cd16c",
150+
"metadata": {},
151+
"outputs": [],
152+
"source": [
153+
"id_ = ['article_id']\n",
154+
"target = 'orders_3h_15h'"
155+
]
156+
},
157+
{
158+
"cell_type": "code",
159+
"execution_count": null,
160+
"id": "b4fd354e",
161+
"metadata": {},
162+
"outputs": [],
163+
"source": [
164+
"sub = test_data_nolab[id_].copy()\n",
165+
"sub[target] = list(y_pred.values)"
166+
]
167+
},
168+
{
169+
"cell_type": "code",
170+
"execution_count": null,
171+
"id": "c59b336a",
172+
"metadata": {},
173+
"outputs": [],
174+
"source": [
175+
"sub.to_csv(\"./autogluon_sub.csv\", index = False)"
176+
]
177+
},
178+
{
179+
"cell_type": "code",
180+
"execution_count": null,
181+
"id": "c43926aa",
182+
"metadata": {},
183+
"outputs": [],
184+
"source": []
185+
},
186+
{
187+
"cell_type": "code",
188+
"execution_count": null,
189+
"id": "6b54628e",
190+
"metadata": {},
191+
"outputs": [],
192+
"source": []
193+
}
194+
],
195+
"metadata": {
196+
"kernelspec": {
197+
"display_name": "Python 3",
198+
"language": "python",
199+
"name": "python3"
200+
},
201+
"language_info": {
202+
"codemirror_mode": {
203+
"name": "ipython",
204+
"version": 3
205+
},
206+
"file_extension": ".py",
207+
"mimetype": "text/x-python",
208+
"name": "python",
209+
"nbconvert_exporter": "python",
210+
"pygments_lexer": "ipython3",
211+
"version": "3.7.10"
212+
},
213+
"toc": {
214+
"base_numbering": 1,
215+
"nav_menu": {},
216+
"number_sections": true,
217+
"sideBar": true,
218+
"skip_h1_title": false,
219+
"title_cell": "Table of Contents",
220+
"title_sidebar": "Contents",
221+
"toc_cell": false,
222+
"toc_position": {},
223+
"toc_section_display": true,
224+
"toc_window_display": false
225+
}
226+
},
227+
"nbformat": 4,
228+
"nbformat_minor": 5
229+
}

0 commit comments

Comments
 (0)