forked from guangyaooo/MLTemplate
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrandom_forest.py
88 lines (76 loc) · 2.95 KB
/
random_forest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import pandas as pd
import numpy as np
from DecisionTree.decision_tree import DecisionTreeClassifier
class RandomForestClassifier(object):
    """Bagging ensemble of decision trees combined by majority vote.

    Each tree is fitted on its own bootstrap sample (drawn with replacement,
    same size as the training set). A prediction is the label that received
    the most votes among the trees.
    """

    def __init__(self, n_estimators=100, criterion='gini', max_depth=None,
                 d=None,
                 random_state=0):
        """
        Args:
            n_estimators: number of decision trees in the forest
            criterion: {"gini", "entropy", "error"}, default="gini"
                The function to measure the quality of a split. Supported
                criteria are "gini" for the Gini impurity, "entropy" for
                the information gain and "error" for the classification error.
            max_depth: int, default=None
                The maximum depth of the tree. If None, then nodes are expanded
                until all leaves are pure.
            d: int, default=None
                if d is not None, the algorithm will randomly select d features
                without replacement
            random_state: int, default=0
                seed of the bootstrap sampler; tree ``i`` is constructed with
                ``random_state + i`` so that the trees differ from each other
        """
        # Arguments are forwarded positionally, exactly as the tree class
        # expects them: (criterion, max_depth, d, random_state).
        self.estimators = [DecisionTreeClassifier(criterion, max_depth, d,
                                                  random_state + i)
                           for i in range(n_estimators)]
        self.random_state = random_state

    def fit(self, X, y):
        """
        Fit every tree on its own bootstrap sample of the training data.

        Args:
            X: pandas.DataFrame, numpy.ndarray or nested sequence
                N x M, training data
            y: pandas.DataFrame, pandas.Series, numpy.ndarray or sequence
                N, training label
        Returns:
            None
        Raises:
            ValueError: if `y` does not provide one label per row of `X`
        """
        # Anything that is not already frame-like (arrays, nested lists) is
        # wrapped so that positional row selection via `.iloc` is available.
        if isinstance(X, np.ndarray) or not hasattr(X, 'iloc'):
            X = pd.DataFrame(X)

        if isinstance(y, (pd.DataFrame, pd.Series)):
            y = y.values
            # Flatten a single-column frame to 1-D. Unlike np.squeeze this
            # keeps the sample axis when there is only one sample.
            if y.ndim == 2 and y.shape[1] == 1:
                y = y[:, 0]
        else:
            # Lists/tuples cannot be indexed with an index array.
            y = np.asarray(y)

        n_samples, _ = X.shape
        if y.ndim == 0 or y.shape[0] != n_samples:
            raise ValueError(
                'X and y must contain the same number of samples '
                '(X has %d rows, y has shape %s)' % (n_samples, (y.shape,)))

        rgen = np.random.RandomState(self.random_state)
        indices = np.arange(n_samples)
        for estimator in self.estimators:
            # Bootstrap: N draws with replacement from the N training rows.
            sampled_indices = rgen.choice(indices, size=n_samples,
                                          replace=True)
            estimator.fit(X.iloc[sampled_indices], y[sampled_indices])

    def predict(self, X):
        """
        Returns predicted categories of `X`

        Args:
            X: pandas.DataFrame or numpy.ndarray
                input data
        Returns:
            pred_y: numpy.ndarray with the majority-vote category of every
                sample in `X`; ties are resolved in favour of the smallest
                label (first maximum of the sorted unique labels)
        Raises:
            ValueError: if the forest contains no estimators
        """
        if not self.estimators:
            raise ValueError('cannot predict with a forest that has no '
                             'estimators')

        # Row i holds the predictions of tree i for all samples.
        votes = np.asarray([estimator.predict(X)
                            for estimator in self.estimators])
        if votes.shape[1] == 0:
            # No samples to predict: return an empty result instead of
            # failing while splitting the vote matrix.
            return np.empty(0, dtype=votes.dtype)

        winners = []
        for sample in range(votes.shape[1]):
            labels, counts = np.unique(votes[:, sample], return_counts=True)
            winners.append(labels[np.argmax(counts)])
        return np.asarray(winners)
if __name__ == '__main__':
    # Smoke test on the iris dataset. The forest is scored on the very data
    # it was fitted on, so the printed figure is a training accuracy.
    from sklearn.datasets import load_iris

    features, labels = load_iris(return_X_y=True)
    forest = RandomForestClassifier(n_estimators=100, max_depth=1)
    forest.fit(features, labels)
    hits = forest.predict(features) == labels
    print('IRIS Test acc %.4f' % np.mean(hits))