-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLambdaMART.py
133 lines (103 loc) · 5.22 KB
/
LambdaMART.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import xgboost
from tqdm import tqdm
import numpy as np
#https://staff.fnwi.uva.nl/e.kanoulas/wp-content/uploads/Lecture-8-1-LambdaMart-Demystified.pdf
#http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.180.634&rep=rep1&type=pdf
class LambdaMART:
def __obj_func(self):
tree = 0
def func(y_true, y_pred):
nonlocal tree
print("Fitting tree #" + str(tree) + " ...")
tree+=1
# make step on each group
for query_index, indexes in enumerate(self.doc_indexes):
self.queries[query_index].make_step(y_pred[indexes])
grads = np.concatenate([query.gradients for query in self.queries])
hess = np.concatenate([query.hessians for query in self.queries])
# not to set grads to zero on next step
hess[hess == 0] = 1.0
return grads, hess
return func
def __data_processing(self, X, y, qid, is_test):
print("Preprocessing data ...")
self.X = X
self.y = np.array(y, dtype = np.int32)
self.queries = []
self.doc_indexes = []
# grop by qid
self.groups = np.unique(qid)
for query_index in tqdm(self.groups):
# For each query add num of groups to doc_index
offset = np.where(qid == query_index)[0]
self.doc_indexes.append(offset)
self.queries.append(LambdaMART.QueriesGroup(self.y[self.doc_indexes[-1]], is_test))
def __init__(self, **kwargs):
self.params = kwargs
self.clf = None
def fit(self, X, y ,qid):
self.__data_processing(X, y, qid, False)
print("Fit ...")
if self.clf == None:
self.clf = xgboost.XGBRegressor(objective = self.__obj_func(), **self.params)
self.clf.fit(X, y)
def predict(self, X, qid):
results = self.clf.predict(xgboost.DMatrix(X))
self.__data_processing(X, np.empty(X.shape[0]), qid, True)
print('Predict ...')
for query_index, indexes in enumerate(self.doc_indexes) :
# get sorted positoins of predicted docs
self.queries[query_index].make_step(results[indexes])
results = []
# because docs indexes are in direct order
# we need to add offset to each query
offset = 0
for query_index, _ in enumerate(self.groups):
doc_pos = self.queries[query_index].positions
query_pred = np.full((len(doc_pos)), offset)
query_pred[doc_pos-1] += np.arange(1, len(doc_pos)+1)
for pos in query_pred:
results.append(pos)
offset += len(self.queries[query_index].positions)
return results
def save(self, fname):
self.clf.save_model(fname)
def load(self, fname):
self.clf = xgboost.Booster()
self.clf.load_model(fname)
class QueriesGroup:
def __init__(self, assessor, is_test=False):
self.is_test = is_test
self.docs_count = len(assessor)
if not self.is_test:
self.assessor = np.array(assessor, dtype=np.uint8)
# max dcg to normalize dcg
self.best_score = np.sum((2.0 ** np.sort(self.assessor)[::-1] - 1) / np.log(np.arange(2, self.docs_count+2)))
# possible indexes of docs
self.permutations = np.tile(np.arange(0, self.docs_count, dtype=np.int8), (self.docs_count, 1))
self.make_step(np.zeros(self.docs_count))
else:
self.make_step(np.ones((self.docs_count, self.docs_count)))
def make_step(self, new_scores):
self.scores = np.array(new_scores, dtype = np.int32)
# sorted positions of docs
self.positions = np.empty(self.docs_count, dtype=np.int32)
self.positions[np.argsort(-new_scores)] = np.arange(1, self.docs_count+1)
if not self.is_test:
# general sigmoid parameter σ (although it turns out that choice of σ does
# not make affect the model)
SIGMA = 1.0
# in group count permutations
# where Ui > Uj
correct_perm = (self.assessor.reshape((-1, 1)) > self.assessor[self.permutations])
# else
wrong_perm = (self.assessor.reshape((-1, 1)) < self.assessor[self.permutations])
# Z - NDCG (might be other utility difference generated by swapping the rank positions)
dZ = (-1.0 / np.log(self.positions.reshape(-1, 1)+1) + 1.0 / np.log(self.positions[self.permutations]+1)) * \
(((2 ** self.assessor.reshape(-1, 1)) - 1) - ((2 ** self.assessor[self.permutations]) - 1))/(self.best_score if self.best_score > 0 else 1.0)
# 1/(1 + e^σ(s_i-s_j))
p_ij = 1.0 / (1 + np.exp(SIGMA * np.abs((self.scores.reshape((-1, 1)) - self.scores[self.permutations]))))
# gradient (dC)/(ds_i)
self.gradients = - np.sum(SIGMA * np.abs(dZ) * p_ij * correct_perm - np.abs(dZ) * p_ij * wrong_perm, axis=1)
# hessian (d^2C)/(ds_i^2)
self.hessians = np.sum(np.abs(dZ) * SIGMA * SIGMA * p_ij * (1.0 - p_ij) * (correct_perm + wrong_perm), axis=1)