LambdaMART.py

import xgboost
from tqdm import tqdm
import numpy as np

#https://staff.fnwi.uva.nl/e.kanoulas/wp-content/uploads/Lecture-8-1-LambdaMart-Demystified.pdf
#http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.180.634&rep=rep1&type=pdf

class LambdaMART:

    def __obj_func(self):
        tree = 0
        def func(y_true, y_pred):
            nonlocal tree
            print("Fitting tree #" + str(tree) + " ...")
            tree+=1

            # make step on each group
            for query_index, indexes in enumerate(self.doc_indexes):
                self.queries[query_index].make_step(y_pred[indexes])

            grads = np.concatenate([query.gradients   for query in self.queries])
            hess  = np.concatenate([query.hessians    for query in self.queries])
            # not to set grads to zero on next step
            hess[hess == 0] = 1.0

            return grads, hess

        return func

    def __data_processing(self, X, y, qid, is_test):
        print("Preprocessing data ...")
        self.X = X
        self.y = np.array(y, dtype = np.int32)
        self.queries = []
        self.doc_indexes = []
        # grop by qid
        self.groups = np.unique(qid)
        for query_index in tqdm(self.groups):
            # For each query add num of groups to doc_index
            offset = np.where(qid == query_index)[0]
            self.doc_indexes.append(offset)

            self.queries.append(LambdaMART.QueriesGroup(self.y[self.doc_indexes[-1]], is_test))

    def __init__(self, **kwargs):
        self.params = kwargs
        self.clf = None

    def fit(self, X, y ,qid):
        self.__data_processing(X, y, qid, False)
        print("Fit ...")
        if self.clf == None:
            self.clf = xgboost.XGBRegressor(objective = self.__obj_func(), **self.params)
        self.clf.fit(X, y)

    def predict(self, X, qid):
        results = self.clf.predict(xgboost.DMatrix(X))
        self.__data_processing(X, np.empty(X.shape[0]), qid, True)
        print('Predict ...')
        for query_index, indexes in enumerate(self.doc_indexes) :
            # get sorted positoins of predicted docs
            self.queries[query_index].make_step(results[indexes])

        results = []
        # because docs indexes are in direct order
        # we need to add offset to each query
        offset = 0
        for query_index, _ in enumerate(self.groups):
            doc_pos = self.queries[query_index].positions

            query_pred = np.full((len(doc_pos)), offset)
            query_pred[doc_pos-1] += np.arange(1, len(doc_pos)+1)

            for pos in query_pred:
                results.append(pos)

            offset += len(self.queries[query_index].positions)

        return results

    def save(self, fname):
        self.clf.save_model(fname)

    def load(self, fname):
        self.clf = xgboost.Booster()
        self.clf.load_model(fname)

    class QueriesGroup:
        def __init__(self, assessor, is_test=False):
            self.is_test = is_test
            self.docs_count = len(assessor)
            if not self.is_test:
                self.assessor = np.array(assessor, dtype=np.uint8)                
                # max dcg to normalize dcg 
                self.best_score = np.sum((2.0 ** np.sort(self.assessor)[::-1] - 1) / np.log(np.arange(2, self.docs_count+2)))

                # possible indexes of docs
                self.permutations = np.tile(np.arange(0, self.docs_count, dtype=np.int8), (self.docs_count, 1))
                self.make_step(np.zeros(self.docs_count))
            else:
                self.make_step(np.ones((self.docs_count, self.docs_count)))

        def make_step(self, new_scores):

            self.scores = np.array(new_scores, dtype = np.int32)

            # sorted positions of docs
            self.positions = np.empty(self.docs_count, dtype=np.int32)
            self.positions[np.argsort(-new_scores)] = np.arange(1, self.docs_count+1)

            if not self.is_test:
                # general sigmoid parameter σ (although it turns out that choice of σ does
                # not make affect the model)
                SIGMA = 1.0

                # in group count permutations 
                # where Ui > Uj
                correct_perm = (self.assessor.reshape((-1, 1)) > self.assessor[self.permutations])
                # else
                wrong_perm = (self.assessor.reshape((-1, 1)) < self.assessor[self.permutations])

                # Z - NDCG (might be other utility difference generated by swapping the rank positions)
                dZ = (-1.0 / np.log(self.positions.reshape(-1, 1)+1) + 1.0 / np.log(self.positions[self.permutations]+1)) * \
                        (((2 ** self.assessor.reshape(-1, 1)) - 1) - ((2 ** self.assessor[self.permutations]) - 1))/(self.best_score if self.best_score > 0 else 1.0) 

                # 1/(1 + e^σ(s_i-s_j))
                p_ij = 1.0 / (1 + np.exp(SIGMA * np.abs((self.scores.reshape((-1, 1)) - self.scores[self.permutations]))))

                # gradient (dC)/(ds_i)
                self.gradients = - np.sum(SIGMA * np.abs(dZ) * p_ij * correct_perm - np.abs(dZ) * p_ij * wrong_perm, axis=1)

                # hessian (d^2C)/(ds_i^2)
                self.hessians = np.sum(np.abs(dZ) * SIGMA * SIGMA * p_ij * (1.0 - p_ij) * (correct_perm + wrong_perm), axis=1)