From acdc469e39a193882f4559240b202eccb66cf0c8 Mon Sep 17 00:00:00 2001 From: meng-ustc Date: Mon, 1 Feb 2021 21:05:34 +0900 Subject: [PATCH 01/11] Add A New Baseline: DoubleEnsemble --- .../DoubleEnsemble/requirements.txt | 3 + ...rkflow_config_doubleensemble_Alpha158.yaml | 91 +++++++ ...rkflow_config_doubleensemble_Alpha360.yaml | 98 +++++++ qlib/contrib/model/double_ensemble.py | 241 ++++++++++++++++++ 4 files changed, 433 insertions(+) create mode 100644 examples/benchmarks/DoubleEnsemble/requirements.txt create mode 100644 examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml create mode 100644 examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360.yaml create mode 100644 qlib/contrib/model/double_ensemble.py diff --git a/examples/benchmarks/DoubleEnsemble/requirements.txt b/examples/benchmarks/DoubleEnsemble/requirements.txt new file mode 100644 index 0000000000..f9117361d5 --- /dev/null +++ b/examples/benchmarks/DoubleEnsemble/requirements.txt @@ -0,0 +1,3 @@ +pandas==1.1.2 +numpy==1.17.4 +lightgbm==3.1.0 \ No newline at end of file diff --git a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml new file mode 100644 index 0000000000..b5c669c28f --- /dev/null +++ b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml @@ -0,0 +1,91 @@ +qlib_init: + provider_uri: "~/.qlib/qlib_data/cn_data" + region: cn +market: &market csi300 +benchmark: &benchmark SH000300 +data_handler_config: &data_handler_config + start_time: 2008-01-01 + end_time: 2020-08-01 + fit_start_time: 2008-01-01 + fit_end_time: 2014-12-31 + instruments: *market +port_analysis_config: &port_analysis_config + strategy: + class: TopkDropoutStrategy + module_path: qlib.contrib.strategy.strategy + kwargs: + topk: 50 + n_drop: 5 + backtest: + verbose: False + limit_threshold: 0.095 + account: 100000000 + benchmark: *benchmark + deal_price: close + open_cost: 0.0005 + close_cost: 0.0015 + min_cost: 5 +task: + model: + class: DEnsembleModel + module_path: qlib.contrib.model.double_ensemble + kwargs: + base: "gbm" + loss: mse + k: 6 + enable_sr: True + enable_fs: True + alpha1: 1 + alpha2: 1 + bins_sr: 10 + bins_fs: 5 + decay: 0.5 + sample_ratios: + - 0.8 + - 0.7 + - 0.6 + - 0.5 + - 0.4 + sub_weights: + - 1 + - 0.2 + - 0.2 + - 0.2 + - 0.2 + - 0.2 + colsample_bytree: 0.8879 + learning_rate: 0.2 + subsample: 0.8789 + lambda_l1: 205.6999 + lambda_l2: 580.9768 + max_depth: 8 + num_leaves: 210 + num_threads: 20 + verbosity: -1 + num_iterations: 28 + early_stopping_round: None + dataset: + class: DatasetH + module_path: qlib.data.dataset + kwargs: + handler: + class: Alpha158 + module_path: qlib.contrib.data.handler + kwargs: *data_handler_config + segments: + train: [2008-01-01, 2014-12-31] + valid: [2015-01-01, 2016-12-31] + test: [2017-01-01, 2020-08-01] + record: + - class: SignalRecord + module_path: qlib.workflow.record_temp + kwargs: {} + - class: SigAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + ana_long_short: False + ann_scaler: 252 + - class: PortAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + config: *port_analysis_config \ No newline at end of file diff --git a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360.yaml b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360.yaml new file mode 100644 index 0000000000..b31eac4cef --- /dev/null +++ 
b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360.yaml @@ -0,0 +1,98 @@ +qlib_init: + provider_uri: "~/.qlib/qlib_data/cn_data" + region: cn +market: &market csi300 +benchmark: &benchmark SH000300 +data_handler_config: &data_handler_config + start_time: 2008-01-01 + end_time: 2020-08-01 + fit_start_time: 2008-01-01 + fit_end_time: 2014-12-31 + instruments: *market + infer_processors: [] + learn_processors: + - class: DropnaLabel + - class: CSRankNorm + kwargs: + fields_group: label + label: ["Ref($close, -2) / Ref($close, -1) - 1"] +port_analysis_config: &port_analysis_config + strategy: + class: TopkDropoutStrategy + module_path: qlib.contrib.strategy.strategy + kwargs: + topk: 50 + n_drop: 5 + backtest: + verbose: False + limit_threshold: 0.095 + account: 100000000 + benchmark: *benchmark + deal_price: close + open_cost: 0.0005 + close_cost: 0.0015 + min_cost: 5 +task: + model: + class: DEnsembleModel + module_path: qlib.contrib.model.double_ensemble + kwargs: + base: "gbm" + loss: mse + k: 6 + enable_sr: True + enable_fs: True + alpha1: 1 + alpha2: 1 + bins_sr: 10 + bins_fs: 5 + decay: 0.5 + sample_ratios: + - 0.8 + - 0.7 + - 0.6 + - 0.5 + - 0.4 + sub_weights: + - 1 + - 0.2 + - 0.2 + - 0.2 + - 0.2 + - 0.2 + colsample_bytree: 0.8879 + learning_rate: 0.0421 + subsample: 0.8789 + lambda_l1: 205.6999 + lambda_l2: 580.9768 + max_depth: 8 + num_leaves: 210 + num_threads: 20 + verbosity: -1 + num_iterations: 28 + early_stopping_round: None + dataset: + class: DatasetH + module_path: qlib.data.dataset + kwargs: + handler: + class: Alpha360 + module_path: qlib.contrib.data.handler + kwargs: *data_handler_config + segments: + train: [2008-01-01, 2014-12-31] + valid: [2015-01-01, 2016-12-31] + test: [2017-01-01, 2020-08-01] + record: + - class: SignalRecord + module_path: qlib.workflow.record_temp + kwargs: {} + - class: SigAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + ana_long_short: False + ann_scaler: 252 + - class: PortAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + config: *port_analysis_config \ No newline at end of file diff --git a/qlib/contrib/model/double_ensemble.py b/qlib/contrib/model/double_ensemble.py new file mode 100644 index 0000000000..68680b8145 --- /dev/null +++ b/qlib/contrib/model/double_ensemble.py @@ -0,0 +1,241 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +import lightgbm as lgb +import numpy as np +import pandas as pd + +from ...model.base import Model +from ...data.dataset import DatasetH +from ...data.dataset.handler import DataHandlerLP +from ...log import get_module_logger + + +class DEnsembleModel(Model): + """Double Ensemble Model""" + + def __init__( + self, + base="gbm", + loss="mse", + k=6, + enable_sr=True, + enable_fs=True, + alpha1=1., + alpha2=1., + bins_sr=10, + bins_fs=5, + decay=None, + sample_ratios=None, + sub_weights=None, + **kwargs): + self.base = base # "gbm" or "mlp", specifically, we use lgbm for "gbm" + self.k = k + self.enable_sr = enable_sr + self.enable_fs = enable_fs + self.alpha1 = alpha1 + self.alpha2 = alpha2 + self.bins_sr = bins_sr + self.bins_fs = bins_fs + self.decay = decay + if not len(sample_ratios) == bins_fs: + raise ValueError("The length of sample_ratios should be equal to bins_fs.") + self.sample_ratios = sample_ratios + if not len(sub_weights) == k: + raise ValueError("The length of sub_weights should be equal to k.") + self.sub_weights = sub_weights + self.logger = get_module_logger("DEnsembleModel") + self.logger.info("Double Ensemble Model...") + self.ensemble = [] # the current ensemble model, a list contains all the sub-models + self.sub_features = [] # the features for each sub model in the form of pandas.Index + self.params = {"objective": loss} + self.params.update(kwargs) + self.loss = loss + + def fit( + self, + dataset: DatasetH + ): + df_train, df_valid = dataset.prepare( + ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L + ) + x_train, y_train = df_train["feature"], df_train["label"] + # initialize the sample weights + N, F = x_train.shape + weights = pd.Series(np.ones(N, dtype=float)) + # initialize the features + features = x_train.columns + pred_sub = pd.DataFrame(np.zeros((N, self.k), dtype=float), index=x_train.index) + # train k sub-models + for i_k in range(self.k): + self.sub_features.append(features) + self.logger.info("Training sub-model: ({}/{})".format(i_k+1, self.k)) + model_k = self.train_submodel(df_train, df_valid, weights, features) + self.ensemble.append(model_k) + # no further sample re-weight and feature selection needed for the last sub-model + if i_k + 1 == self.k: + break + + self.logger.info("Retrieving loss curve and loss values...") + loss_curve = self.retrieve_loss_curve(model_k, df_train, features) + pred_k = self.predict_sub(model_k, df_train, features) + pred_sub.iloc[:, i_k] = pred_k + pred_ensemble = pred_sub.iloc[:, :i_k+1].mean(axis=1) + loss_values = pd.Series(self.get_loss(y_train.values.squeeze(), pred_ensemble.values)) + + if self.enable_sr: + self.logger.info("Sample re-weighting...") + weights = self.sample_reweight(loss_curve, loss_values, i_k+1) + + if self.enable_fs: + self.logger.info("Feature selection...") + features = self.feature_selection(df_train, loss_values) + + def train_submodel(self, df_train, df_valid, weights, features): + dtrain, dvalid = self._prepare_data_gbm(df_train, df_valid, weights, features) + evals_result = dict() + model = lgb.train( + self.params, + dtrain, + valid_sets=[dtrain, dvalid], + valid_names=["train", "valid"], + verbose_eval=20, + evals_result=evals_result, + ) + evals_result["train"] = list(evals_result["train"].values())[0] + evals_result["valid"] = list(evals_result["valid"].values())[0] + return model + + def _prepare_data_gbm(self, df_train, df_valid, weights, features): + x_train, y_train = df_train["feature"].loc[:, features], df_train["label"] + x_valid, y_valid = 
df_valid["feature"].loc[:, features], df_valid["label"]
+
+        # LightGBM needs a 1D array as its label
+        if y_train.values.ndim == 2 and y_train.values.shape[1] == 1:
+            y_train, y_valid = np.squeeze(y_train.values), np.squeeze(y_valid.values)
+        else:
+            raise ValueError("LightGBM doesn't support multi-label training")
+
+        dtrain = lgb.Dataset(x_train.values, label=y_train, weight=weights)
+        dvalid = lgb.Dataset(x_valid.values, label=y_valid)
+        return dtrain, dvalid
+
+    def sample_reweight(self, loss_curve, loss_values, k_th):
+        """
+        the SR module of Double Ensemble
+        :param loss_curve: the shape is NxT
+            the loss curve for the previous sub-model, where the element (i, t) is the error on the i-th sample
+            after the t-th iteration in the training of the previous sub-model.
+        :param loss_values: the shape is N
+            the i-th element is the loss of the current ensemble on the i-th sample.
+        :param k_th: the index of the current sub-model, starting from 1
+        :return: weights
+            the weights for all the samples.
+        """
+        # normalize loss_curve and loss_values with ranking
+        loss_curve_norm = loss_curve.rank(axis=0, pct=True)
+        loss_values_norm = (-loss_values).rank(pct=True)
+
+        # calculate l_start and l_end from loss_curve
+        N, T = loss_curve.shape
+        part = np.maximum(int(T * 0.1), 1)
+        l_start = loss_curve_norm.iloc[:, :part].mean(axis=1)
+        l_end = loss_curve_norm.iloc[:, -part:].mean(axis=1)
+
+        # calculate h-value for each sample
+        h1 = loss_values_norm
+        h2 = (l_end / l_start).rank(pct=True)
+        h = pd.DataFrame({'h_value': self.alpha1 * h1 + self.alpha2 * h2})
+
+        # calculate weights
+        h['bins'] = pd.cut(h['h_value'], self.bins_sr)
+        h_avg = h.groupby('bins')['h_value'].mean()
+        weights = pd.Series(np.zeros(N, dtype=float))
+        for i_b, b in enumerate(h_avg.index):
+            weights[h['bins'] == b] = 1. / (self.decay ** k_th * h_avg[i_b] + 0.1)
+        return weights
+
+    def feature_selection(self, df_train, loss_values):
+        """
+        the FS module of Double Ensemble
+        :param df_train: the shape is NxF
+        :param loss_values: the shape is N
+            the i-th element is the loss of the current ensemble on the i-th sample.
+        :return: res_feat: in the form of pandas.Index
+
+        """
+        x_train, y_train = df_train["feature"], df_train["label"]
+        features = x_train.columns
+        N, F = x_train.shape
+        g = pd.DataFrame({'g_value': np.zeros(F, dtype=float)})
+        M = len(self.ensemble)
+
+        # shuffle specific columns and calculate g-value for each feature
+        x_train_tmp = x_train.copy()
+        for i_f, feat in enumerate(features):
+            x_train_tmp.loc[:, feat] = np.random.permutation(x_train_tmp.loc[:, feat].values)
+            pred = pd.Series(np.zeros(N), index=x_train_tmp.index)
+            for i_s, submodel in enumerate(self.ensemble):
+                pred += pd.Series(submodel.predict(x_train_tmp.loc[:, self.sub_features[i_s]].values),
+                                  index=x_train_tmp.index) / M
+            loss_feat = self.get_loss(y_train.values.squeeze(), pred.values)
+            g.loc[i_f, 'g_value'] = np.mean(loss_feat - loss_values) / np.std(loss_feat - loss_values)
+            x_train_tmp.loc[:, feat] = x_train.loc[:, feat].copy()
+
+        # one column in train features is all-nan # if g['g_value'].isna().any()
+        g['g_value'].replace(np.nan, 0, inplace=True)
+
+        # divide features into bins_fs bins
+        g['bins'] = pd.cut(g['g_value'], self.bins_fs)
+
+        # randomly sample features from bins to construct the new features
+        res_feat = []
+        sorted_bins = sorted(g['bins'].unique(), reverse=True)
+        for i_b, b in enumerate(sorted_bins):
+            b_feat = features[g['bins'] == b]
+            num_feat = int(np.ceil(self.sample_ratios[i_b] * len(b_feat)))
+            res_feat = res_feat + np.random.choice(b_feat, size=num_feat).tolist()
+        return pd.Index(res_feat)
+
+    def get_loss(self, label, pred):
+        if self.loss == "mse":
+            return (label - pred) ** 2
+        else:
+            raise ValueError("not implemented yet")
+
+    def retrieve_loss_curve(self, model, df_train, features):
+        if self.base == "gbm":
+            num_trees = model.num_trees()
+            x_train, y_train = df_train["feature"].loc[:, features], df_train["label"]
+            # LightGBM needs a 1D array as its label
+            if y_train.values.ndim == 2 and y_train.values.shape[1] == 1:
+                y_train = np.squeeze(y_train.values)
+            else:
+                raise ValueError("LightGBM doesn't support multi-label training")
+
+            N = x_train.shape[0]
+            loss_curve = pd.DataFrame(np.zeros((N, num_trees)))
+            pred_tree = np.zeros(N, dtype=float)
+            for i_tree in range(num_trees):
+                pred_tree += model.predict(x_train.values, start_iteration=i_tree, num_iteration=1)
+                loss_curve.iloc[:, i_tree] = self.get_loss(y_train, pred_tree)
+        else:
+            raise ValueError("not implemented yet")
+        return loss_curve
+
+    def predict(self, dataset):
+        if self.ensemble is None:
+            raise ValueError("model is not fitted yet!")
+        x_test = dataset.prepare("test", col_set="feature", data_key=DataHandlerLP.DK_I)
+        pred = pd.Series(np.zeros(x_test.shape[0]), index=x_test.index)
+        for i_sub, submodel in enumerate(self.ensemble):
+            feat_sub = self.sub_features[i_sub]
+            pred += pd.Series(submodel.predict(x_test.loc[:, feat_sub].values), index=x_test.index) * self.sub_weights[i_sub]
+        return pred
+
+    def predict_sub(self, submodel, df_data, features):
+        x_data, y_data = df_data["feature"].loc[:, features], df_data["label"]
+        pred_sub = pd.Series(submodel.predict(x_data.values), index=x_data.index)
+        return pred_sub
+
+

From 8c3ec164ff5bb73fea2cc0d63f91776148ed2760 Mon Sep 17 00:00:00 2001
From: meng-ustc
Date: Tue, 2 Feb 2021 11:46:37 +0900
Subject: [PATCH 02/11] Add A New Baseline: DoubleEnsemble

---
 ...rkflow_config_doubleensemble_Alpha158.yaml |  5 +-
 ...rkflow_config_doubleensemble_Alpha360.yaml | 47 +++++++++----------
 examples/run_all_model.py                     |  2 +-
 qlib/contrib/model/double_ensemble.py         |  3 ++
 4 files
changed, 29 insertions(+), 28 deletions(-) diff --git a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml index b5c669c28f..c468eb320a 100644 --- a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml +++ b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml @@ -33,7 +33,7 @@ task: base: "gbm" loss: mse k: 6 - enable_sr: True + enable_sr: False enable_fs: True alpha1: 1 alpha2: 1 @@ -53,6 +53,7 @@ task: - 0.2 - 0.2 - 0.2 + epochs: 28 colsample_bytree: 0.8879 learning_rate: 0.2 subsample: 0.8789 @@ -62,8 +63,6 @@ task: num_leaves: 210 num_threads: 20 verbosity: -1 - num_iterations: 28 - early_stopping_round: None dataset: class: DatasetH module_path: qlib.data.dataset diff --git a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360.yaml b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360.yaml index b31eac4cef..3351cefc5a 100644 --- a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360.yaml +++ b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360.yaml @@ -38,28 +38,29 @@ task: module_path: qlib.contrib.model.double_ensemble kwargs: base: "gbm" - loss: mse - k: 6 - enable_sr: True - enable_fs: True - alpha1: 1 - alpha2: 1 - bins_sr: 10 - bins_fs: 5 - decay: 0.5 - sample_ratios: - - 0.8 - - 0.7 - - 0.6 - - 0.5 - - 0.4 - sub_weights: - - 1 - - 0.2 - - 0.2 - - 0.2 - - 0.2 - - 0.2 + loss: mse + k: 6 + enable_sr: True + enable_fs: True + alpha1: 1 + alpha2: 1 + bins_sr: 10 + bins_fs: 5 + decay: 0.5 + sample_ratios: + - 0.8 + - 0.7 + - 0.6 + - 0.5 + - 0.4 + sub_weights: + - 1 + - 0.2 + - 0.2 + - 0.2 + - 0.2 + - 0.2 + epochs: 136 colsample_bytree: 0.8879 learning_rate: 0.0421 subsample: 0.8789 @@ -69,8 +70,6 @@ task: num_leaves: 210 num_threads: 20 verbosity: -1 - num_iterations: 28 - early_stopping_round: None dataset: class: DatasetH module_path: qlib.data.dataset diff --git a/examples/run_all_model.py b/examples/run_all_model.py index d587eff155..1809bac2e5 100644 --- a/examples/run_all_model.py +++ b/examples/run_all_model.py @@ -265,7 +265,7 @@ def run(times=1, models=None, dataset="Alpha360", exclude=False): ) # TODO: FIX ME! else: execute( - f"cd {env_path} && {python_path} -m pip install --upgrade --force-reinstall -e git+https://github.com/microsoft/qlib#egg=pyqlib" + f"cd {env_path} && {python_path} -m pip install --upgrade --force-reinstall -e git+https://github.com/meng-ustc/qlib#egg=pyqlib" ) # TODO: FIX ME! 
sys.stderr.write("\n") # run workflow_by_config for multiple times diff --git a/qlib/contrib/model/double_ensemble.py b/qlib/contrib/model/double_ensemble.py index 68680b8145..786b3987cf 100644 --- a/qlib/contrib/model/double_ensemble.py +++ b/qlib/contrib/model/double_ensemble.py @@ -28,6 +28,7 @@ def __init__( decay=None, sample_ratios=None, sub_weights=None, + epochs=100, **kwargs): self.base = base # "gbm" or "mlp", specifically, we use lgbm for "gbm" self.k = k @@ -44,6 +45,7 @@ def __init__( if not len(sub_weights) == k: raise ValueError("The length of sub_weights should be equal to k.") self.sub_weights = sub_weights + self.epochs = epochs self.logger = get_module_logger("DEnsembleModel") self.logger.info("Double Ensemble Model...") self.ensemble = [] # the current ensemble model, a list contains all the sub-models @@ -97,6 +99,7 @@ def train_submodel(self, df_train, df_valid, weights, features): model = lgb.train( self.params, dtrain, + num_boost_round=self.epochs, valid_sets=[dtrain, dvalid], valid_names=["train", "valid"], verbose_eval=20, From fd5c68a7d1b9e9074464431ab384dfbdda7aa206 Mon Sep 17 00:00:00 2001 From: Meng Dong Date: Tue, 2 Feb 2021 12:39:07 +0800 Subject: [PATCH 03/11] Update workflow_config_doubleensemble_Alpha158.yaml --- .../workflow_config_doubleensemble_Alpha158.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml index c468eb320a..74923c0e5e 100644 --- a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml +++ b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml @@ -33,7 +33,7 @@ task: base: "gbm" loss: mse k: 6 - enable_sr: False + enable_sr: True enable_fs: True alpha1: 1 alpha2: 1 @@ -87,4 +87,4 @@ task: - class: PortAnaRecord module_path: qlib.workflow.record_temp kwargs: - config: *port_analysis_config \ No newline at end of file + config: *port_analysis_config From d27dc8bab8955bc8900c1a0a0d9d564cd4c9400b Mon Sep 17 00:00:00 2001 From: meng-ustc Date: Thu, 18 Feb 2021 19:02:33 +0900 Subject: [PATCH 04/11] Add A New Baseline: DoubleEnsemble --- examples/run_all_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/run_all_model.py b/examples/run_all_model.py index 1809bac2e5..d587eff155 100644 --- a/examples/run_all_model.py +++ b/examples/run_all_model.py @@ -265,7 +265,7 @@ def run(times=1, models=None, dataset="Alpha360", exclude=False): ) # TODO: FIX ME! else: execute( - f"cd {env_path} && {python_path} -m pip install --upgrade --force-reinstall -e git+https://github.com/meng-ustc/qlib#egg=pyqlib" + f"cd {env_path} && {python_path} -m pip install --upgrade --force-reinstall -e git+https://github.com/microsoft/qlib#egg=pyqlib" ) # TODO: FIX ME! sys.stderr.write("\n") # run workflow_by_config for multiple times From 42590972e49457d659749a65e8c8ad02172c8fe5 Mon Sep 17 00:00:00 2001 From: meng-ustc Date: Thu, 18 Feb 2021 19:15:02 +0900 Subject: [PATCH 05/11] Modify run_all_model.py --- examples/run_all_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/run_all_model.py b/examples/run_all_model.py index d587eff155..f9d4a62a04 100644 --- a/examples/run_all_model.py +++ b/examples/run_all_model.py @@ -265,7 +265,7 @@ def run(times=1, models=None, dataset="Alpha360", exclude=False): ) # TODO: FIX ME! 
else: execute( - f"cd {env_path} && {python_path} -m pip install --upgrade --force-reinstall -e git+https://github.com/microsoft/qlib#egg=pyqlib" + f"cd {env_path} && {python_path} -m pip install --upgrade --force-reinstall -e git+https://github.com/microsoft----/qlib#egg=pyqlib" ) # TODO: FIX ME! sys.stderr.write("\n") # run workflow_by_config for multiple times From cd5b721bc6e91de0d0c83978ec14475a52f94a2e Mon Sep 17 00:00:00 2001 From: meng-ustc Date: Fri, 19 Feb 2021 11:56:50 +0900 Subject: [PATCH 06/11] Update --- examples/run_all_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/run_all_model.py b/examples/run_all_model.py index f9d4a62a04..d587eff155 100644 --- a/examples/run_all_model.py +++ b/examples/run_all_model.py @@ -265,7 +265,7 @@ def run(times=1, models=None, dataset="Alpha360", exclude=False): ) # TODO: FIX ME! else: execute( - f"cd {env_path} && {python_path} -m pip install --upgrade --force-reinstall -e git+https://github.com/microsoft----/qlib#egg=pyqlib" + f"cd {env_path} && {python_path} -m pip install --upgrade --force-reinstall -e git+https://github.com/microsoft/qlib#egg=pyqlib" ) # TODO: FIX ME! sys.stderr.write("\n") # run workflow_by_config for multiple times From 1a990fdd25e5fd25e6b868d56122f92b7caa30f7 Mon Sep 17 00:00:00 2001 From: meng-ustc Date: Tue, 23 Feb 2021 19:08:11 +0900 Subject: [PATCH 07/11] Add Risk Prediction Demo --- examples/workflow_by_code_lgb_risk_demo.py | 179 +++++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 examples/workflow_by_code_lgb_risk_demo.py diff --git a/examples/workflow_by_code_lgb_risk_demo.py b/examples/workflow_by_code_lgb_risk_demo.py new file mode 100644 index 0000000000..b250993d33 --- /dev/null +++ b/examples/workflow_by_code_lgb_risk_demo.py @@ -0,0 +1,179 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+
+import sys
+from pathlib import Path
+
+import qlib
+from qlib.config import REG_CN
+from qlib.utils import exists_qlib_data, init_instance_by_config, flatten_dict
+from qlib.workflow import R
+from qlib.workflow.record_temp import SignalRecord, PortAnaRecord
+from qlib.data.dataset.handler import DataHandlerLP
+
+import seaborn as sns
+import matplotlib.pyplot as plt
+import math
+import pandas as pd
+from scipy.stats.stats import pearsonr
+import numpy as np
+
+if __name__ == "__main__":
+
+    # use default data
+    provider_uri = "~/.qlib/qlib_data/cn_data"  # target_dir
+    if not exists_qlib_data(provider_uri):
+        print(f"Qlib data is not found in {provider_uri}")
+        sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts")))
+        from get_data import GetData
+
+        GetData().qlib_data(target_dir=provider_uri, region=REG_CN)
+
+    qlib.init(provider_uri=provider_uri, region=REG_CN)
+
+    market = "csi300"
+    benchmark = "SH000300"
+
+    ###################################
+    # train model
+    ###################################
+    data_handler_config = {
+        "start_time": "2008-01-01",
+        "end_time": "2020-08-01",
+        "fit_start_time": "2008-01-01",
+        "fit_end_time": "2014-12-31",
+        "instruments": market,
+        "infer_processors": [
+            {"class": "ProcessInf", "kwargs": {}},
+            {"class": "ZScoreNorm", "kwargs": {"fields_group": "feature"}},
+            {"class": "Fillna", "kwargs": {}},
+        ],
+        "learn_processors": [{
+            "class": "DropnaLabel", },
+        ],
+        "label": (["Ref(Min($low, 5), -4)/$close - 1"], ["LABEL0"])  # the period for risk prediction is 5 days
+    }
+
+    task = {
+        "model": {
+            "class": "LGBModel",
+            "module_path": "qlib.contrib.model.gbdt",
+            "kwargs": {
+                "loss": "mse",
+                "colsample_bytree": 0.8999,
+                "learning_rate": 0.02,
+                "subsample": 0.7,
+                "lambda_l1": 11.9668,
+                "lambda_l2": 339.1301,
+                "max_depth": 16,
+                "num_leaves": 31,
+                "num_threads": 20,
+            },
+        },
+        "dataset": {
+            "class": "DatasetH",
+            "module_path": "qlib.data.dataset",
+            "kwargs": {
+                "handler": {
+                    "class": "Alpha360",
+                    "module_path": "qlib.contrib.data.handler",
+                    "kwargs": data_handler_config,
+                },
+                "segments": {
+                    "train": ("2008-01-01", "2014-12-31"),
+                    "valid": ("2015-01-01", "2016-12-31"),
+                    "test": ("2017-01-01", "2020-08-01"),
+                },
+            },
+        },
+    }
+
+    port_analysis_config = {
+        "strategy": {
+            "class": "TopkDropoutStrategy",
+            "module_path": "qlib.contrib.strategy.strategy",
+            "kwargs": {
+                "topk": 50,
+                "n_drop": 5,
+            },
+        },
+        "backtest": {
+            "verbose": False,
+            "limit_threshold": 0.095,
+            "account": 100000000,
+            "benchmark": benchmark,
+            "deal_price": "close",
+            "open_cost": 0.0005,
+            "close_cost": 0.0015,
+            "min_cost": 5,
+            "return_order": True,
+        },
+    }
+
+    # model initialization
+    model = init_instance_by_config(task["model"])
+    dataset = init_instance_by_config(task["dataset"])
+
+    # NOTE: This line is optional
+    # It demonstrates that the dataset can be used standalone.
+    example_df = dataset.prepare("train")
+    print(example_df.head())
+
+    def heatmap(actual_risk, predicted_risk, step=0.02):
+        """
+        plot the precision heatmap as a visualized evaluation for risk prediction
+        :param actual_risk: the LABEL0 of test samples
+        :param predicted_risk: the predicted results of test samples
+        :param step: the interval size of risk values on the axes
+        :return:
+        """
+        num_step = math.ceil(-predicted_risk.min() / step)
+        matrix = np.zeros((num_step, num_step), dtype=np.float)
+        for pred_thresh in range(num_step):
+            for act_thresh in range(num_step):
+                actual_positive = actual_risk < -act_thresh*step
+                predicted_alarm = predicted_risk < -pred_thresh*step
+                num_alarm = predicted_alarm.sum()
+                num_tp = (actual_positive & predicted_alarm).sum()
+                matrix[pred_thresh, act_thresh] = num_tp / num_alarm
+        axis_labels = ['{:.3f}'.format(-x * step) for x in range(num_step)]
+        return matrix, axis_labels
+
+    # start exp
+    with R.start(experiment_name="workflow"):
+        R.log_params(**flatten_dict(task))
+        model.fit(dataset)
+
+        # prediction
+        actual_risk = dataset.prepare("test", col_set="label", data_key=DataHandlerLP.DK_I)['LABEL0']
+        pred = model.predict(dataset)
+
+        result_df = pd.concat((actual_risk, pred), axis=1)
+        result_df.columns = ['Actual Risk', 'Predicted Risk']
+        result_df.dropna(inplace=True)
+        actual_risk, predicted_risk = result_df.iloc[:, 0], result_df.iloc[:, 1]
+        corr = pearsonr(actual_risk, predicted_risk)[0]
+        print('The correlation between predicted risk and actual risk is: {:.6f}'.format(corr))
+
+        # visualized results
+        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
+        sns.histplot(actual_risk, ax=axes[0, 0])
+        axes[0, 0].set_title('Market: {} Actual Risk'.format(market))
+        axes[0, 0].grid()
+
+        sns.histplot(predicted_risk, ax=axes[0, 1])
+        axes[0, 1].set_title('Feature: {} Predicted Risk'.format(task['dataset']['kwargs']['handler']['class']))
+        axes[0, 1].grid()
+
+        sns.scatterplot(data=result_df, ax=axes[1, 0], x='Actual Risk', y='Predicted Risk', s=20)
+        axes[1, 0].set_title('Market: {} Feature: {} Corr: {:.5f}'.format(
+            market, task['dataset']['kwargs']['handler']['class'], corr))
+        axes[1, 0].grid()
+
+        matrix, ax_labels = heatmap(actual_risk, predicted_risk)
+        sns.heatmap(matrix, annot=True, fmt=".3f", xticklabels=ax_labels, yticklabels=ax_labels, ax=axes[1, 1],
+                    )
+        axes[1, 1].set_xlabel('Predicted Alarm Threshold')
+        axes[1, 1].set_ylabel('Actual Positive Threshold')
+        axes[1, 1].set_title('Risk Prediction Precision Heatmap')
+        plt.show()

From ce60097722d46bb431224d1bf9e67e59b2e03224 Mon Sep 17 00:00:00 2001
From: meng-ustc
Date: Wed, 24 Feb 2021 16:59:31 +0900
Subject: [PATCH 08/11] Add README and Formatted

---
 README.md                                    |  1 +
 examples/benchmarks/DoubleEnsemble/README.md |  4 +
 qlib/contrib/model/double_ensemble.py        | 77 ++++++++++----------
 3 files changed, 45 insertions(+), 37 deletions(-)
 create mode 100644 examples/benchmarks/DoubleEnsemble/README.md

diff --git a/README.md b/README.md
index 787075d6ad..e1608c37da 100644
--- a/README.md
+++ b/README.md
@@ -232,6 +232,7 @@ Here is a list of models built on `Qlib`.
 - [SFM based on pytorch (Liheng Zhang, et al. 2017)](qlib/contrib/model/pytorch_sfm.py)
 - [TFT based on tensorflow (Bryan Lim, et al. 2019)](examples/benchmarks/TFT/tft.py)
 - [TabNet based on pytorch (Sercan O. Arik, et al. 2019)](qlib/contrib/model/pytorch_tabnet.py)
+- [DoubleEnsemble based on LightGBM (Chuheng Zhang, et al. 2020)](qlib/contrib/model/double_ensemble.py)

 Your PR of new Quant models is highly welcomed.
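[Editor's sketch, not part of the patch series] A registry entry like the one above is resolved the same way the benchmark YAMLs are: qlib's `init_instance_by_config` imports `module_path` and constructs `class` with `kwargs` (the deleted risk demo in PATCH 07 used the same helper). The following minimal Python sketch shows that path for the new baseline; the kwargs are a subset of the Alpha158 config from PATCH 01/02 (note `base`/`k` are the pre-PATCH-11 parameter names), and it assumes a qlib installation with this series applied.

    # Hedged sketch: build the DoubleEnsemble baseline from a config dict,
    # exactly as the workflow YAMLs above are resolved by qlib.
    from qlib.utils import init_instance_by_config

    model_config = {
        "class": "DEnsembleModel",
        "module_path": "qlib.contrib.model.double_ensemble",
        "kwargs": {
            "base": "gbm",          # pre-PATCH-11 name; later renamed to `base_model`
            "loss": "mse",
            "k": 6,                 # pre-PATCH-11 name; later renamed to `num_models`
            "enable_sr": True,      # sample reweighting (SR) module
            "enable_fs": True,      # feature selection (FS) module
            "decay": 0.5,
            "sample_ratios": [0.8, 0.7, 0.6, 0.5, 0.4],   # length must equal bins_fs (default 5)
            "sub_weights": [1, 0.2, 0.2, 0.2, 0.2, 0.2],  # length must equal k
            "epochs": 28,           # boosting rounds per sub-model (added in PATCH 02)
        },
    }
    model = init_instance_by_config(model_config)  # returns a DEnsembleModel instance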
diff --git a/examples/benchmarks/DoubleEnsemble/README.md b/examples/benchmarks/DoubleEnsemble/README.md
new file mode 100644
index 0000000000..67e741050f
--- /dev/null
+++ b/examples/benchmarks/DoubleEnsemble/README.md
@@ -0,0 +1,4 @@
+# DoubleEnsemble
+* DoubleEnsemble is an ensemble framework that leverages learning-trajectory-based sample reweighting and shuffling-based feature selection to tackle both the low signal-to-noise ratio and the ever-growing feature set in financial data. It identifies the key samples from the training dynamics on each sample, and elicits the key features from the ablation impact of each feature via shuffling. The framework is applicable to a wide range of base models, is able to extract complex patterns, and mitigates the overfitting and instability issues of financial market prediction.
+* The code used in Qlib is our own implementation.
+* Paper: DoubleEnsemble: A New Ensemble Method Based on Sample Reweighting and Feature Selection for Financial Data Analysis [https://arxiv.org/pdf/2010.01265.pdf](https://arxiv.org/pdf/2010.01265.pdf).
\ No newline at end of file
diff --git a/qlib/contrib/model/double_ensemble.py b/qlib/contrib/model/double_ensemble.py
index 786b3987cf..c96b69e8b4 100644
--- a/qlib/contrib/model/double_ensemble.py
+++ b/qlib/contrib/model/double_ensemble.py
@@ -15,21 +15,22 @@ class DEnsembleModel(Model):
     """Double Ensemble Model"""

     def __init__(
-            self,
-            base="gbm",
-            loss="mse",
-            k=6,
-            enable_sr=True,
-            enable_fs=True,
-            alpha1=1.,
-            alpha2=1.,
-            bins_sr=10,
-            bins_fs=5,
-            decay=None,
-            sample_ratios=None,
-            sub_weights=None,
-            epochs=100,
-            **kwargs):
+        self,
+        base="gbm",
+        loss="mse",
+        k=6,
+        enable_sr=True,
+        enable_fs=True,
+        alpha1=1.0,
+        alpha2=1.0,
+        bins_sr=10,
+        bins_fs=5,
+        decay=None,
+        sample_ratios=None,
+        sub_weights=None,
+        epochs=100,
+        **kwargs
+    ):
         self.base = base  # "gbm" or "mlp", specifically, we use lgbm for "gbm"
         self.k = k
         self.enable_sr = enable_sr
@@ -54,10 +55,7 @@ def __init__(
         self.params.update(kwargs)
         self.loss = loss

-    def fit(
-        self,
-        dataset: DatasetH
-    ):
+    def fit(self, dataset: DatasetH):
         df_train, df_valid = dataset.prepare(
             ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
         )
@@ -71,7 +69,7 @@ def fit(self, dataset: DatasetH):
         # train k sub-models
         for i_k in range(self.k):
             self.sub_features.append(features)
-            self.logger.info("Training sub-model: ({}/{})".format(i_k+1, self.k))
+            self.logger.info("Training sub-model: ({}/{})".format(i_k + 1, self.k))
             model_k = self.train_submodel(df_train, df_valid, weights, features)
             self.ensemble.append(model_k)
             # no further sample re-weight and feature selection needed for the last sub-model
@@ -82,12 +80,12 @@ def fit(self, dataset: DatasetH):
             loss_curve = self.retrieve_loss_curve(model_k, df_train, features)
             pred_k = self.predict_sub(model_k, df_train, features)
             pred_sub.iloc[:, i_k] = pred_k
-            pred_ensemble = pred_sub.iloc[:, :i_k+1].mean(axis=1)
+            pred_ensemble = pred_sub.iloc[:, : i_k + 1].mean(axis=1)
             loss_values = pd.Series(self.get_loss(y_train.values.squeeze(), pred_ensemble.values))

             if self.enable_sr:
                 self.logger.info("Sample re-weighting...")
-                weights = self.sample_reweight(loss_curve, loss_values, i_k+1)
+                weights = self.sample_reweight(loss_curve, loss_values, i_k + 1)

             if self.enable_fs:
                 self.logger.info("Feature selection...")
@@ -148,14 +146,14 @@ def sample_reweight(self, loss_curve, loss_values, k_th):
         # calculate h-value for each sample
         h1 = loss_values_norm
         h2 = (l_end / l_start).rank(pct=True)
-        h = pd.DataFrame({'h_value': self.alpha1 * h1 + self.alpha2 * 
h2}) + h = pd.DataFrame({"h_value": self.alpha1 * h1 + self.alpha2 * h2}) # calculate weights - h['bins'] = pd.cut(h['h_value'], self.bins_sr) - h_avg = h.groupby('bins')['h_value'].mean() + h["bins"] = pd.cut(h["h_value"], self.bins_sr) + h_avg = h.groupby("bins")["h_value"].mean() weights = pd.Series(np.zeros(N, dtype=float)) for i_b, b in enumerate(h_avg.index): - weights[h['bins'] == b] = 1. / (self.decay ** k_th * h_avg[i_b] + 0.1) + weights[h["bins"] == b] = 1.0 / (self.decay ** k_th * h_avg[i_b] + 0.1) return weights def feature_selection(self, df_train, loss_values): @@ -170,7 +168,7 @@ def feature_selection(self, df_train, loss_values): x_train, y_train = df_train["feature"], df_train["label"] features = x_train.columns N, F = x_train.shape - g = pd.DataFrame({'g_value': np.zeros(F, dtype=float)}) + g = pd.DataFrame({"g_value": np.zeros(F, dtype=float)}) M = len(self.ensemble) # shuffle specific columns and calculate g-value for each feature @@ -179,23 +177,27 @@ def feature_selection(self, df_train, loss_values): x_train_tmp.loc[:, feat] = np.random.permutation(x_train_tmp.loc[:, feat].values) pred = pd.Series(np.zeros(N), index=x_train_tmp.index) for i_s, submodel in enumerate(self.ensemble): - pred += pd.Series(submodel.predict(x_train_tmp.loc[:, self.sub_features[i_s]].values), - index=x_train_tmp.index) / M + pred += ( + pd.Series( + submodel.predict(x_train_tmp.loc[:, self.sub_features[i_s]].values), index=x_train_tmp.index + ) + / M + ) loss_feat = self.get_loss(y_train.values.squeeze(), pred.values) - g.loc[i_f, 'g_value'] = np.mean(loss_feat - loss_values) / np.std(loss_feat - loss_values) + g.loc[i_f, "g_value"] = np.mean(loss_feat - loss_values) / np.std(loss_feat - loss_values) x_train_tmp.loc[:, feat] = x_train.loc[:, feat].copy() # one column in train features is all-nan # if g['g_value'].isna().any() - g['g_value'].replace(np.nan, 0, inplace=True) + g["g_value"].replace(np.nan, 0, inplace=True) # divide features into bins_fs bins - g['bins'] = pd.cut(g['g_value'], self.bins_fs) + g["bins"] = pd.cut(g["g_value"], self.bins_fs) # randomly sample features from bins to construct the new features res_feat = [] - sorted_bins = sorted(g['bins'].unique(), reverse=True) + sorted_bins = sorted(g["bins"].unique(), reverse=True) for i_b, b in enumerate(sorted_bins): - b_feat = features[g['bins'] == b] + b_feat = features[g["bins"] == b] num_feat = int(np.ceil(self.sample_ratios[i_b] * len(b_feat))) res_feat = res_feat + np.random.choice(b_feat, size=num_feat).tolist() return pd.Index(res_feat) @@ -233,12 +235,13 @@ def predict(self, dataset): pred = pd.Series(np.zeros(x_test.shape[0]), index=x_test.index) for i_sub, submodel in enumerate(self.ensemble): feat_sub = self.sub_features[i_sub] - pred += pd.Series(submodel.predict(x_test.loc[:, feat_sub].values), index=x_test.index) * self.sub_weights[i_sub] + pred += ( + pd.Series(submodel.predict(x_test.loc[:, feat_sub].values), index=x_test.index) + * self.sub_weights[i_sub] + ) return pred def predict_sub(self, submodel, df_data, features): x_data, y_data = df_data["feature"].loc[:, features], df_data["label"] pred_sub = pd.Series(submodel.predict(x_data.values), index=x_data.index) return pred_sub - - From 70575e8a1ce4f63cced98a55de1914009012bdcd Mon Sep 17 00:00:00 2001 From: Meng Dong Date: Wed, 24 Feb 2021 16:10:38 +0800 Subject: [PATCH 09/11] Delete workflow_by_code_lgb_risk_demo.py --- examples/workflow_by_code_lgb_risk_demo.py | 179 --------------------- 1 file changed, 179 deletions(-) delete mode 100644 
examples/workflow_by_code_lgb_risk_demo.py
diff --git a/examples/workflow_by_code_lgb_risk_demo.py b/examples/workflow_by_code_lgb_risk_demo.py
deleted file mode 100644
index b250993d33..0000000000
--- a/examples/workflow_by_code_lgb_risk_demo.py
+++ /dev/null
@@ -1,179 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT License.
-
-import sys
-from pathlib import Path
-
-import qlib
-from qlib.config import REG_CN
-from qlib.utils import exists_qlib_data, init_instance_by_config, flatten_dict
-from qlib.workflow import R
-from qlib.workflow.record_temp import SignalRecord, PortAnaRecord
-from qlib.data.dataset.handler import DataHandlerLP
-
-import seaborn as sns
-import matplotlib.pyplot as plt
-import math
-import pandas as pd
-from scipy.stats.stats import pearsonr
-import numpy as np
-
-if __name__ == "__main__":
-
-    # use default data
-    provider_uri = "~/.qlib/qlib_data/cn_data"  # target_dir
-    if not exists_qlib_data(provider_uri):
-        print(f"Qlib data is not found in {provider_uri}")
-        sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts")))
-        from get_data import GetData
-
-        GetData().qlib_data(target_dir=provider_uri, region=REG_CN)
-
-    qlib.init(provider_uri=provider_uri, region=REG_CN)
-
-    market = "csi300"
-    benchmark = "SH000300"
-
-    ###################################
-    # train model
-    ###################################
-    data_handler_config = {
-        "start_time": "2008-01-01",
-        "end_time": "2020-08-01",
-        "fit_start_time": "2008-01-01",
-        "fit_end_time": "2014-12-31",
-        "instruments": market,
-        "infer_processors": [
-            {"class": "ProcessInf", "kwargs": {}},
-            {"class": "ZScoreNorm", "kwargs": {"fields_group": "feature"}},
-            {"class": "Fillna", "kwargs": {}},
-        ],
-        "learn_processors": [{
-            "class": "DropnaLabel", },
-        ],
-        "label": (["Ref(Min($low, 5), -4)/$close - 1"], ["LABEL0"])  # the period for risk prediction is 5 days
-    }
-
-    task = {
-        "model": {
-            "class": "LGBModel",
-            "module_path": "qlib.contrib.model.gbdt",
-            "kwargs": {
-                "loss": "mse",
-                "colsample_bytree": 0.8999,
-                "learning_rate": 0.02,
-                "subsample": 0.7,
-                "lambda_l1": 11.9668,
-                "lambda_l2": 339.1301,
-                "max_depth": 16,
-                "num_leaves": 31,
-                "num_threads": 20,
-            },
-        },
-        "dataset": {
-            "class": "DatasetH",
-            "module_path": "qlib.data.dataset",
-            "kwargs": {
-                "handler": {
-                    "class": "Alpha360",
-                    "module_path": "qlib.contrib.data.handler",
-                    "kwargs": data_handler_config,
-                },
-                "segments": {
-                    "train": ("2008-01-01", "2014-12-31"),
-                    "valid": ("2015-01-01", "2016-12-31"),
-                    "test": ("2017-01-01", "2020-08-01"),
-                },
-            },
-        },
-    }
-
-    port_analysis_config = {
-        "strategy": {
-            "class": "TopkDropoutStrategy",
-            "module_path": "qlib.contrib.strategy.strategy",
-            "kwargs": {
-                "topk": 50,
-                "n_drop": 5,
-            },
-        },
-        "backtest": {
-            "verbose": False,
-            "limit_threshold": 0.095,
-            "account": 100000000,
-            "benchmark": benchmark,
-            "deal_price": "close",
-            "open_cost": 0.0005,
-            "close_cost": 0.0015,
-            "min_cost": 5,
-            "return_order": True,
-        },
-    }
-
-    # model initialization
-    model = init_instance_by_config(task["model"])
-    dataset = init_instance_by_config(task["dataset"])
-
-    # NOTE: This line is optional
-    # It demonstrates that the dataset can be used standalone.
-    example_df = dataset.prepare("train")
-    print(example_df.head())
-
-    def heatmap(actual_risk, predicted_risk, step=0.02):
-        """
-        plot the precision heatmap as a visualized evaluation for risk prediction
-        :param actual_risk: the LABEL0 of test samples
-        :param predicted_risk: the predicted results of test samples
-        :param step: the interval size of risk values on the axes
-        :return:
-        """
-        num_step = math.ceil(-predicted_risk.min() / step)
-        matrix = np.zeros((num_step, num_step), dtype=np.float)
-        for pred_thresh in range(num_step):
-            for act_thresh in range(num_step):
-                actual_positive = actual_risk < -act_thresh*step
-                predicted_alarm = predicted_risk < -pred_thresh*step
-                num_alarm = predicted_alarm.sum()
-                num_tp = (actual_positive & predicted_alarm).sum()
-                matrix[pred_thresh, act_thresh] = num_tp / num_alarm
-        axis_labels = ['{:.3f}'.format(-x * step) for x in range(num_step)]
-        return matrix, axis_labels
-
-    # start exp
-    with R.start(experiment_name="workflow"):
-        R.log_params(**flatten_dict(task))
-        model.fit(dataset)
-
-        # prediction
-        actual_risk = dataset.prepare("test", col_set="label", data_key=DataHandlerLP.DK_I)['LABEL0']
-        pred = model.predict(dataset)
-
-        result_df = pd.concat((actual_risk, pred), axis=1)
-        result_df.columns = ['Actual Risk', 'Predicted Risk']
-        result_df.dropna(inplace=True)
-        actual_risk, predicted_risk = result_df.iloc[:, 0], result_df.iloc[:, 1]
-        corr = pearsonr(actual_risk, predicted_risk)[0]
-        print('The correlation between predicted risk and actual risk is: {:.6f}'.format(corr))
-
-        # visualized results
-        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
-        sns.histplot(actual_risk, ax=axes[0, 0])
-        axes[0, 0].set_title('Market: {} Actual Risk'.format(market))
-        axes[0, 0].grid()
-
-        sns.histplot(predicted_risk, ax=axes[0, 1])
-        axes[0, 1].set_title('Feature: {} Predicted Risk'.format(task['dataset']['kwargs']['handler']['class']))
-        axes[0, 1].grid()
-
-        sns.scatterplot(data=result_df, ax=axes[1, 0], x='Actual Risk', y='Predicted Risk', s=20)
-        axes[1, 0].set_title('Market: {} Feature: {} Corr: {:.5f}'.format(
-            market, task['dataset']['kwargs']['handler']['class'], corr))
-        axes[1, 0].grid()
-
-        matrix, ax_labels = heatmap(actual_risk, predicted_risk)
-        sns.heatmap(matrix, annot=True, fmt=".3f", xticklabels=ax_labels, yticklabels=ax_labels, ax=axes[1, 1],
-                    )
-        axes[1, 1].set_xlabel('Predicted Alarm Threshold')
-        axes[1, 1].set_ylabel('Actual Positive Threshold')
-        axes[1, 1].set_title('Risk Prediction Precision Heatmap')
-        plt.show()

From 6e2ce6f1dce676f1c82bed2446e84ba43e4bc29b Mon Sep 17 00:00:00 2001
From: meng-ustc
Date: Tue, 2 Mar 2021 12:17:05 +0900
Subject: [PATCH 10/11] Add the results of DoubleEnsemble

---
 examples/benchmarks/README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/benchmarks/README.md b/examples/benchmarks/README.md
index bcd2279b33..c5bfced6dc 100644
--- a/examples/benchmarks/README.md
+++ b/examples/benchmarks/README.md
@@ -16,7 +16,7 @@ The numbers shown below demonstrate the performance of the entire `workflow` of
 | LSTM (Sepp Hochreiter, et al.) | Alpha360 | 0.0443±0.01 | 0.3401±0.05| 0.0536±0.01 | 0.4248±0.05 | 0.0627±0.03 | 0.8441±0.48| -0.0882±0.03 |
 | ALSTM (Yao Qin, et al.) | Alpha360 | 0.0493±0.01 | 0.3778±0.06| 0.0585±0.00 | 0.4606±0.04 | 0.0513±0.03 | 0.6727±0.38| -0.1085±0.02 |
 | GATs (Petar Velickovic, et al.) | Alpha360 | 0.0475±0.00 | 0.3515±0.02| 0.0592±0.00 | 0.4585±0.01 | 0.0876±0.02 | 1.1513±0.27| -0.0795±0.02 |
-
+| DoubleEnsemble (Chuheng Zhang, et al.)
| Alpha360 | 0.0407±0.00| 0.3053±0.00 | 0.0490±0.00 | 0.3840±0.00 | 0.0380±0.02 | 0.5000±0.21 | -0.0984±0.02 | ## Alpha158 dataset | Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown | |---|---|---|---|---|---|---|---|---| @@ -31,5 +31,7 @@ The numbers shown below demonstrate the performance of the entire `workflow` of | LSTM (Sepp Hochreiter, et al.) | Alpha158 (with selected 20 features) | 0.0312±0.00 | 0.2394±0.04| 0.0418±0.00 | 0.3324±0.03 | 0.0298±0.02 | 0.4198±0.33| -0.1348±0.03 | | ALSTM (Yao Qin, et al.) | Alpha158 (with selected 20 features) | 0.0385±0.01 | 0.3022±0.06| 0.0478±0.00 | 0.3874±0.04 | 0.0486±0.03 | 0.7141±0.45| -0.1088±0.03 | | GATs (Petar Velickovic, et al.) | Alpha158 (with selected 20 features) | 0.0349±0.00 | 0.2511±0.01| 0.0457±0.00 | 0.3537±0.01 | 0.0578±0.02 | 0.8221±0.25| -0.0824±0.02 | +| DoubleEnsemble (Chuheng Zhang, et al.) | Alpha158 | 0.0544±0.00 | 0.4338±0.01 | 0.0523±0.00 | 0.4257±0.01 | 0.1253±0.01 | 1.4105±0.14 | -0.0902±0.01 | - The selected 20 features are based on the feature importance of a lightgbm-based model. +- The base model of DoubleEnsemble is LGBM. From 1de4def444a44247aeb80db1adadb8fcad5beb39 Mon Sep 17 00:00:00 2001 From: meng-ustc Date: Tue, 2 Mar 2021 16:14:56 +0900 Subject: [PATCH 11/11] Update parameter names: 'k' and 'base' --- ...rkflow_config_doubleensemble_Alpha158.yaml | 4 +-- ...rkflow_config_doubleensemble_Alpha360.yaml | 4 +-- qlib/contrib/model/double_ensemble.py | 30 +++++++++---------- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml index 74923c0e5e..a12df802da 100644 --- a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml +++ b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml @@ -30,9 +30,9 @@ task: class: DEnsembleModel module_path: qlib.contrib.model.double_ensemble kwargs: - base: "gbm" + base_model: "gbm" loss: mse - k: 6 + num_models: 6 enable_sr: True enable_fs: True alpha1: 1 diff --git a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360.yaml b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360.yaml index 3351cefc5a..415448f0be 100644 --- a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360.yaml +++ b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360.yaml @@ -37,9 +37,9 @@ task: class: DEnsembleModel module_path: qlib.contrib.model.double_ensemble kwargs: - base: "gbm" + base_model: "gbm" loss: mse - k: 6 + num_models: 6 enable_sr: True enable_fs: True alpha1: 1 diff --git a/qlib/contrib/model/double_ensemble.py b/qlib/contrib/model/double_ensemble.py index c96b69e8b4..a340489c20 100644 --- a/qlib/contrib/model/double_ensemble.py +++ b/qlib/contrib/model/double_ensemble.py @@ -16,9 +16,9 @@ class DEnsembleModel(Model): def __init__( self, - base="gbm", + base_model="gbm", loss="mse", - k=6, + num_models=6, enable_sr=True, enable_fs=True, alpha1=1.0, @@ -31,8 +31,8 @@ def __init__( epochs=100, **kwargs ): - self.base = base # "gbm" or "mlp", specifically, we use lgbm for "gbm" - self.k = k + self.base_model = base_model # "gbm" or "mlp", specifically, we use lgbm for "gbm" + self.num_models = num_models # the number of sub-models self.enable_sr = enable_sr self.enable_fs = enable_fs self.alpha1 = alpha1 @@ -43,8 +43,8 @@ def 
__init__(
         if not len(sample_ratios) == bins_fs:
             raise ValueError("The length of sample_ratios should be equal to bins_fs.")
         self.sample_ratios = sample_ratios
-        if not len(sub_weights) == k:
-            raise ValueError("The length of sub_weights should be equal to k.")
+        if not len(sub_weights) == num_models:
+            raise ValueError("The length of sub_weights should be equal to num_models.")
         self.sub_weights = sub_weights
         self.epochs = epochs
         self.logger = get_module_logger("DEnsembleModel")
         self.logger.info("Double Ensemble Model...")
@@ -65,27 +65,27 @@ def fit(self, dataset: DatasetH):
         weights = pd.Series(np.ones(N, dtype=float))
         # initialize the features
         features = x_train.columns
-        pred_sub = pd.DataFrame(np.zeros((N, self.k), dtype=float), index=x_train.index)
-        # train k sub-models
-        for i_k in range(self.k):
+        pred_sub = pd.DataFrame(np.zeros((N, self.num_models), dtype=float), index=x_train.index)
+        # train sub-models
+        for k in range(self.num_models):
             self.sub_features.append(features)
-            self.logger.info("Training sub-model: ({}/{})".format(i_k + 1, self.k))
+            self.logger.info("Training sub-model: ({}/{})".format(k + 1, self.num_models))
             model_k = self.train_submodel(df_train, df_valid, weights, features)
             self.ensemble.append(model_k)
             # no further sample re-weight and feature selection needed for the last sub-model
-            if i_k + 1 == self.k:
+            if k + 1 == self.num_models:
                 break

             self.logger.info("Retrieving loss curve and loss values...")
             loss_curve = self.retrieve_loss_curve(model_k, df_train, features)
             pred_k = self.predict_sub(model_k, df_train, features)
-            pred_sub.iloc[:, i_k] = pred_k
-            pred_ensemble = pred_sub.iloc[:, : i_k + 1].mean(axis=1)
+            pred_sub.iloc[:, k] = pred_k
+            pred_ensemble = pred_sub.iloc[:, : k + 1].mean(axis=1)
             loss_values = pd.Series(self.get_loss(y_train.values.squeeze(), pred_ensemble.values))

             if self.enable_sr:
                 self.logger.info("Sample re-weighting...")
-                weights = self.sample_reweight(loss_curve, loss_values, i_k + 1)
+                weights = self.sample_reweight(loss_curve, loss_values, k + 1)

             if self.enable_fs:
                 self.logger.info("Feature selection...")
@@ -209,7 +209,7 @@ def get_loss(self, label, pred):
             raise ValueError("not implemented yet")

     def retrieve_loss_curve(self, model, df_train, features):
-        if self.base == "gbm":
+        if self.base_model == "gbm":
             num_trees = model.num_trees()
             x_train, y_train = df_train["feature"].loc[:, features], df_train["label"]
             # LightGBM needs a 1D array as its label
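[Editor's sketch, not part of the patch series] After PATCH 11, the model is constructed with `base_model`/`num_models` instead of `base`/`k`. The sketch below shows the resulting end-to-end call sequence; it assumes qlib with these patches is installed and that `dataset` is a `DatasetH` built from one of the Alpha158/Alpha360 handler configs above (with train/valid/test segments).

    # Hedged usage sketch for the final (post-rename) interface.
    from qlib.contrib.model.double_ensemble import DEnsembleModel

    model = DEnsembleModel(
        base_model="gbm",       # renamed from `base` in PATCH 11; "gbm" selects LightGBM
        loss="mse",
        num_models=6,           # renamed from `k`; the number of sub-models to train
        enable_sr=True,         # learning-trajectory-based sample reweighting
        enable_fs=True,         # shuffling-based feature selection
        decay=0.5,
        sample_ratios=[0.8, 0.7, 0.6, 0.5, 0.4],   # one ratio per FS bin (bins_fs=5)
        sub_weights=[1, 0.2, 0.2, 0.2, 0.2, 0.2],  # one weight per sub-model
        epochs=28,              # boosting rounds per LightGBM sub-model
    )
    model.fit(dataset)             # trains the sub-models, reweighting samples and
                                   # re-selecting features between rounds
    pred = model.predict(dataset)  # weighted sum of sub-model predictions on the
                                   # "test" segment, returned as a pandas Series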